In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def remove_quotes(value):
    """Remove the single quotes that wrap items in a stringified list.

    Four literal substitutions are applied one after another, each on the
    result of the previous one:
    ``['`` -> ``[``, ``']`` -> ``]``, ``, '`` -> ``, `` and ``',`` -> ``,``.

    Parameters
    ----------
    value : str
        Text such as ``"['a', 'b']"``.

    Returns
    -------
    str
        The text with those quote patterns removed, e.g. ``"[a, b]"``.
    """
    substitutions = (
        ("['", "["),    # quote right after an opening bracket
        ("']", "]"),    # quote right before a closing bracket
        (", '", ", "),  # quote opening a non-first item
        ("',", ","),    # quote closing a non-last item
    )
    for quoted, bare in substitutions:
        value = value.replace(quoted, bare)
    return value

In [3]:
def remove_outer_bracket(value):
    """Return ``value`` without its first and last element.

    The slice is positional: the two outermost characters are dropped
    whatever they are, so the caller must make sure they really are the
    enclosing brackets. Inputs shorter than two characters yield an
    empty result.
    """
    inner = slice(1, -1)
    return value[inner]

In [4]:
def add_asterisks(value):
    """Wrap ``value`` in a leading and a trailing ``**`` marker.

    Plain concatenation is used, so a non-string ``value`` raises
    ``TypeError`` instead of being converted silently.
    """
    marker = '**'
    return marker + value + marker

In [5]:
# Load the complete WebQSP dataset. The following cells split it into
# 70% train, 20% test and 10% validation.
complete_dataset = pd.read_csv(
    filepath_or_buffer='./2-NEL_Data/1-csv_format_1/webqsp_dataset.csv'
)
# Row count of the unsplit dataset.
complete_dataset.shape[0]

2990

In [6]:
# Sample 80% of the rows as the train+validation pool; random_state is the
# seed value, so the same rows are drawn on every run.
train_val = complete_dataset.sample(random_state=42, frac=0.8)
# The rows that were not sampled (the remaining 20%) form the test split.
web_test = complete_dataset.loc[~complete_dataset.index.isin(train_val.index)]

In [7]:
# 12.5% of the 80% pool equals 10% of the full dataset -> validation split.
web_val = train_val.sample(random_state=42, frac=0.125)
# Whatever is left of the pool (70% of the full dataset) is the training split.
web_train = train_val.loc[~train_val.index.isin(web_val.index)]

In [8]:
# Give each split a fresh 0..n-1 index; the labels inherited from
# complete_dataset are discarded.
web_train, web_test, web_val = (
    split.reset_index(drop=True)
    for split in (web_train, web_test, web_val)
)

In [9]:
# Sanity check on the WebQSP split: the test split is the complement of the
# sampled 80% pool and the training split is the complement of the validation
# sample inside that pool, so the three parts must add up to the full dataset
# (roughly 70% train / 20% test / 10% validation).
assert len(web_train) + len(web_test) + len(web_val) == len(complete_dataset), (
    "train/test/validation splits do not cover the WebQSP dataset exactly once"
)

In [10]:
# Read the SimpleQuestions training CSV.
simple_train = pd.read_csv(
    filepath_or_buffer='./2-NEL_Data/1-csv_format_1/simple_questions_training_dataset.csv'
)

In [11]:
# Stack the SimpleQuestions rows on top of the WebQSP training split.
# Each frame keeps its own index labels here; the index is rebuilt in a later cell.
training_data = pd.concat(objs=[simple_train, web_train], axis=0)

In [12]:
# Strip the quotes around list items in both list-like text columns.
training_data = training_data.assign(
    entity=training_data['entity'].apply(remove_quotes),
    wikidata_reply=training_data['wikidata_reply'].apply(remove_quotes),
)

In [13]:
# Drop the first and last character (the enclosing brackets) of both columns.
training_data = training_data.assign(
    entity=training_data['entity'].apply(remove_outer_bracket),
    wikidata_reply=training_data['wikidata_reply'].apply(remove_outer_bracket),
)

In [14]:
# Replace the index labels left over from the concat with a fresh 0..n-1 index.
training_data = training_data.reset_index(drop=True)
training_data

Unnamed: 0,qid,question,entity,wikidata_reply
0,Q126399,what movie is produced by warner bros.,warner bros.,"[[Q126399, Warner Bros., American producer of ..."
1,Q12439,who is a musician born in detroit,detroit,"[[Q12439, Detroit, city in and county seat of ..."
2,Q7370831,who produced the film rough house rosie,rough house rosie,"[[Q4421558, Rough House Rosie, 1927 film by Fr..."
3,Q6817891,what is the language in which mera shikar was ...,mera shikar,"[[Q6817891, Mera Shikar, 1988 film by Keshu Ra..."
4,Q1297,Whats the name of a battle that happened in ch...,chicago,"[[Q1297, Chicago, city and county seat of Cook..."
...,...,...,...,...
36462,Q169982,what films has gerard butler starred in?,gerard butler,"[[Q169982, Gerard Butler, Scottish actor], [Q1..."
36463,Q717,what are the major sports played in venezuela?,venezuela,"[[Q717, Venezuela, sovereign state in northern..."
36464,Q193695,who played dorothy in the film wizard of oz?,wizard of oz,"[[Q193695, The Wizard of Oz, 1939 movie based ..."
36465,Q1024426,where is usc from?,usc,"[[Q4614, University of Southern California, pr..."


In [15]:
# Number of rows whose wikidata_reply is at most two characters long
# (e.g. an empty string or a bare "[]").
int((training_data['wikidata_reply'].str.len() <= 2).sum())

2226

In [16]:
# Keep only the rows whose wikidata_reply is longer than two characters
# (the mask is negated rather than flipped to ">" so missing values are
# treated exactly as before), then renumber the index.
training_data = (
    training_data
    .loc[~(training_data['wikidata_reply'].str.len() <= 2)]
    .reset_index(drop=True)
)
training_data

Unnamed: 0,qid,question,entity,wikidata_reply
0,Q126399,what movie is produced by warner bros.,warner bros.,"[[Q126399, Warner Bros., American producer of ..."
1,Q12439,who is a musician born in detroit,detroit,"[[Q12439, Detroit, city in and county seat of ..."
2,Q7370831,who produced the film rough house rosie,rough house rosie,"[[Q4421558, Rough House Rosie, 1927 film by Fr..."
3,Q6817891,what is the language in which mera shikar was ...,mera shikar,"[[Q6817891, Mera Shikar, 1988 film by Keshu Ra..."
4,Q1297,Whats the name of a battle that happened in ch...,chicago,"[[Q1297, Chicago, city and county seat of Cook..."
...,...,...,...,...
34236,Q169982,what films has gerard butler starred in?,gerard butler,"[[Q169982, Gerard Butler, Scottish actor], [Q1..."
34237,Q717,what are the major sports played in venezuela?,venezuela,"[[Q717, Venezuela, sovereign state in northern..."
34238,Q193695,who played dorothy in the film wizard of oz?,wizard of oz,"[[Q193695, The Wizard of Oz, 1939 movie based ..."
34239,Q1024426,where is usc from?,usc,"[[Q4614, University of Southern California, pr..."


In [17]:
# Wrap the three text columns in "**" markers.
training_data = training_data.assign(
    question=training_data['question'].apply(add_asterisks),
    entity=training_data['entity'].apply(add_asterisks),
    wikidata_reply=training_data['wikidata_reply'].apply(add_asterisks),
)
training_data

Unnamed: 0,qid,question,entity,wikidata_reply
0,Q126399,**what movie is produced by warner bros.**,**warner bros.**,"**[[Q126399, Warner Bros., American producer o..."
1,Q12439,**who is a musician born in detroit**,**detroit**,"**[[Q12439, Detroit, city in and county seat o..."
2,Q7370831,**who produced the film rough house rosie**,**rough house rosie**,"**[[Q4421558, Rough House Rosie, 1927 film by ..."
3,Q6817891,**what is the language in which mera shikar wa...,**mera shikar**,"**[[Q6817891, Mera Shikar, 1988 film by Keshu ..."
4,Q1297,**Whats the name of a battle that happened in ...,**chicago**,"**[[Q1297, Chicago, city and county seat of Co..."
...,...,...,...,...
34236,Q169982,**what films has gerard butler starred in?**,**gerard butler**,"**[[Q169982, Gerard Butler, Scottish actor], [..."
34237,Q717,**what are the major sports played in venezuel...,**venezuela**,"**[[Q717, Venezuela, sovereign state in northe..."
34238,Q193695,**who played dorothy in the film wizard of oz?**,**wizard of oz**,"**[[Q193695, The Wizard of Oz, 1939 movie base..."
34239,Q1024426,**where is usc from?**,**usc**,"**[[Q4614, University of Southern California, ..."


In [18]:
# Write the prepared training set to disk without the index column.
training_data.to_csv(
    path_or_buf='./2-NEL_Data/2-csv_format_2/training_data.csv',
    index=False,
)

In [19]:
# Read the SimpleQuestions validation CSV.
simple_val = pd.read_csv(
    filepath_or_buffer='./2-NEL_Data/1-csv_format_1/simple_questions_val_dataset.csv'
)

In [20]:
# Stack the SimpleQuestions validation rows on top of the WebQSP validation split.
# Each frame keeps its own index labels here; the index is rebuilt in a later cell.
val_data = pd.concat(objs=[simple_val, web_val], axis=0)

In [21]:
# Strip the quotes around list items in both list-like text columns.
val_data = val_data.assign(
    entity=val_data['entity'].apply(remove_quotes),
    wikidata_reply=val_data['wikidata_reply'].apply(remove_quotes),
)

In [22]:
# Drop the first and last character (the enclosing brackets) of both columns.
val_data = val_data.assign(
    entity=val_data['entity'].apply(remove_outer_bracket),
    wikidata_reply=val_data['wikidata_reply'].apply(remove_outer_bracket),
)

In [23]:
# Replace the index labels left over from the concat with a fresh 0..n-1 index.
val_data = val_data.reset_index(drop=True)
val_data

Unnamed: 0,qid,question,entity,wikidata_reply
0,Q3541144,Who was the trump ocean club international hot...,trump ocean club,"[[Q3541144, JW Marriott Panama, hotel in Panama]]"
1,Q318926,where was sasha vujačić born,sasha vujačić,"[[Q318926, Sasha Vujačić, Slovenian basketball..."
2,Q2568216,What is a film directed by wiebke von carolsfeld?,wiebke von carolsfeld,"[[Q2568216, Wiebke Carolsfeld, German film edi..."
3,Q2275923,What was Seymour Parker Gilbert's profession?,Seymour Parker Gilbert,"[[Q2275923, Seymour Parker Gilbert, American d..."
4,Q2856873,in what french city did antoine de févin die,antoine de févin,"[[Q2856873, Antoine de Févin, French composer]]"
...,...,...,...,...
5161,Q325374,where did phil mickelson go to college?,phil mickelson,"[[Q325374, Phil Mickelson, American profession..."
5162,Q49542,what do ethnic russians look like?,russians,"[[Q49542, Russians, East Slavic ethnic group, ..."
5163,Q39,what other languages does switzerland speak?,switzerland,"[[Q39, Switzerland, country in Central Europe]..."
5164,Q40715,when did jennifer lopez start on in living color?,jennifer lopez,"[[Q40715, Jennifer Lopez, American artist and ..."


In [24]:
# Number of rows whose wikidata_reply is at most two characters long
# (e.g. an empty string or a bare "[]").
int((val_data['wikidata_reply'].str.len() <= 2).sum())

329

In [25]:
# Keep only the rows whose wikidata_reply is longer than two characters
# (the mask is negated rather than flipped to ">" so missing values are
# treated exactly as before), then renumber the index.
val_data = (
    val_data
    .loc[~(val_data['wikidata_reply'].str.len() <= 2)]
    .reset_index(drop=True)
)
val_data

Unnamed: 0,qid,question,entity,wikidata_reply
0,Q3541144,Who was the trump ocean club international hot...,trump ocean club,"[[Q3541144, JW Marriott Panama, hotel in Panama]]"
1,Q318926,where was sasha vujačić born,sasha vujačić,"[[Q318926, Sasha Vujačić, Slovenian basketball..."
2,Q2568216,What is a film directed by wiebke von carolsfeld?,wiebke von carolsfeld,"[[Q2568216, Wiebke Carolsfeld, German film edi..."
3,Q2275923,What was Seymour Parker Gilbert's profession?,Seymour Parker Gilbert,"[[Q2275923, Seymour Parker Gilbert, American d..."
4,Q2856873,in what french city did antoine de févin die,antoine de févin,"[[Q2856873, Antoine de Févin, French composer]]"
...,...,...,...,...
4832,Q325374,where did phil mickelson go to college?,phil mickelson,"[[Q325374, Phil Mickelson, American profession..."
4833,Q49542,what do ethnic russians look like?,russians,"[[Q49542, Russians, East Slavic ethnic group, ..."
4834,Q39,what other languages does switzerland speak?,switzerland,"[[Q39, Switzerland, country in Central Europe]..."
4835,Q40715,when did jennifer lopez start on in living color?,jennifer lopez,"[[Q40715, Jennifer Lopez, American artist and ..."


In [26]:
# Wrap the three text columns in "**" markers.
val_data = val_data.assign(
    question=val_data['question'].apply(add_asterisks),
    entity=val_data['entity'].apply(add_asterisks),
    wikidata_reply=val_data['wikidata_reply'].apply(add_asterisks),
)
val_data

Unnamed: 0,qid,question,entity,wikidata_reply
0,Q3541144,**Who was the trump ocean club international h...,**trump ocean club**,"**[[Q3541144, JW Marriott Panama, hotel in Pan..."
1,Q318926,**where was sasha vujačić born**,**sasha vujačić**,"**[[Q318926, Sasha Vujačić, Slovenian basketba..."
2,Q2568216,**What is a film directed by wiebke von carols...,**wiebke von carolsfeld**,"**[[Q2568216, Wiebke Carolsfeld, German film e..."
3,Q2275923,**What was Seymour Parker Gilbert's profession?**,**Seymour Parker Gilbert**,"**[[Q2275923, Seymour Parker Gilbert, American..."
4,Q2856873,**in what french city did antoine de févin die**,**antoine de févin**,"**[[Q2856873, Antoine de Févin, French compose..."
...,...,...,...,...
4832,Q325374,**where did phil mickelson go to college?**,**phil mickelson**,"**[[Q325374, Phil Mickelson, American professi..."
4833,Q49542,**what do ethnic russians look like?**,**russians**,"**[[Q49542, Russians, East Slavic ethnic group..."
4834,Q39,**what other languages does switzerland speak?**,**switzerland**,"**[[Q39, Switzerland, country in Central Europ..."
4835,Q40715,**when did jennifer lopez start on in living c...,**jennifer lopez**,"**[[Q40715, Jennifer Lopez, American artist an..."


In [27]:
# Write the prepared validation set to disk without the index column.
val_data.to_csv(
    path_or_buf='./2-NEL_Data/2-csv_format_2/val_data.csv',
    index=False,
)

In [42]:
# Read the SimpleQuestions test CSV.
simple_test = pd.read_csv(
    filepath_or_buffer='./2-NEL_Data/1-csv_format_1/simple_questions_test_dataset.csv'
)

In [43]:
# Stack the SimpleQuestions test rows on top of the WebQSP test split.
# Each frame keeps its own index labels here; the index is rebuilt in a later cell.
test_data = pd.concat(objs=[simple_test, web_test], axis=0)

In [44]:
# Strip the quotes around list items in both list-like text columns.
test_data = test_data.assign(
    entity=test_data['entity'].apply(remove_quotes),
    wikidata_reply=test_data['wikidata_reply'].apply(remove_quotes),
)

In [45]:
# Drop the first and last character (the enclosing brackets) of both columns.
test_data = test_data.assign(
    entity=test_data['entity'].apply(remove_outer_bracket),
    wikidata_reply=test_data['wikidata_reply'].apply(remove_outer_bracket),
)

In [46]:
# Replace the index labels left over from the concat with a fresh 0..n-1 index.
test_data = test_data.reset_index(drop=True)
test_data

Unnamed: 0,qid,question,entity,wikidata_reply
0,Q5487302,Which genre of album is harder.....faster?,,
1,Q16330302,what city was alex golfis born in,alex golfis,"[[Q16330302, Alex Golfis, Greek actor (1948-20..."
2,Q16225521,what film is by the writer phil hay?,phil hay,"[[Q16225521, Phil Hay, screenwriter], [Q718198..."
3,Q7358590,Where did roger marquis die,roger marquis,"[[Q7358592, Roger Marquis, 2nd Earl of Woolton..."
4,Q154335,what was the cause of death of yves klein,yves klein,"[[Q154335, Yves Klein, 20th century French vis..."
...,...,...,...,...
10554,Q74394,where is fresno texas located?,fresno texas,[]
10555,Q96,what type of government does mexico use?,mexico,"[[Q96, Mexico, sovereign state in North Americ..."
10556,Q270975,what team rasheed wallace?,rasheed wallace,"[[Q270975, Rasheed Wallace, American basketbal..."
10557,Q706908,what team does pudge rodriguez play for?,pudge rodriguez,"[[Q706908, Iván Rodríguez, Puerto Rican Hall o..."


In [47]:
# Number of rows whose wikidata_reply is at most two characters long
# (e.g. an empty string or a bare "[]").
int((test_data['wikidata_reply'].str.len() <= 2).sum())

653

In [48]:
# Keep only the rows whose wikidata_reply is longer than two characters
# (the mask is negated rather than flipped to ">" so missing values are
# treated exactly as before), then renumber the index.
test_data = (
    test_data
    .loc[~(test_data['wikidata_reply'].str.len() <= 2)]
    .reset_index(drop=True)
)

In [49]:
# Wrap the three text columns in "**" markers.
test_data = test_data.assign(
    question=test_data['question'].apply(add_asterisks),
    entity=test_data['entity'].apply(add_asterisks),
    wikidata_reply=test_data['wikidata_reply'].apply(add_asterisks),
)

In [50]:
# Display the test frame after filtering and asterisk-wrapping for a visual check.
test_data

Unnamed: 0,qid,question,entity,wikidata_reply
0,Q16330302,**what city was alex golfis born in**,**alex golfis**,"**[[Q16330302, Alex Golfis, Greek actor (1948-..."
1,Q16225521,**what film is by the writer phil hay?**,**phil hay**,"**[[Q16225521, Phil Hay, screenwriter], [Q7181..."
2,Q7358590,**Where did roger marquis die**,**roger marquis**,"**[[Q7358592, Roger Marquis, 2nd Earl of Woolt..."
3,Q154335,**what was the cause of death of yves klein**,**yves klein**,"**[[Q154335, Yves Klein, 20th century French v..."
4,Q1761,**Which equestrian was born in dublin?**,**dublin**,"**[[Q1761, Dublin, capital city of Ireland], [..."
...,...,...,...,...
9901,Q242654,**what party did paul keating belong to?**,**paul keating**,"**[[Q242654, Paul Keating, Australian politici..."
9902,Q96,**what type of government does mexico use?**,**mexico**,"**[[Q96, Mexico, sovereign state in North Amer..."
9903,Q270975,**what team rasheed wallace?**,**rasheed wallace**,"**[[Q270975, Rasheed Wallace, American basketb..."
9904,Q706908,**what team does pudge rodriguez play for?**,**pudge rodriguez**,"**[[Q706908, Iván Rodríguez, Puerto Rican Hall..."


In [51]:
# Write the prepared test set to disk without the index column.
test_data.to_csv(
    path_or_buf='./2-NEL_Data/2-csv_format_2/test_data.csv',
    index=False,
)