In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# input files, one per relation type; each is read as tab-separated text by load_data
FILENAMEC = 'child_triple.txt'
FILENAMEP = 'parent_triple.txt'
FILENAMES = 'spouse_triple.txt'

In [3]:
def load_data(filename):
    """Read a tab-separated triple file into a three-column dataframe.

    Parameters
    ----------
    filename : str or file-like
        Path (or open buffer) of a header-less, tab-separated file.

    Returns
    -------
    pandas.DataFrame
        The non-empty columns of the file, with columns 0, 2 and 4 renamed
        to ``person1``, ``relation`` and ``person2``.
    """
    # reading our data from the respective text files
    data = pd.read_csv(filename, delimiter="\t", header=None)

    # dropping columns that are completely empty; how="all" matters here,
    # because the default (how="any") would throw away a whole name column
    # as soon as a single row has a missing value in it
    data = data.dropna(axis=1, how="all")

    # renaming our columns from 0, 2, and 4 to person1, relation, and person2
    data = data.rename(columns={0: 'person1', 2: 'relation', 4: 'person2'})

    return data

In [4]:
def check(dataframe):
    """Print a short sanity report for a relation dataframe.

    The report contains the shape of the frame, the number of distinct
    values in the ``person1`` and ``person2`` columns, and the number of
    missing values in the ``person1``, ``relation`` and ``person2`` columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding at least the columns ``person1``, ``relation`` and
        ``person2``.
    """
    #checking the number of rows and columns in the dataset
    n_rows, n_cols = dataframe.shape
    print(f'The data has {n_rows} rows and {n_cols} colums.')

    #checking number of unique entries in the respective columns
    # dropna=False keeps a missing value counted as one distinct entry,
    # which is what len(column.unique()) reports as well
    unique1 = dataframe['person1'].nunique(dropna=False)
    unique2 = dataframe['person2'].nunique(dropna=False)
    print(f'There are {unique1} unique values in Person1 column and {unique2} unique values in Person2 column.')

    # the counts are computed once and looked up by column label: positional
    # lookups on a labelled Series are deprecated in pandas and report the
    # wrong columns as soon as the frame has an extra leading column
    missing = dataframe[['person1', 'relation', 'person2']].isnull().sum()
    print(f"Number of empty rows in the respective columns are person1: {missing['person1']}, relation: {missing['relation']}, person2: {missing['person2']}")

In [5]:
def merge_column(data_x, data_xy, data_y, col1='person1', col2='relation', col3='person2', diff=True):
    """Add a space-joined 'relation triplet' column to ``data_xy``.

    The triplet is ``<col1> <col2> <col3>``. With ``diff=True`` the first
    part is read from ``data_x`` and the last part from ``data_y``; with
    ``diff=False`` all three parts are read from ``data_xy`` and the other
    two frames are not touched. ``data_xy`` is modified in place and returned.
    """
    # pick the frames that supply the outer two parts of the triplet
    first_source, last_source = (data_x, data_y) if diff else (data_xy, data_xy)

    pieces = [first_source[col1], data_xy[col2], last_source[col3]]
    head, link, tail = (piece.astype(str) for piece in pieces)

    data_xy['relation triplet'] = head + ' ' + link + ' ' + tail
    return data_xy

In [6]:
child_data = load_data(FILENAMEC)
child_data.head()

Unnamed: 0,person1,relation,person2
0,Abraham Lincoln,child,Robert Todd Lincoln
1,Abraham Lincoln,child,Edward Baker Lincoln
2,Abraham Lincoln,child,William Wallace Lincoln
3,Abraham Lincoln,child,Tad Lincoln
4,Alfred Hitchcock,child,Patricia Hitchcock


**The structure of the Child text file is "Parent - Relation - Child".**

In [7]:
parent_data = load_data(FILENAMEP)
parent_data.head()

Unnamed: 0,person1,relation,person2
0,Andrei Tarkovsky,parent,Arseny Tarkovsky
1,Alexander the Great,parent,Philip II of Macedon
2,Alexander the Great,parent,Olympias
3,Attila,parent,Mundzuk
4,Alp Arslan,parent,Chaghri Beg


**The structure of the Parent text file is "Child - Relation - Parent".**

In [8]:
spouse_data = load_data(FILENAMES)
spouse_data.head()

Unnamed: 0,person1,relation,person2
0,Abraham Lincoln,spouse,Mary Todd Lincoln
1,Allan Dwan,spouse,Pauline Bush (actress)
2,Andrei Tarkovsky,spouse,Irma Raush
3,Andrei Tarkovsky,spouse,Larisa Tarkovskaya
4,Albert Einstein,spouse,Mileva Marić


**The structure of the Spouse text file is "Spouse - Relation - Spouse".**
#### Gender information is unknown in all three files.

In [9]:
# same sanity report for each of the three frames, each followed by a rule
rule = "------------------------------------------------------------------------------------------------"
reports = (
    ("Child text data", child_data),
    ("Parent text data", parent_data),
    ("Spouse text data", spouse_data),
)
for heading, frame in reports:
    print(heading)
    check(frame)
    print(rule)

Child text data
The data has 14139 rows and 3 colums.
There are 10003 unique values in Person1 column and 12356 unique values in Person2 column.
Number of empty rows in the respective columns are person1: 0, relation: 0, person2: 0
------------------------------------------------------------------------------------------------
Parent text data
The data has 27824 rows and 3 colums.
There are 19172 unique values in Person1 column and 17971 unique values in Person2 column.
Number of empty rows in the respective columns are person1: 0, relation: 0, person2: 0
------------------------------------------------------------------------------------------------
Spouse text data
The data has 34163 rows and 3 colums.
There are 30223 unique values in Person1 column and 31475 unique values in Person2 column.
Number of empty rows in the respective columns are person1: 0, relation: 0, person2: 0
------------------------------------------------------------------------------------------------


#### We don't have any empty values in any of our three dataframes.

In [10]:
# total number of rows, computed from the frames instead of copying the printed counts
len(child_data) + len(parent_data) + len(spouse_data)

76126

In [11]:
# sum of the per-file distinct person1 counts, computed instead of hand-copied
sum(frame['person1'].nunique() for frame in (child_data, parent_data, spouse_data))

59398

In [12]:
# sum of the per-file distinct person2 counts, computed instead of hand-copied
sum(frame['person2'].nunique() for frame in (child_data, parent_data, spouse_data))

61802

There are **76,126** rows in our dataset in total. Summing the per-file counts gives roughly **59,398** unique values in the *'person1'* column and **61,802** unique values in the *'person2'* column. This is only a rough estimate for now, because the same person can appear in more than one file. We will find out the actual number of unique values when we merge the three dataframes into one and the three columns of this dataframe into one column. 

In [13]:
# just an example of how unclean the data is
spouse_info = spouse_data['person2'].tolist()
# show only the first entries: the full list has one item per row of the spouse data
spouse_info[:50]

['Mary Todd Lincoln',
 'Pauline Bush (actress)',
 'Irma Raush',
 'Larisa Tarkovskaya',
 'Mileva Marić',
 'Elsa Löwenthal',
 'Roxana',
 'Stateira II',
 'Parysatis II',
 'Kreka',
 'Ildico',
 'Alma Reville',
 'Mabel Hubbard',
 'Yōko Yaguchi',
 'Edna Mayne Hull',
 'William King-Noel, 1st Earl of Lovelace',
 'Archie Christie',
 'Max Mallowan',
 'Helene Bresslau Schweitzer',
 'Harry Bresslau',
 'Asandhimitra',
 'Jane Mackenzie',
 'Faustina the Elder',
 'Clodia Pulchra',
 'Scribonia',
 'Livia',
 'Abby May',
 'Rose Edith Kelly',
 'Bonnie MacBird',
 'Dorothea of Denmark, Duchess of Prussia',
 'Anna Marie of Brunswick-Lüneburg',
 'Sophie of Winzenburg',
 'Rabia Sultan',
 'Emine Mihrişah Sultan',
 'Rabia Şermi Sultan',
 'Kösem Sultan',
 'Mahfiruz Hatice Sultan',
 'Philip II of France',
 'Germanicus',
 'Gnaeus Domitius Ahenobarbus (consul 32)',
 'Gaius Sallustius Crispus Passienus',
 'Claudius',
 'Chlothsind',
 'Rosamund (Gepid)',
 'Joan of England, Queen of Scots',
 'Marie de Coucy',
 'Helena of 

The relation column does not need any kind of cleaning or modification. However, the person1 and person2 columns of all three dataframes need extensive cleaning. There is a lot of extraneous information in these columns. A lot of the rows have years mentioned, while others have titles like "Queen of Scots", "Earl of Lovelace", or "Duchess of Prussia", to name a few. 

###  I have two methods to clean these columns with maximum retention of information. 

1. The first method will split a column into multiple columns based on the presence of whitespace.    

* This will turn a column that looks like "**Prince William Harry, the third**" into **"Prince", "William", "Harry,", "the", "third"**.    


2. The second method will split a column into multiple columns based on the presence of commas.     

* This will turn a column that looks like "**Prince William Harry, the third**" into "**Prince William Harry**", and "**the third**".  

In [14]:
def split_column(data, how = ' ', exp = True, col_name = 'Name'):
    """Split every string of a Series on a separator.

    Parameters
    ----------
    data : pandas.Series
        Series of strings to split.
    how : str
        Separator to split on (whitespace, comma, or any other specification).
    exp : bool
        Passed to ``Series.str.split`` as ``expand``. When True the pieces
        are spread over the columns of a dataframe.
    col_name : str
        Label given to the first piece when splitting on a comma.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        With ``exp=True`` a dataframe whose leading columns are labelled:
        ``col_name`` for a comma split, otherwise "First Name",
        "Middle Name" and "Last Name" for pieces 0, 1 and 2. With
        ``exp=False`` the unlabelled Series of lists.
    """
    # splitting a column in our dataframe based on whitespace, comma, or any other specification
    data_split = data.str.split(how, expand=exp)

    # an unexpanded split is a Series of lists: it has no columns to rename,
    # and Series.rename does not accept a `columns` argument
    if not isinstance(data_split, pd.DataFrame):
        return data_split

    if how == ',':
        labels = {0: col_name}
    else:
        # pieces are labelled by position only, so the second word of a
        # two-word name ends up under "Middle Name"
        labels = {0: "First Name", 1: "Middle Name", 2: "Last Name"}

    return data_split.rename(columns=labels)

In [15]:
spouse_person2 = split_column(spouse_data['person2'], ',')

In [16]:
spouse_person2.iloc[1:10]

Unnamed: 0,Name,1,2
1,Pauline Bush (actress),,
2,Irma Raush,,
3,Larisa Tarkovskaya,,
4,Mileva Marić,,
5,Elsa Löwenthal,,
6,Roxana,,
7,Stateira II,,
8,Parysatis II,,
9,Kreka,,


In [17]:
# taking a look at row 15 after splitting our column
spouse_person2.iloc[[15]]

Unnamed: 0,Name,1,2
15,William King-Noel,1st Earl of Lovelace,


In [18]:
# taking a look at row 59 after splitting our column
spouse_person2.iloc[[59]]

Unnamed: 0,Name,1,2
59,Beatrice D'Este (Queen Consort of Hungary),,


Initially, the comma-based approach seems promising, but in some cases, like row 59, we end up with useless information inside parentheses. If we decide to go with this approach we will have a lot of trivial information in our training data.

In [19]:
spouse_person2_ws = split_column(spouse_data['person2'], ' ') 

In [20]:
spouse_person2_ws.iloc[1:10]

Unnamed: 0,First Name,Middle Name,Last Name,3,4,5,6,7,8,9,10,11,12,13
1,Pauline,Bush,(actress),,,,,,,,,,,
2,Irma,Raush,,,,,,,,,,,,
3,Larisa,Tarkovskaya,,,,,,,,,,,,
4,Mileva,Marić,,,,,,,,,,,,
5,Elsa,Löwenthal,,,,,,,,,,,,
6,Roxana,,,,,,,,,,,,,
7,Stateira,II,,,,,,,,,,,,
8,Parysatis,II,,,,,,,,,,,,
9,Kreka,,,,,,,,,,,,,


In [21]:
# taking a look at row 15 after splitting our column
spouse_person2_ws.iloc[[15]]

Unnamed: 0,First Name,Middle Name,Last Name,3,4,5,6,7,8,9,10,11,12,13
15,William,"King-Noel,",1st,Earl,of,Lovelace,,,,,,,,


In [22]:
# taking a look at row 59 after splitting our column
spouse_person2_ws.iloc[[59]]

Unnamed: 0,First Name,Middle Name,Last Name,3,4,5,6,7,8,9,10,11,12,13
59,Beatrice,D'Este,(Queen,Consort,of,Hungary),,,,,,,,


However, if we assume that a name consists of at most 2-3 words, then we can simply use the whitespace-based approach and split our column into first, middle and last names. Later we can combine these and drop the rest of the columns. A big risk of this approach is our assumption, because the dataset is too large to actually check all entries manually.

Even after this, some anomalies exist. While going through the data, an entry **'1983 Major League Baseball season'** was found. The presence of such an entry in the person2 column of the spouse data renders the entire row useless, because we are only interested in names of people and the relations between them. At this point, it is unknown how many entries are similar to this.

## Using regular expressions to clean the data.

I ran some experiments on the data to clean it using the regex module of python. In short these regular expressions have helped us to remove **initials, non alphanumeric characters, anything within parenthesis, roman numerals, and multiple spaces** from our data. At last we are converting our data into lower case alphabets.

Detailed experiments can be found below in this notebook.

In [23]:
def clean_column(text):
    """Normalise a name (or a whole relation triplet) to plain lower-case text.

    The steps, in order: drop parenthesised remarks, drop dotted initials,
    drop punctuation, drop roman numerals, collapse runs of spaces, lower-case
    and strip the result. Digits outside parentheses are kept.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # removing anything within parenthesis for e.g (1943-46), (actress);
    # this has to run before the punctuation steps, which would otherwise
    # break up the content while leaving the brackets themselves in place
    text = re.sub(r"\([^()]*\)", r" ", text)

    # removing initials from names
    text = re.sub(r"[A-Z]\.", r" ", text)
    # replacing separators by a space so that the words around them stay apart
    text = re.sub(r"([-;;.,!?<=>])", r" ", text)

    # removing the remaining special characters; the hyphen is escaped so it
    # is a literal character and not a "/" to ";" range, which would also
    # match every digit
    text = re.sub(r'([/\-:;.,"!&?<=>\\])', r"", text)
    # removing roman numerals like I, II, V, IX etc.
    # NOTE: a standalone upper-case token that is itself a valid numeral
    # (e.g. a lone "I" or "M") is removed as well
    text = re.sub(r"\b(?=[MDCLXVIΙ])M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})([IΙ]X|[IΙ]V|V?[IΙ]{0,3})\b\.?", r" ", text)
    # removing multiple spaces
    text = re.sub(" +", " ", text)

    # convert the text into lower case
    text = text.lower()
    # removing any blank spaces from the beginning or ending of text
    text = text.strip()

    return text

### Removing any information present within parenthesis.

If we use the below regular expression we can convert **"Allan Dwan spouse Pauline Bush(Actress)"** to **'allan dwan spouse pauline bush'**.

In [24]:
text = "Allan Dwan spouse Pauline Bush(Actress)"
text = re.sub(r"\([^()]*\)", r"", text)
# str methods return a new string, so they are chained to make sure both
# the stripping and the lower-casing reach the displayed value
text.strip().lower()

'allan dwan spouse pauline bush'

In [25]:
text = 'Beatrice of Castile(1242–1303)'
text = re.sub(r"\([^()]*\)", r"", text)
# str methods return a new string, so they are chained to make sure both
# the stripping and the lower-casing reach the displayed value
text.strip().lower()

'beatrice of castile'

### Removing roman numerals from our data.

Roman numeral system: **I = 1, V = 5, X = 10, L = 50, C = 100, D = 500, M = 1,000**. Using a combination of these letters we can represent any number in the Roman numeral system. For example, 4 is represented by IV. The regular expression only matches whole tokens, so the letters I, V, X, L, C, D, and M are left alone when they are part of a word (the trailing X of 'TuringX' below survives). A standalone upper-case token that is itself a valid numeral, such as a lone 'I' or 'M', is removed.

If we use the below regular expression we can convert **"Alan TuringX XVII I IV IX M"** to **'Alan TuringX'**.

In [26]:
text = "Alan TuringX XVII I IV IX M"
text = re.sub(r"\b(?=[MDCLXVIΙ])M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})([IΙ]X|[IΙ]V|V?[IΙ]{0,3})\b\.?", r"", text)
text.strip()

'Alan TuringX'

### Removing initials from a name.

If we use the below regular expression we can remove any initial from a name as long as the initial is an upper-case letter followed by a dot. We can convert **'A. D. van vogt'** to **'van vogt'**.

In [27]:
text = 'A. D. van vogt'
text = re.sub(r'[A-Z]\.', r'', text)
text.strip()

'van vogt'

### Removing special character from a name

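Special characters include - '**/**', '**:**', '**;**', '**.**', '**,**', '**"**', '**!**', '**&**', '**?**', '**<**', '**=**', '**>**', and '**\**'. If we use the below regular expression we can remove any of the above mentioned special characters from a name. We can convert **'Charles// \\"Buddy\\" &R;o:gers'** to **'Charles Buddy Rogers'**. Note that inside the character class `/-;` is a range, so the expression also matches the digits 0-9, while a literal '**-**' is not matched (it survives in the second example below).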

In [28]:
text = 'Charles// \\"Buddy\\" &R;o:gers'
text = re.sub(r'([/-;;.,"!&?<=>\\])', r"", text)
text.strip()

'Charles Buddy Rogers'

In [29]:
text = 'duke of prussia spouse anna marie of //brunswick" - /lüneburg'
text = re.sub(r'([/-;;.,"!?<=>])', r"", text)
text = re.sub(" +", " ", text)
text.strip()

'duke of prussia spouse anna marie of brunswick - lüneburg'

### Removing trivial information under names from our data.

While taking a look at our dataset we came across some trivial entries under the names of people. Some of these entries were '**1993 in film**', '**1997 in film**', and '**1983 Major League Baseball season**'. To handle these kinds of entries we will use a custom function to check if a name starts with a non alphabet and we will note down the index values of such entries for removing them later.

In [30]:
def check_name(data, col_name='person2'):
    """Return the row positions whose name does not start with a letter.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to inspect.
    col_name : str
        Name of the column to inspect.

    Returns
    -------
    list of int
        Zero-based positions (not index labels) of the suspicious rows, in
        order. A value is flagged when it is not a string, is empty, or its
        first character is not alphabetic according to ``str.isalpha``
        (which accepts accented letters too).
    """
    def starts_with_letter(val):
        # val[:1] is "" for an empty string, and "".isalpha() is False, so
        # empty and missing values are flagged instead of raising
        return isinstance(val, str) and val[:1].isalpha()

    return [i for i, val in enumerate(data[col_name]) if not starts_with_letter(val)]

### Splitting and merging columns to remove trivial information from data.

In [31]:
child_split_1 = split_column(child_data['person1'], ',', col_name = 'person1')
child_split_1.head()

Unnamed: 0,person1,1,2
0,Abraham Lincoln,,
1,Abraham Lincoln,,
2,Abraham Lincoln,,
3,Abraham Lincoln,,
4,Alfred Hitchcock,,


In [32]:
child_split_2 = split_column(child_data['person2'], ',', col_name = 'person2')
child_split_2.head()

Unnamed: 0,person2,1,2
0,Robert Todd Lincoln,,
1,Edward Baker Lincoln,,
2,William Wallace Lincoln,,
3,Tad Lincoln,,
4,Patricia Hitchcock,,


In [33]:
#child_data['merged'] = child_split_1['person1'].astype(str) + ' ' + child_data['relation'].astype(str) + ' ' + child_split_2['person2'].astype(str)
#child_data.head()

In [34]:
# person2 (the child) is placed first so that the triplet reads "<child> child <parent>";
# the names come from the comma-split frames, i.e. only the text before the first comma
child_data = merge_column(child_split_2, child_data, child_split_1, col1='person2', col2='relation', col3='person1')
child_data.head()

Unnamed: 0,person1,relation,person2,relation triplet
0,Abraham Lincoln,child,Robert Todd Lincoln,Robert Todd Lincoln child Abraham Lincoln
1,Abraham Lincoln,child,Edward Baker Lincoln,Edward Baker Lincoln child Abraham Lincoln
2,Abraham Lincoln,child,William Wallace Lincoln,William Wallace Lincoln child Abraham Lincoln
3,Abraham Lincoln,child,Tad Lincoln,Tad Lincoln child Abraham Lincoln
4,Alfred Hitchcock,child,Patricia Hitchcock,Patricia Hitchcock child Alfred Hitchcock


In [35]:
parent_split_1 = split_column(parent_data['person1'], ',', col_name = 'person1')
parent_split_2 = split_column(parent_data['person2'], ',', col_name = 'person2')

In [36]:
# person2 (the parent) is placed first so that the triplet reads "<parent> parent <child>";
# the names come from the comma-split frames, i.e. only the text before the first comma
parent_data = merge_column(parent_split_2, parent_data, parent_split_1, col1='person2', col2='relation', col3='person1')
parent_data.head()

Unnamed: 0,person1,relation,person2,relation triplet
0,Andrei Tarkovsky,parent,Arseny Tarkovsky,Arseny Tarkovsky parent Andrei Tarkovsky
1,Alexander the Great,parent,Philip II of Macedon,Philip II of Macedon parent Alexander the Great
2,Alexander the Great,parent,Olympias,Olympias parent Alexander the Great
3,Attila,parent,Mundzuk,Mundzuk parent Attila
4,Alp Arslan,parent,Chaghri Beg,Chaghri Beg parent Alp Arslan


In [37]:
parent_split_1.head()

Unnamed: 0,person1,1
0,Andrei Tarkovsky,
1,Alexander the Great,
2,Alexander the Great,
3,Attila,
4,Alp Arslan,


In [38]:
spouse_split_1 = split_column(spouse_data['person1'], ',', col_name = 'person1')
spouse_split_2 = split_column(spouse_data['person2'], ',', col_name = 'person2')

In [39]:
spouse_data = merge_column(spouse_split_1, spouse_data, spouse_split_2)
spouse_data.head()

Unnamed: 0,person1,relation,person2,relation triplet
0,Abraham Lincoln,spouse,Mary Todd Lincoln,Abraham Lincoln spouse Mary Todd Lincoln
1,Allan Dwan,spouse,Pauline Bush (actress),Allan Dwan spouse Pauline Bush (actress)
2,Andrei Tarkovsky,spouse,Irma Raush,Andrei Tarkovsky spouse Irma Raush
3,Andrei Tarkovsky,spouse,Larisa Tarkovskaya,Andrei Tarkovsky spouse Larisa Tarkovskaya
4,Albert Einstein,spouse,Mileva Marić,Albert Einstein spouse Mileva Marić


In [40]:
# keep only the first row for every distinct person1 value and renumber the index from 0
# NOTE(review): this discards every further spouse of the same person, e.g. the
# second 'Andrei Tarkovsky' row shown by head() above -- confirm that is intended
spouse_data.drop_duplicates(subset=['person1'], keep='first', inplace=True, ignore_index = True)

In [41]:
check(spouse_data)

The data has 30223 rows and 4 colums.
There are 30223 unique values in Person1 column and 28255 unique values in Person2 column.
Number of empty rows in the respective columns are person1: 0, relation: 0, person2: 0


In [42]:
# keep only the first row for every distinct person2 value and renumber the index from 0
# NOTE(review): rows are dropped whenever the same person2 occurs again, even
# with a different person1 -- confirm that is intended
spouse_data.drop_duplicates(subset=['person2'], keep='first', inplace=True, ignore_index = True)

In [43]:
check(spouse_data)

The data has 28255 rows and 4 colums.
There are 28255 unique values in Person1 column and 28255 unique values in Person2 column.
Number of empty rows in the respective columns are person1: 0, relation: 0, person2: 0


In [44]:
spouse_data.head()

Unnamed: 0,person1,relation,person2,relation triplet
0,Abraham Lincoln,spouse,Mary Todd Lincoln,Abraham Lincoln spouse Mary Todd Lincoln
1,Allan Dwan,spouse,Pauline Bush (actress),Allan Dwan spouse Pauline Bush (actress)
2,Andrei Tarkovsky,spouse,Irma Raush,Andrei Tarkovsky spouse Irma Raush
3,Albert Einstein,spouse,Mileva Marić,Albert Einstein spouse Mileva Marić
4,Alexander the Great,spouse,Roxana,Alexander the Great spouse Roxana


In [45]:
spouse_data.tail()

Unnamed: 0,person1,relation,person2,relation triplet
28250,John Graham (Australian politician),spouse,Jenny McAllister,John Graham (Australian politician) spouse Jen...
28251,Lisa Joy,spouse,Jonathan Nolan,Lisa Joy spouse Jonathan Nolan
28252,Shahriar Sayeed Husain,spouse,Farah Mahbub (judge),Shahriar Sayeed Husain spouse Farah Mahbub (ju...
28253,Hazrat Khanoum Nosrat Saltaneh,spouse,Ed-Dowleh,Hazrat Khanoum Nosrat Saltaneh spouse Ed-Dowleh
28254,Chantal Hochuli,spouse,Prince Ernst August of Hanover (born 1954),Chantal Hochuli spouse Prince Ernst August of ...


In [46]:
text = '1993 in film'
text[0].isdigit()

True

In [47]:
test_data = {"person1": ['Ajay', 'Anu', 'Abhinav', 'Tommy'], "relation": ['siblings', 'siblings', 'siblings', 'siblings'], "person2": ['1993 in game', 'Rashi', 'Vanshi', '#psy']}
df = pd.DataFrame(data=test_data)

In [48]:
df

Unnamed: 0,person1,relation,person2
0,Ajay,siblings,1993 in game
1,Anu,siblings,Rashi
2,Abhinav,siblings,Vanshi
3,Tommy,siblings,#psy


In [49]:
x, _ = df.shape
x

4

In [50]:
# positions of the person2 values whose first character is not a letter
idx = [pos for pos, name in enumerate(df['person2']) if not name[0].isalpha()]

In [51]:
idx

[0, 3]

In [64]:
def drop_rows(data, labels, axis=0, inplace=True):
    """Drop the given labels from a dataframe and reset its index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to drop from.
    labels : list
        Index labels (or column labels when ``axis=1``) to drop.
    axis : int
        Axis passed to ``DataFrame.drop``.
    inplace : bool
        When True, ``data`` itself is modified and returned. When False,
        ``data`` is left untouched and a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The frame without the dropped labels. The previous index is kept as
        a new leading ``index`` column, because ``reset_index`` is called
        without ``drop=True``.
    """
    if inplace:
        # dropping specific rows from our dataframe using a list of indexes
        data.drop(labels=labels, axis=axis, inplace=True)
        # reset the index of our dataframe
        data.reset_index(inplace=True)
        return data

    # without inplace both calls return new frames; they have to be chained
    # and returned, otherwise the caller gets the unmodified input back
    return data.drop(labels=labels, axis=axis).reset_index()

In [65]:
idx = check_name(spouse_data)

In [66]:
idx

[100,
 927,
 1048,
 1840,
 1880,
 2106,
 2134,
 2989,
 3309,
 3400,
 3453,
 3550,
 3631,
 4474,
 5146,
 5434,
 5926,
 5937,
 6671,
 6805,
 7027,
 7623,
 7688,
 7756,
 7937,
 8467,
 8789,
 8839,
 9423,
 10045,
 10149,
 10163,
 11209,
 12157,
 12383,
 12963,
 16826,
 19086,
 20718,
 21219,
 22015,
 24948,
 27349,
 27882]

In [55]:
spouse_data.iloc[[2134]]

Unnamed: 0,person1,relation,person2,relation triplet
2134,Peter Fonda,spouse,1974 in film,Peter Fonda spouse 1974 in film


In [56]:
len(idx)

44

In [57]:
# one single-row frame per flagged position, for eyeballing the bad entries
test = [spouse_data.iloc[[pos]] for pos in idx]

In [58]:
test

[        person1 relation                            person2  \
 100  Bob Costas   spouse  1983 Major League Baseball season   
 
                                       relation triplet  
 100  Bob Costas spouse 1983 Major League Baseball s...  ,
              person1 relation              person2  \
 927  Whoopi Goldberg   spouse  66th Academy Awards   
 
                                relation triplet  
 927  Whoopi Goldberg spouse 66th Academy Awards  ,
           person1 relation              person2  \
 1048  Bette Davis   spouse  11th Academy Awards   
 
                             relation triplet  
 1048  Bette Davis spouse 11th Academy Awards  ,
            person1 relation      person2                 relation triplet
 1840  Phil Collins   spouse  1999 (song)  Phil Collins spouse 1999 (song),
              person1 relation             person2  \
 1880  Dwayne Johnson   spouse  1997 Slammy Awards   
 
                               relation triplet  
 1880  Dwayne Johnson sp

In [68]:
spouse_data.head()

Unnamed: 0,person1,relation,person2,relation triplet
0,Abraham Lincoln,spouse,Mary Todd Lincoln,Abraham Lincoln spouse Mary Todd Lincoln
1,Allan Dwan,spouse,Pauline Bush (actress),Allan Dwan spouse Pauline Bush (actress)
2,Andrei Tarkovsky,spouse,Irma Raush,Andrei Tarkovsky spouse Irma Raush
3,Albert Einstein,spouse,Mileva Marić,Albert Einstein spouse Mileva Marić
4,Alexander the Great,spouse,Roxana,Alexander the Great spouse Roxana


In [69]:
spouse_data.shape

(28255, 4)

In [70]:
spouse_data = drop_rows(spouse_data, idx)

In [71]:
spouse_data.shape

(28211, 5)

In [72]:
idx = check_name(spouse_data, col_name='person1')

In [73]:
idx

[74, 27032]

In [74]:
spouse_data.iloc[[74]]

Unnamed: 0,index,person1,relation,person2,relation triplet
74,74,`Abdu'l-Bahá,spouse,Munírih Khánum,`Abdu'l-Bahá spouse Munírih Khánum


In [75]:
spouse_data.iloc[[27032]]

Unnamed: 0,index,person1,relation,person2,relation triplet
27032,27074,(George) Clyde Fisher,spouse,Te Ata Fisher,(George) Clyde Fisher spouse Te Ata Fisher


In [77]:
# remove the 'index' column that the reset_index call inside drop_rows added
spouse_data.drop(['index'], axis=1, inplace=True)

In [78]:
spouse_data

Unnamed: 0,person1,relation,person2,relation triplet
0,Abraham Lincoln,spouse,Mary Todd Lincoln,Abraham Lincoln spouse Mary Todd Lincoln
1,Allan Dwan,spouse,Pauline Bush (actress),Allan Dwan spouse Pauline Bush (actress)
2,Andrei Tarkovsky,spouse,Irma Raush,Andrei Tarkovsky spouse Irma Raush
3,Albert Einstein,spouse,Mileva Marić,Albert Einstein spouse Mileva Marić
4,Alexander the Great,spouse,Roxana,Alexander the Great spouse Roxana
...,...,...,...,...
28206,John Graham (Australian politician),spouse,Jenny McAllister,John Graham (Australian politician) spouse Jen...
28207,Lisa Joy,spouse,Jonathan Nolan,Lisa Joy spouse Jonathan Nolan
28208,Shahriar Sayeed Husain,spouse,Farah Mahbub (judge),Shahriar Sayeed Husain spouse Farah Mahbub (ju...
28209,Hazrat Khanoum Nosrat Saltaneh,spouse,Ed-Dowleh,Hazrat Khanoum Nosrat Saltaneh spouse Ed-Dowleh


In [79]:
idx = check_name(parent_data)

In [80]:
idx

[9289, 9307]

In [81]:
parent_data.iloc[[9289]]

Unnamed: 0,person1,relation,person2,relation triplet
9289,Sharaf al-Dawla,parent,'Adud al-Dawla,'Adud al-Dawla parent Sharaf al-Dawla


In [82]:
parent_data.iloc[[9307]]

Unnamed: 0,person1,relation,person2,relation triplet
9307,Samsam al-Dawla,parent,'Adud al-Dawla,'Adud al-Dawla parent Samsam al-Dawla


In [83]:
idx = check_name(parent_data, col_name='person1')

In [84]:
idx

[120, 121, 3634, 10311, 10312, 21983, 25945, 27264]

In [85]:
parent_data.iloc[[120]]

Unnamed: 0,person1,relation,person2,relation triplet
120,`Abdu'l-Bahá,parent,Bahá’u’lláh,Bahá’u’lláh parent `Abdu'l-Bahá


In [86]:
parent_data.iloc[[121]]

Unnamed: 0,person1,relation,person2,relation triplet
121,`Abdu'l-Bahá,parent,Ásíyih Khánum,Ásíyih Khánum parent `Abdu'l-Bahá


In [87]:
parent_data.iloc[[3634]]

Unnamed: 0,person1,relation,person2,relation triplet
3634,'Abd al-Ilah,parent,Ali of Hejaz,Ali of Hejaz parent 'Abd al-Ilah


In [88]:
parent_data.iloc[[10311]]

Unnamed: 0,person1,relation,person2,relation triplet
10311,'Adud al-Dawla,parent,Rukn al-Dawla,Rukn al-Dawla parent 'Adud al-Dawla


In [89]:
parent_data.iloc[[10312]]

Unnamed: 0,person1,relation,person2,relation triplet
10312,'Adud al-Dawla,parent,Firuzanids,Firuzanids parent 'Adud al-Dawla


In [90]:
parent_data.iloc[[21983]]

Unnamed: 0,person1,relation,person2,relation triplet
21983,'Abd ar-Rahman ibn Muhammad al-Amin,parent,Muhammad al-Amin al-Kanemi,Muhammad al-Amin al-Kanemi parent 'Abd ar-Rahm...


In [91]:
parent_data.iloc[[25945]]

Unnamed: 0,person1,relation,person2,relation triplet
25945,2014 Isla Vista killings,parent,Peter Rodger,Peter Rodger parent 2014 Isla Vista killings


In [92]:
parent_data.iloc[[27264]]

Unnamed: 0,person1,relation,person2,relation triplet
27264,28th Ruler,parent,Yik'in Chan K'awiil,Yik'in Chan K'awiil parent 28th Ruler


In [93]:
idx = check_name(child_data)

In [94]:
idx

[8904, 9504, 10849]

In [96]:
child_data.iloc[[8904]]

Unnamed: 0,person1,relation,person2,relation triplet
8904,Suresh Nanda,child,1999 Delhi hit-and-run case,1999 Delhi hit-and-run case child Suresh Nanda


In [97]:
child_data.iloc[[9504]]

Unnamed: 0,person1,relation,person2,relation triplet
9504,Roberto Micheletti,child,3 (number),3 (number) child Roberto Micheletti


In [98]:
child_data.iloc[[10849]]

Unnamed: 0,person1,relation,person2,relation triplet
10849,Jack Dellal,child,£,£ child Jack Dellal


In [99]:
idx = check_name(child_data, col_name='person1')

In [100]:
idx

[]