In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
# Run the spaCy pipeline over a short sample sentence.
introduction_text = 'This tutorial is about Natural Language Processing in Spacy.'
introduction_doc = nlp(introduction_text)
# Show the raw text of every token the tokenizer produced.
print([tok.text for tok in introduction_doc])


['This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'Spacy', '.']


In [10]:
# Print each token next to its coarse-grained part-of-speech tag (token.pos_).
for token in introduction_doc:
  print(token, token.pos_)

This DET
tutorial NOUN
is AUX
about ADP
Natural PROPN
Language PROPN
Processing PROPN
in ADP
Spacy PROPN
. PUNCT


In [15]:
# Sample birth-record sentences used as input for the extraction steps.
text = [
    'On August 21 1826 a son was born to John Bon and named him Francis.',
    'On June 11 1813 a daughter was born to James Donne naming her Mary Sarah.',
    'On January 1 1832 a son was born to his father David Borne and named him John.',
]


In [16]:
import pandas as pd
# One row per sentence, stored in a single 'text' column.
df = pd.DataFrame({'text': text})
df.head()

Unnamed: 0,text
0,On August 21 1826 a son was born to John Bon a...
1,On June 11 1813 a daughter was born to James D...
2,On January 1 1832 a son was born to his father...


In [17]:
# Pick the first sentence and parse it with spaCy.
# Note: this rebinds `text` from the list of sentences to a single string.
text = df.loc[0, 'text']

doc = nlp(text)

In [18]:
# Collect the text and coarse POS tag of every token in the parsed sentence.
features = [{'token': tok.text, 'pos': tok.pos_} for tok in doc]

In [19]:
# Tabulate the per-token features: one row per token, columns 'token' and 'pos'.
fdf = pd.DataFrame(features)
# head(len(fdf)) returns every row rather than only the default first five.
fdf.head(len(fdf))

Unnamed: 0,token,pos
0,On,ADP
1,August,PROPN
2,21,NUM
3,1826,NUM
4,a,DET
5,son,NOUN
6,was,AUX
7,born,VERB
8,to,ADP
9,John,PROPN


In [20]:
# Cue words that may appear immediately before / after the father's name.
first_tokens = ['to', 'father']
last_tokens = ['and', 'naming']

# Matcher pattern: a leading cue word, one or more proper nouns,
# then a trailing cue word.
pattern_father = [
    [
        {'LOWER': {'IN': first_tokens}},
        {'POS': 'PROPN', 'OP': '+'},
        {'LOWER': {'IN': last_tokens}},
    ]
]

In [23]:
from spacy.matcher import Matcher

def get_father(x):
    """Extract the father's given name and surname from a sentence.

    The sentence is parsed with the module-level ``nlp`` pipeline and searched
    with ``pattern_father`` (a cue word, one or more proper nouns, a closing
    cue word). Only the first match is used.

    Parameters
    ----------
    x : str
        Sentence to search.

    Returns
    -------
    tuple
        ``(name, surname)``. The surname is the last proper-noun token of the
        match and the name is every proper noun before it, joined by spaces.
        If the match holds a single proper noun it is returned as the name
        with ``None`` as surname. ``(None, None)`` is returned when the
        pattern does not match, instead of failing on tuple unpacking.
    """
    doc = nlp(x)
    matcher = Matcher(nlp.vocab)
    matcher.add("matching_father", pattern_father)
    matches = matcher(doc)
    if not matches:
        return None, None

    # Each match is (match_id, start, end); keep the first one.
    _, start, end = matches[0]
    # Drop the leading and trailing cue tokens and work on the tokens
    # themselves rather than re-splitting the span text on spaces.
    name_tokens = [token.text for token in doc[start + 1:end - 1]]
    if not name_tokens:
        return None, None
    if len(name_tokens) == 1:
        return name_tokens[0], None
    return ' '.join(name_tokens[:-1]), name_tokens[-1]

In [26]:
new_columns = ['father name', 'surname']
# Parse every sentence once; each element is the tuple returned by get_father.
# Calling get_father inside the loop would re-run the spaCy pipeline for every
# column.
father_parts = df['text'].apply(get_father)
for n, col in enumerate(new_columns):
    # Column n of the output takes element n of each tuple.
    df[col] = father_parts.apply(lambda parts: parts[n])

df

Unnamed: 0,text,father name,surname
0,On August 21 1826 a son was born to John Bon a...,John,Bon
1,On June 11 1813 a daughter was born to James D...,James,Donne
2,On January 1 1832 a son was born to his father...,David,Borne


In [28]:
# Cue words around the child's name. These rebind first_tokens / last_tokens
# to new lists for the child pattern.
first_tokens = ['him', 'her']
last_tokens = ['.']

# Matcher pattern: a pronoun cue, one or more proper nouns, then the closing
# full stop.
pattern_son = [
    [
        {'LOWER': {'IN': first_tokens}},
        {'POS': 'PROPN', 'OP': '+'},
        {'LOWER': {'IN': last_tokens}},
    ]
]

def get_child(x):
    """Return the child's name found in sentence ``x``.

    The sentence is parsed with ``nlp`` and searched with ``pattern_son``;
    only the first match is used. The matched text has its final character
    (the punctuation mark) removed and its first space-separated word (the
    cue word) dropped. An empty string is returned when nothing matches.
    """
    parsed = nlp(x)
    matcher = Matcher(nlp.vocab)
    matcher.add("matching_son", pattern_son)

    found = matcher(parsed)
    if not found:
        return ''

    _, start, end = found[0]
    # Strip the trailing punctuation character from the matched text.
    phrase = parsed[start:end].text[:-1]
    # The first word is the cue; everything after it is the name.
    _, *name_parts = phrase.split(' ')
    return ' '.join(name_parts)

# Add the child's name extracted from each sentence.
df['child'] = df['text'].apply(get_child)
df

Unnamed: 0,text,father name,surname,child
0,On August 21 1826 a son was born to John Bon a...,John,Bon,Francis
1,On June 11 1813 a daughter was born to James D...,James,Donne,Mary Sarah
2,On January 1 1832 a son was born to his father...,David,Borne,John


In [31]:
def get_date(x):
    """Build a ``YYYY-MM-DD`` date string from the start of a sentence.

    The sentence is expected to read ``<word> <Month> <day> <year> ...``,
    e.g. ``'On August 21 1826 a son ...'``: the second word is an English
    month name, the third the day and the fourth the year.

    Parameters
    ----------
    x : str
        Sentence to read the date from.

    Returns
    -------
    str
        The date as ``year-month-day`` with month and day zero-padded to two
        digits.

    Raises
    ------
    KeyError
        If the second word is not a capitalised English month name.
    IndexError
        If the sentence has fewer than four words.
    """
    months = {"January": "01", "February": "02", "March": "03", "April": "04",
              "May": "05", "June": "06", "July": "07", "August": "08",
              "September": "09", "October": "10", "November": "11",
              "December": "12"}
    # split() without an argument collapses runs of whitespace, so repeated
    # spaces do not shift the month/day/year positions.
    tokens = x.split()
    month = months[tokens[1]]
    # Tolerate a trailing comma or full stop ("21," / "1826,") on day and year.
    day = tokens[2].rstrip(",.").zfill(2)
    year = tokens[3].rstrip(",.")

    return year + "-" + month + "-" + day

# Add the ISO-style birth date parsed from each sentence.
df['date'] = df['text'].apply(get_date)
df

Unnamed: 0,text,father name,surname,child,date
0,On August 21 1826 a son was born to John Bon a...,John,Bon,Francis,1826-08-21
1,On June 11 1813 a daughter was born to James D...,James,Donne,Mary Sarah,1813-06-11
2,On January 1 1832 a son was born to his father...,David,Borne,John,1832-01-01


In [32]:
def get_gender(x):
    """Return ``'M'`` if the sentence mentions a son, otherwise ``'F'``.

    The word ``son`` must appear as a whole lowercase word. A plain substring
    test would also fire on names and words that merely contain those letters
    (``Johnson``, ``Mason``, ``person``) and mislabel a daughter's record.

    Parameters
    ----------
    x : str
        Sentence to inspect.

    Returns
    -------
    str
        ``'M'`` when the word ``son`` is present; ``'F'`` in every other
        case, including when no gender word is present at all.
    """
    import re

    if re.search(r'\bson\b', x):
        return 'M'
    return 'F'
# Add the gender flag derived from each sentence.
df['gender'] = df['text'].apply(get_gender)
df

Unnamed: 0,text,father name,surname,child,date,gender
0,On August 21 1826 a son was born to John Bon a...,John,Bon,Francis,1826-08-21,M
1,On June 11 1813 a daughter was born to James D...,James,Donne,Mary Sarah,1813-06-11,F
2,On January 1 1832 a son was born to his father...,David,Borne,John,1832-01-01,M
