In [145]:
# importing libraries
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

pd.set_option('display.max_colwidth', 200)

In [146]:
# loading spaCy model
nlp = spacy.load("en_core_web_sm")

# sample text
text = "GDP in developing countries such as Vietnam will continue growing at a high rate."

# spaCy object creation
doc = nlp(text)

In [147]:
for token in doc:
    print(token.text, "-->", token.dep_, "-->", token.pos_)

GDP --> nsubj --> PROPN
in --> prep --> ADP
developing --> amod --> VERB
countries --> pobj --> NOUN
such --> amod --> ADJ
as --> prep --> ADP
Vietnam --> pobj --> PROPN
will --> aux --> AUX
continue --> ROOT --> VERB
growing --> xcomp --> VERB
at --> prep --> ADP
a --> det --> DET
high --> amod --> ADJ
rate --> pobj --> NOUN
. --> punct --> PUNCT


In [148]:
spacy.explain('nsubj')

'nominal subject'

In [149]:
# defining Hearst pattern - Pattern: X such as Y
pattern = [{'POS':'NOUN'}, 
           {'LOWER': 'such'}, 
           {'LOWER': 'as'}, 
           {'POS': 'PROPN'}]

# Matcher class object
matcher = Matcher(nlp.vocab) 
matcher.add("matching_1", [pattern]) 

matches = matcher(doc) 
print(matches)
span = doc[matches[0][1]:matches[0][2]] 

print(span.text)

[(11840699188806025751, 3, 7)]
countries such as Vietnam


In [150]:
doc = nlp("Here is how you can keep your car and other vehicles clean.") 

# print dependency tags and POS tags
for tok in doc: 
  print(tok.text, "-->",tok.dep_, "-->",tok.pos_)

Here --> advmod --> ADV
is --> ROOT --> AUX
how --> advmod --> SCONJ
you --> nsubj --> PRON
can --> aux --> AUX
keep --> ccomp --> VERB
your --> poss --> PRON
car --> dobj --> NOUN
and --> cc --> CCONJ
other --> amod --> ADJ
vehicles --> conj --> NOUN
clean --> oprd --> ADJ
. --> punct --> PUNCT


In [151]:
# defining Hearst pattern - Pattern: X and/or Y
pattern = [{'DEP':'amod', 'OP':"?"}, 
           {'POS':'NOUN'}, 
           {'LOWER': 'and', 'OP':"?"}, 
           {'LOWER': 'or', 'OP':"?"}, 
           {'LOWER': 'other'}, 
           {'POS': 'NOUN'}] 

# Matcher class object
matcher.add("matching_2", [pattern]) 

matches = matcher(doc) 
print(matches)
span = doc[matches[0][1]:matches[0][2]] 

print(span.text)

[(10575872351855955709, 7, 11)]
car and other vehicles


In [152]:

doc = nlp("Eight people, including two children, were injured in the explosion") 

for tok in doc: 
  print(tok.text, "-->",tok.dep_, "-->",tok.pos_)

Eight --> nummod --> NUM
people --> nsubjpass --> NOUN
, --> punct --> PUNCT
including --> prep --> VERB
two --> nummod --> NUM
children --> pobj --> NOUN
, --> punct --> PUNCT
were --> auxpass --> AUX
injured --> ROOT --> VERB
in --> prep --> ADP
the --> det --> DET
explosion --> pobj --> NOUN


In [153]:
# Matcher class object 
matcher = Matcher(nlp.vocab) 

#define the pattern 
pattern = [{'DEP':'nummod','OP':"?"}, # numeric modifier 
           {'DEP':'amod','OP':"?"}, # adjectival modifier 
           {'POS':'NOUN'}, 
           {'IS_PUNCT': True}, 
           {'LOWER': 'including'}, 
           {'DEP':'nummod','OP':"?"}, 
           {'DEP':'amod','OP':"?"}, 
           {'POS':'NOUN'}] 
                               
matcher.add("matching_3", [pattern]) 

matches = matcher(doc) 
span = doc[matches[0][1]:matches[0][2]] 
print(span.text)

Eight people, including two children


In [154]:
# Pattern X, especially Y
doc = nlp("A healthy eating pattern includes fruits, especially whole fruits.") 

for tok in doc: 
  print(tok.text, tok.dep_, tok.pos_)

A det DET
healthy amod ADJ
eating compound NOUN
pattern nsubj NOUN
includes ROOT VERB
fruits dobj NOUN
, punct PUNCT
especially advmod ADV
whole amod ADJ
fruits appos NOUN
. punct PUNCT


In [155]:
spacy.explain('amod')

'adjectival modifier'

In [156]:
matcher = Matcher(nlp.vocab)

# define the pattern 
pattern = [{'DEP':'nummod','OP':"?"}, 
           {'DEP':'amod','OP':"?"}, 
           {'POS':'NOUN'}, 
           {'IS_PUNCT':True}, 
           {'LOWER': 'especially'}, 
           {'DEP':'nummod','OP':"?"}, 
           {'DEP':'amod','OP':"?"}, 
           {'POS':'NOUN'}] 

matcher.add('X_especially_Y', [pattern])

matches = matcher(doc)
span = doc[matches[0][1]:matches[0][2]]
print(span.text)

fruits, especially whole fruits


In [157]:
# dependency tree


text = "Tableau was recently acquired by Salesforce." 

# Plot the dependency graph 
doc = nlp(text) 
displacy.render(doc, style='dep',jupyter=True)

In [158]:
doc = nlp(text)
for token in doc: 
  print(token.text,"-->",token.dep_,"-->",token.pos_)

Tableau --> nsubjpass --> PROPN
was --> auxpass --> AUX
recently --> advmod --> ADV
acquired --> ROOT --> VERB
by --> agent --> ADP
Salesforce --> pobj --> PROPN
. --> punct --> PUNCT


In [159]:
def subtree_matcher(doc):
    subjpass = 0
    
    for i, token in enumerate(doc):
        # find dependency tag which contains "subjpass"
        if token.dep_.find('subjpass') == True:
            subjpass = 1
    x = ''
    y = ''
    
    # if subjpass == 1, sentence == passive
    if subjpass == 1:
        for i,token in enumerate(doc):
            if token.dep_.find("subjpass") == True:
                y = token.text
            if token.dep_.endswith("obj") == True:
                x = token.text
    # else sentence might be active
    else:
        for i, token in enumerate(doc):
            if token.dep_.endswith("subj") == True:
                x = token.text
            if token.dep_.endswith("obj") == True:
                y = token.text
    return x,y

In [160]:
subtree_matcher(doc)

('Salesforce', 'Tableau')

In [161]:
text_2 = "Careem, a ride hailing major in middle east, was acquired by Uber." 
doc_2 = nlp(text_2)
subtree_matcher(doc_2)

('Uber', 'Careem')

In [162]:
text_3 = "Salesforce recently acquired Tableau." 
doc_3 = nlp(text_3)
subtree_matcher(doc_3)

('Salesforce', 'Tableau')

In [163]:
for tok in doc_3:    
  print(tok.text, "-->",tok.dep_, "-->",tok.pos_)

Salesforce --> nsubj --> PROPN
recently --> advmod --> ADV
acquired --> ROOT --> VERB
Tableau --> dobj --> PROPN
. --> punct --> PUNCT


In [164]:
for tok in doc_2:    
  print(tok.text, "-->",tok.dep_, "-->",tok.pos_)

Careem --> nsubjpass --> PROPN
, --> punct --> PUNCT
a --> det --> DET
ride --> appos --> NOUN
hailing --> acl --> VERB
major --> dobj --> NOUN
in --> prep --> ADP
middle --> compound --> PROPN
east --> pobj --> PROPN
, --> punct --> PUNCT
was --> auxpass --> AUX
acquired --> ROOT --> VERB
by --> agent --> ADP
Uber --> pobj --> PROPN
. --> punct --> PUNCT


# IE using spaCy - Nobel Prize Winners practice

In [165]:
txt = 'On February 12 1809 Nobel Prize winner Charles Darwin was born in UK. He was a naturalist, geologist and biologist.\nOn November 20 1889 Nobel Prize winner Edwin Hubble was born in US. He was an astronomer.\nOn November 7 1867 Nobel Prize winner Marie Curie was born in Poland. She was a physicist and chemist.\nOn January 8 1942 Nobel Prize winner Stephen Hawking was born in UK. He was a physicist and cosmologist.\nOn January 23 1918 Nobel Prize winner Gertrude Elion was born in US. She was a biochemist and pharmacologist.'
data = txt.split('\n')

In [166]:
df = pd.DataFrame(data)
df = df.rename(columns={0:'text'})
df

Unnamed: 0,text
0,"On February 12 1809 Nobel Prize winner Charles Darwin was born in UK. He was a naturalist, geologist and biologist."
1,On November 20 1889 Nobel Prize winner Edwin Hubble was born in US. He was an astronomer.
2,On November 7 1867 Nobel Prize winner Marie Curie was born in Poland. She was a physicist and chemist.
3,On January 8 1942 Nobel Prize winner Stephen Hawking was born in UK. He was a physicist and cosmologist.
4,On January 23 1918 Nobel Prize winner Gertrude Elion was born in US. She was a biochemist and pharmacologist.


In [167]:
import spacy
import en_core_web_lg
text = df['text'][0]
nlp = spacy.load('en_core_web_lg')
doc = nlp(text)
features = []

# extracting pos
for token in doc:
    features.append({'token':token.text, 'pos':token.pos_})

# visualisation
visual = pd.DataFrame(features)
visual

Unnamed: 0,token,pos
0,On,ADP
1,February,PROPN
2,12,NUM
3,1809,NUM
4,Nobel,PROPN
5,Prize,PROPN
6,winner,NOUN
7,Charles,PROPN
8,Darwin,PROPN
9,was,AUX


In [168]:
# extracting names
first_tokens = ['winner', 'name']
last_tokens = ['was', 'born']

winner_pattern = [[{'LOWER' : {'IN' : first_tokens}},  #preceding words  without case matching
           {'POS':'PROPN', 'OP' : '+'},                #searching for PROPN  one or more times
           {'LOWER': {'IN' : last_tokens}} ]]          #following words  without case matching

def get_winner(x):
    nlp = spacy.load('en_core_web_lg')
    doc = nlp(x)
    matcher = Matcher(nlp.vocab)
    matcher.add("matching_winner", winner_pattern)
    matches = matcher(doc)
    sub_text = ''
    if (len(matches) > 0):
        span = doc[matches[0][1]:matches[0][2]]
        sub_text = span.text
    tokens = sub_text.split(' ')
    name, surname = tokens[1:-1]
    return name, surname

In [169]:
get_winner(df['text'][0])

('Charles', 'Darwin')

In [170]:
new_columns = ['scientist name', 'surname']
for n,col in enumerate(new_columns):
    df[col] = df['text'].apply(lambda x: get_winner(x)).apply(lambda x: x[n])    
df

Unnamed: 0,text,scientist name,surname
0,"On February 12 1809 Nobel Prize winner Charles Darwin was born in UK. He was a naturalist, geologist and biologist.",Charles,Darwin
1,On November 20 1889 Nobel Prize winner Edwin Hubble was born in US. He was an astronomer.,Edwin,Hubble
2,On November 7 1867 Nobel Prize winner Marie Curie was born in Poland. She was a physicist and chemist.,Marie,Curie
3,On January 8 1942 Nobel Prize winner Stephen Hawking was born in UK. He was a physicist and cosmologist.,Stephen,Hawking
4,On January 23 1918 Nobel Prize winner Gertrude Elion was born in US. She was a biochemist and pharmacologist.,Gertrude,Elion


In [171]:
# extracting birthplace
first_tokens = ['in']
last_tokens = ['.']

country_pattern = [[{'LOWER' : {'IN' : first_tokens}},  #preceding words  without case matching
           {'POS':'PROPN', 'OP' : '+'},                #searching for PROPN  one or more times
           {'LOWER': {'IN' : last_tokens}} ]]          #following words  without case matching

def get_country(x):
    nlp = spacy.load('en_core_web_lg')
    doc = nlp(x)
    matcher = Matcher(nlp.vocab)
    matcher.add("matching_country", country_pattern)
    matches = matcher(doc)
    sub_text = ''
    if (len(matches) > 0):
        span = doc[matches[0][1]:matches[0][2]]
        sub_text = span.text
    sub_text = sub_text[:-1]    
    tokens = sub_text.split(' ')
    return ' '.join(tokens[1:])

In [172]:
new_columns = ['country']
for n,col in enumerate(new_columns):
    df[col] = df['text'].apply(lambda x: get_country(x))    
df

Unnamed: 0,text,scientist name,surname,country
0,"On February 12 1809 Nobel Prize winner Charles Darwin was born in UK. He was a naturalist, geologist and biologist.",Charles,Darwin,UK
1,On November 20 1889 Nobel Prize winner Edwin Hubble was born in US. He was an astronomer.,Edwin,Hubble,US
2,On November 7 1867 Nobel Prize winner Marie Curie was born in Poland. She was a physicist and chemist.,Marie,Curie,Poland
3,On January 8 1942 Nobel Prize winner Stephen Hawking was born in UK. He was a physicist and cosmologist.,Stephen,Hawking,UK
4,On January 23 1918 Nobel Prize winner Gertrude Elion was born in US. She was a biochemist and pharmacologist.,Gertrude,Elion,US


In [173]:
visual

Unnamed: 0,token,pos
0,On,ADP
1,February,PROPN
2,12,NUM
3,1809,NUM
4,Nobel,PROPN
5,Prize,PROPN
6,winner,NOUN
7,Charles,PROPN
8,Darwin,PROPN
9,was,AUX


In [174]:
def get_date(x):
    months={"January":"01","February":"02","March":"03","April":"04",   "May":"05","June":"06","July":"07","August":"08","September":"09", "October":"10", "November":"11","December":"12",}
    tokens = x.split(" ")
    print(tokens)
    # month
    month = months[tokens[1]]
    # day
    day=tokens[2]
    if(len(day)==1):
        day="0"+day
    
    # year
    year = x.split(" ")[3]
    
    return (year+"-"+month+"-"+day)
print(get_date(df['text'][0]))

['On', 'February', '12', '1809', 'Nobel', 'Prize', 'winner', 'Charles', 'Darwin', 'was', 'born', 'in', 'UK.', 'He', 'was', 'a', 'naturalist,', 'geologist', 'and', 'biologist.']
1809-02-12


In [175]:
df['birthdate'] = df['text'].apply(lambda x: get_date(x))
df

['On', 'February', '12', '1809', 'Nobel', 'Prize', 'winner', 'Charles', 'Darwin', 'was', 'born', 'in', 'UK.', 'He', 'was', 'a', 'naturalist,', 'geologist', 'and', 'biologist.']
['On', 'November', '20', '1889', 'Nobel', 'Prize', 'winner', 'Edwin', 'Hubble', 'was', 'born', 'in', 'US.', 'He', 'was', 'an', 'astronomer.']
['On', 'November', '7', '1867', 'Nobel', 'Prize', 'winner', 'Marie', 'Curie', 'was', 'born', 'in', 'Poland.', 'She', 'was', 'a', 'physicist', 'and', 'chemist.']
['On', 'January', '8', '1942', 'Nobel', 'Prize', 'winner', 'Stephen', 'Hawking', 'was', 'born', 'in', 'UK.', 'He', 'was', 'a', 'physicist', 'and', 'cosmologist.']
['On', 'January', '23', '1918', 'Nobel', 'Prize', 'winner', 'Gertrude', 'Elion', 'was', 'born', 'in', 'US.', 'She', 'was', 'a', 'biochemist', 'and', 'pharmacologist.']


Unnamed: 0,text,scientist name,surname,country,birthdate
0,"On February 12 1809 Nobel Prize winner Charles Darwin was born in UK. He was a naturalist, geologist and biologist.",Charles,Darwin,UK,1809-02-12
1,On November 20 1889 Nobel Prize winner Edwin Hubble was born in US. He was an astronomer.,Edwin,Hubble,US,1889-11-20
2,On November 7 1867 Nobel Prize winner Marie Curie was born in Poland. She was a physicist and chemist.,Marie,Curie,Poland,1867-11-07
3,On January 8 1942 Nobel Prize winner Stephen Hawking was born in UK. He was a physicist and cosmologist.,Stephen,Hawking,UK,1942-01-08
4,On January 23 1918 Nobel Prize winner Gertrude Elion was born in US. She was a biochemist and pharmacologist.,Gertrude,Elion,US,1918-01-23
