In [37]:
import spacy
from spacy.matcher import Matcher 
import pandas as pd
from spacy import displacy 
import visualise_spacy_tree
from IPython.display import Image, display

# Load the small English spaCy pipeline, with the 'textcat' component disabled.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['textcat'],
)

In [108]:
text = 'Gandhi transferred to Rollins College in Florida due to security threats following his father\'s, the late Prime Minister Rajiv Gandhi\'s, assassination'

# Run the spaCy pipeline over the sample sentence.
doc = nlp(text)

# Show every token next to its coarse-grained part-of-speech tag.
for tok in doc:
    print(f"{tok.text} -> {tok.pos_}")

Gandhi -> PROPN
transferred -> VERB
to -> ADP
Rollins -> PROPN
College -> PROPN
in -> ADP
Florida -> PROPN
due -> ADP
to -> ADP
security -> NOUN
threats -> NOUN
following -> VERB
his -> DET
father -> NOUN
's -> PART
, -> PUNCT
the -> DET
late -> ADJ
Prime -> PROPN
Minister -> PROPN
Rajiv -> PROPN
Gandhi -> PROPN
's -> PART
, -> PUNCT
assassination -> NOUN


In [99]:
# List every named entity found in `doc` together with its label and
# spaCy's human-readable explanation of that label.
for entity in doc.ents:
    print(entity)
    print(entity.label_)
    print(spacy.explain(entity.label_))
    # Separator between entities. This used to be print("n"), which emitted
    # a literal letter "n" instead of the intended newline escape.
    print("\n")

Gandhi
PERSON
People, including fictional
n
Rollins College
ORG
Companies, agencies, institutions, etc.
n
Florida
GPE
Countries, cities, states
n
Rajiv Gandhi
PERSON
People, including fictional
n


In [109]:
# Read the whole article and split it into rough "sentences" on every '.'.
# NOTE(review): a plain split on '.' also breaks at abbreviations and
# decimal points; spaCy's `doc.sents` would be more accurate if that matters.
with open("rahul.txt") as file:
    sentences = file.read().split('.')

# Build the frame in a single step: one row per fragment, in a 'line' column.
# (Passing a set as `columns=` is rejected by newer pandas, and assigning
# rows one at a time through .loc is needlessly slow.)
df = pd.DataFrame({'line': sentences})

df.head()

Unnamed: 0,line
0,Rahul Gandhi (About this soundpronunciation (h...
1,"A member of the Indian National Congress, he ..."
2,Gandhi is the chairperson of the Indian Youth...
3,"\n\nBorn in New Delhi, Gandhi spent his early ..."
4,He attained primary education in New Delhi an...


In [110]:
import re
def clean(text):
  """Normalise one raw text fragment before it is passed to spaCy.

  The input is converted with ``str()`` and a fixed sequence of regex
  substitutions is applied in order (order matters: newlines are turned
  into spaces before the bracket rule runs, so bracketed text that was
  split across lines is still removed).

  Parameters
  ----------
  text : object
      The fragment to clean; anything accepted by ``str()``.

  Returns
  -------
  str
      The cleaned fragment.
  """
  # Patterns are raw strings so that regex escapes such as `\.` and `\(`
  # reach the regex engine unchanged instead of being (invalid) Python
  # string escapes.
  substitutions = (
      # list numbering followed by a tab, e.g. "1.<TAB>"
      # NOTE(review): the '.' is unescaped, so ANY character before the tab
      # matches; presumably a literal dot was intended - confirm before
      # tightening it to `\.`.
      (r'[0-9]+.\t', ''),
      # newline followed by a space is dropped; a remaining newline becomes a space
      (r'\n ', ''),
      (r'\n', ' '),
      # drop possessive "'s"
      (r"'s", ''),
      # hyphens become spaces; an em dash followed by a space is dropped
      (r'-', ' '),
      (r'— ', ''),
      # drop double quotation marks
      (r'"', ''),
      # drop the trailing period of the salutations "Mr." and "Mrs."
      (r'Mr\.', 'Mr'),
      (r'Mrs\.', 'Mrs'),
      # drop anything enclosed in (...) or [...] (non-greedy), e.g. citations
      (r'[\(\[].*?[\)\]]', ''),
  )

  text = str(text)
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)

  return text

# Add a cleaned copy of every raw line in a new 'lineClean' column.
df['lineClean'] = df['line'].map(clean)


In [112]:
def find_names(text):
    """Extract a family relation and the related name from one sentence.

    The text is parsed with the module-level ``nlp`` pipeline and searched
    with a spaCy ``Matcher`` for this token sequence:

    * one or more relation words (``son``, ``mother``, ... matched on the
      lower-cased token text),
    * an optional adposition,
    * up to two optional proper nouns,
    * an optional punctuation token,
    * a proper noun whose dependency label is ``compound``.

    Each match is then split on whitespace. A match whose second piece is
    ``","`` is discarded unless its third piece is ``"Sonia"`` (a hard-coded
    exception). The result is built from the LAST surviving match.

    Parameters
    ----------
    text : str
        The sentence to search.

    Returns
    -------
    dict
        ``{'Relation type': <first piece>, 'Name': <last piece>}`` for the
        last surviving match, or ``{}`` when nothing survives.
    """
    # Relation words that may open a match.
    prog_list = ['son', 'mother', 'grandson', 'father', 'brother', 'daughter', 'sister']

    pattern = [
        {'LOWER': {'IN': prog_list}, 'OP': '+'},
        {'POS': 'ADP', 'OP': '?'},
        {'POS': 'PROPN', 'OP': '?'},
        {'POS': 'PROPN', 'OP': '?'},
        {'IS_PUNCT': True, 'OP': '?'},
        {'POS': 'PROPN', 'DEP': 'compound'},
    ]

    doc = nlp(text)

    matcher = Matcher(nlp.vocab)
    # Pattern-list signature: accepted by spaCy >= 2.2.2 and required by
    # spaCy 3, where the old `add(key, None, pattern)` form raises.
    matcher.add("names", [pattern])

    # Each match is (match_id, start, end); keep the text of the matched span.
    names = [doc[start:end].text for _, start, end in matcher(doc)]

    # Filter into a new list. The previous version called names.remove()
    # while iterating over `names`, which made the loop skip the element
    # following every removed one.
    kept = []
    for name in names:
        parts = name.split()
        if len(parts) < 2:
            # No separate relation word and name to report; previously this
            # case raised IndexError on parts[1].
            continue
        if len(parts) > 2 and parts[1] == ',' and parts[2] != "Sonia":
            continue
        kept.append(parts)

    if not kept:
        return {}

    last = kept[-1]
    return {'Relation type': last[0], 'Name': last[-1]}

# Run the relation extractor over every cleaned line; each cell of the new
# 'Relation' column holds the dict returned by find_names.
df['Relation'] = df['lineClean'].map(find_names)

In [113]:
# Display the frame: raw line, cleaned line and the extracted relation dict.
df

Unnamed: 0,line,lineClean,Relation
0,Rahul Gandhi (About this soundpronunciation (h...,Rahul Gandhi Hindustani pronunciation: ; born...,{}
1,"A member of the Indian National Congress, he ...","A member of the Indian National Congress, he ...",{}
2,Gandhi is the chairperson of the Indian Youth...,Gandhi is the chairperson of the Indian Youth...,{}
3,"\n\nBorn in New Delhi, Gandhi spent his early ...","Born in New Delhi, Gandhi spent his early ch...",{}
4,He attained primary education in New Delhi an...,He attained primary education in New Delhi an...,{}
...,...,...,...
192,\n\nAlso read: Hindu voters in UP don’t need e...,Also read: Hindu voters in UP don’t need ext...,{}
193,The Left has regularly characterised it as fa...,The Left has regularly characterised it as fa...,{}
194,"When asked about the Emergency, Rahul Gandhi ...","When asked about the Emergency, Rahul Gandhi ...",{}
195,’ He went on to differentiate between the two ...,’ He went on to differentiate between the two ...,{}


In [9]:
!pip install visualise-spacy-tree

Collecting visualise-spacy-tree
  Downloading visualise_spacy_tree-0.0.6-py3-none-any.whl (5.0 kB)
Collecting pydot==1.4.1
  Downloading pydot-1.4.1-py2.py3-none-any.whl (19 kB)
Installing collected packages: pydot, visualise-spacy-tree
  Attempting uninstall: pydot
    Found existing installation: pydot 1.3.0
    Uninstalling pydot-1.3.0:
      Successfully uninstalled pydot-1.3.0
Successfully installed pydot-1.4.1 visualise-spacy-tree-0.0.6
