# Tokenizing the text

In [None]:
import spacy, pandas, networkx, matplotlib.pyplot as plt

In [None]:
from spacy.matcher import Matcher
from operator import itemgetter

In [None]:
from collections import Counter

### Loading model and reading the document


In [None]:
# Load the 'en_core_web_sm' spaCy pipeline once; later cells call `nlp(...)`
# to parse text and use `nlp.vocab` to build a Matcher.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run for multiple files: concatenate the contents of 1.txt ... 19.txt
# (range(1, 20) stops before 20) into one string, in numeric order.
chunks = []
for i in range(1, 20):
    file_name = f"{i}.txt"
    # The context manager closes each handle as soon as it has been read,
    # instead of leaving open files for the garbage collector.
    with open(file_name) as fh:
        chunks.append(fh.read())
# Join once at the end rather than growing a string with repeated `+=`.
inputtext = "".join(chunks)

In [None]:
# NOTE(review): this bare expression is not the last statement of the cell,
# so presumably nothing is displayed for it — confirm whether it can be dropped.
inputtext
# Parse the concatenated text with the loaded spaCy pipeline.
input_doc = nlp(inputtext)

In [None]:
# Run for a single file: read one document and parse it on its own.
# Note that this rebinds `input_doc`, replacing any previously parsed text.
file_name = '15.txt'
# The context manager guarantees the file handle is closed after reading.
with open(file_name) as fh:
    input_text = fh.read()
input_doc = nlp(input_text)

### Sentence Segmentation

In [None]:
# Materialise the sentence spans of the parsed document into a list so they
# can be iterated several times by the cells below.
sentences = [*input_doc.sents]

In [None]:
# Echo the list of sentence spans as the cell output.
sentences

### Visualizing dependencies and entities in text

In [None]:
from spacy import displacy

# Draw the dependency parse of every sentence inline in the notebook.
# NOTE(review): "ents" looks like an entity-visualiser option — confirm it
# has any effect when rendering with style='dep'.
options = {
    "compact": True,
    "fine_grained": True,
    "ents": ["EVENT", "PERSON", "LOC", "ORDINAL", "CARDINAL", "ORG"],
}
displacy.render(sentences, style="dep", options=options, jupyter=True)


In [None]:
# Highlight the named entities of every sentence inline in the notebook.
displacy.render(sentences, jupyter=True, style="ent")

### Extracting entities from document

In [None]:
def getEntities(sentence):
    """Extract a [subject, object] entity pair from one sentence.

    The sentence is parsed with the module-level ``nlp`` pipeline and its
    non-punctuation tokens are scanned left to right.  ``compound`` tokens
    and tokens whose dependency label ends in ``mod`` are remembered and
    prepended to the next subject / object token that is found.

    Args:
        sentence: Raw sentence text.

    Returns:
        A two-element list ``[subject, object]``.  An element is the empty
        string when no token with a matching dependency label was seen; when
        several tokens match, the last one wins.
    """
    ent1 = ""
    ent2 = ""
    prev_dep = ""
    prev_token = ""
    prefix = ""
    modifier = ""

    for tok in nlp(sentence):
        if tok.is_punct:
            continue

        dep = tok.dep_

        if dep == "compound":
            # Two consecutive compounds are glued together ("New" + "York").
            prefix = tok.text
            if prev_dep == "compound":
                prefix = prev_token + " " + tok.text

        if dep.endswith("mod"):
            modifier = tok.text
            if prev_dep == "compound":
                modifier = prev_token + " " + tok.text

        # Substring membership instead of `find(...) == True`: the latter only
        # matched when the substring happened to sit at index 1 ("nsubj",
        # "dobj") and silently skipped a label such as plain "obj".
        if "subj" in dep:
            # Join only the non-empty parts so an absent modifier/prefix does
            # not leave a double space inside the entity name.
            ent1 = " ".join(part for part in (modifier, prefix, tok.text) if part)
            # Start collecting afresh for the object that may follow.
            prefix = ""
            modifier = ""

        if "obj" in dep:
            # prefix/modifier are intentionally not reset here (unchanged
            # from the original behaviour).
            ent2 = " ".join(part for part in (modifier, prefix, tok.text) if part)

        # Remember this token for compound chaining on the next iteration.
        prev_dep = dep
        prev_token = tok.text

    return [ent1.strip(), ent2.strip()]

In [None]:
# Sanity check: run the extractor on a short subject-verb-object sentence
# and inspect the returned [subject, object] pair.
getEntities("I watched a film")

In [None]:
# One [subject, object] pair per sentence, in document order.  Written as a
# comprehension to match how `relations` is built from the same sentences.
entity_pairs = [getEntities(str(s)) for s in sentences]

In [None]:
# Echo the extracted [subject, object] pairs as the cell output.
entity_pairs

### Extracting relations

In [None]:
def getRelation(sentence):
    """Return the relation (predicate) phrase of one sentence.

    The sentence is parsed with the module-level ``nlp`` pipeline and matched
    against a single token pattern: the dependency ROOT, optionally followed
    by a ``prep`` token, an ``agent`` token and an adjective.

    Args:
        sentence: Raw sentence text.

    Returns:
        The text of the last match reported by the matcher, or an empty
        string when the matcher finds nothing (e.g. an empty sentence).
    """
    doc = nlp(sentence)
    matcher = Matcher(nlp.vocab)

    pattern = [
        {'DEP': 'ROOT'},
        {'DEP': 'prep', 'OP': "?"},
        {'DEP': 'agent', 'OP': "?"},
        {'POS': 'ADJ', 'OP': "?"},
    ]

    # Patterns are passed as a list of patterns: the older
    # `add(key, None, pattern)` call form is rejected by spaCy v3, while this
    # form is accepted by v3 and by v2 releases from 2.2.2 onwards.
    matcher.add("matching_1", [pattern])
    matches = matcher(doc)
    if not matches:
        # Previously this fell through to matches[-1] and raised IndexError.
        return ""

    # Each match is a (match_id, start, end) triple; keep the last one.
    _, start, end = matches[-1]
    return doc[start:end].text

In [None]:
# Sanity check: run the relation extractor on a short sentence and inspect
# the returned phrase.
getRelation("I watched a film")

In [None]:
# One relation phrase per sentence, aligned index-for-index with `sentences`.
relations = [getRelation(str(sent)) for sent in sentences]

In [None]:
# Echo the extracted relation phrases as the cell output.
relations

In [None]:
# Tally how often each relation phrase occurs, to eyeball the common ones.
relation_counts = Counter(relations)
print(relation_counts)

# Graph Construction

In [None]:
# Build the edge table: lower-cased subject -> lower-cased object, labelled
# with the relation phrase extracted from the same sentence.
source = [pair[0].lower() for pair in entity_pairs]
target = [pair[1].lower() for pair in entity_pairs]

edge_columns = {
    'source': source,
    'target': target,
    'edge': relations,
}
s_graph = pandas.DataFrame(edge_columns)

In [None]:
# Turn the edge table into a directed multigraph; edge_attr=True carries the
# remaining columns over as edge attributes.
G = networkx.from_pandas_edgelist(
    s_graph,
    source="source",
    target="target",
    edge_attr=True,
    create_using=networkx.MultiDiGraph(),
)

In [None]:
# Draw the graph on a 12x12 figure using a spring layout.
plt.figure(figsize=(12, 12))

pos = networkx.spring_layout(G, k=0.5)
networkx.draw(
    G,
    pos=pos,
    with_labels=True,
    node_color='skyblue',
    node_size=1500,
    edge_cmap=plt.cm.Blues,
)
plt.show()

In [None]:
# Eigenvector centrality is computed on a plain DiGraph copy of G; the
# scores are then stored back on G's nodes under 'eigenvector'.
G2 = networkx.DiGraph(G)
eigenvector_dict = networkx.eigenvector_centrality(G2, max_iter=1500)
networkx.set_node_attributes(G, eigenvector_dict, name='eigenvector')

# Print (node, score) pairs from the highest score to the lowest.
sorted_deg = sorted(
    eigenvector_dict.items(),
    key=lambda item: item[1],
    reverse=True,
)
for d in sorted_deg:
    print(d)

In [None]:
# Map every node to its degree and store it on the node under 'degree'.
degree_dict = {node: deg for node, deg in G.degree(G.nodes())}
networkx.set_node_attributes(G, degree_dict, name='degree')


In [None]:
# (node, degree) pairs ordered from the highest degree to the lowest.
sorted_degree = sorted(
    degree_dict.items(),
    key=lambda item: item[1],
    reverse=True,
)

In [None]:
# Print each (node, degree) pair on its own line, in the sorted order.
for d in sorted_degree:
    print(d)