# Information Extraction (NLP Project)

From Chapter 4 of Getting Started with Natural Language Processing (2022, Kochmar)

# 0. Practice

In [4]:
import spacy

# initialize nlp pipeline from spacy
nlp = spacy.load("en_core_web_sm")

# input text to doc
doc = nlp("On Friday board members meet with senior managers " +
         "to discuss future development of the company.")

def tabularize_tokens(doc):
    # start to construct table output, add headers to columns
    rows = []
    rows.append(["Word", "Position", "Lowercase", "Lemma", "POS", 
                 "Alphanumeric", "Stopword"])

    # grab attributes of tokens in doc
    for token in doc:
        rows.append([token.text, str(token.i), token.lower_, token.lemma_, 
                     token.pos_, str(token.is_alpha), str(token.is_stop)])

    # make columns out of rows and set column widths
    columns = zip(*rows)
    column_widths = [max(len(item) for item in col)
                    for col in columns]

    # print tokens and attributes into table
    for row in rows:
        print(''.join(' {:{width}} '.format(
            row[i], width = column_widths[i]) 
                      for i in range(0, len(row))))

tabularize_tokens(doc)

 Word         Position  Lowercase    Lemma        POS    Alphanumeric  Stopword 
 On           0         on           on           ADP    True          True     
 Friday       1         friday       Friday       PROPN  True          False    
 board        2         board        board        NOUN   True          False    
 members      3         members      member       NOUN   True          False    
 meet         4         meet         meet         VERB   True          False    
 with         5         with         with         ADP    True          True     
 senior       6         senior       senior       ADJ    True          False    
 managers     7         managers     manager      NOUN   True          False    
 to           8         to           to           PART   True          True     
 discuss      9         discuss      discuss      VERB   True          False    
 future       10        future       future       ADJ    True          False    
 development  11        deve

In [6]:
# trying above code on jabberwocky by lewis carroll

# input text to doc
doc2 = nlp("Beware the Jabberwock, my son! " +
         "The jaws that bite, the claws that catch! " +
         "Beware the Jubjub bird, and shun " +
         "The frumious Bandersnatch!")

tabularize_tokens(doc2)

 Word          Position  Lowercase     Lemma         POS    Alphanumeric  Stopword 
 Beware        0         beware        beware        VERB   True          False    
 the           1         the           the           DET    True          True     
 Jabberwock    2         jabberwock    Jabberwock    PROPN  True          False    
 ,             3         ,             ,             PUNCT  False         False    
 my            4         my            my            PRON   True          True     
 son           5         son           son           NOUN   True          False    
 !             6         !             !             PUNCT  False         False    
 The           7         the           the           DET    True          True     
 jaws          8         jaws          jaw           NOUN   True          False    
 that          9         that          that          PRON   True          True     
 bite          10        bite          bite          VERB   True          Fa

In [13]:
# practice with spacy's parser

doc3 = nlp("On Friday, board members meet with senior managers " +
          "to discuss future development of the company.")
    
def tabularize_with_parser(doc):
    # start to construct table output, add headers to columns
    rows = []
    rows.append(["Chunk Text", "Root", "Dependency", "Head"])

    # grab attributes of tokens in doc
    for chunk in doc.noun_chunks:
        rows.append([chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text])

    # make columns out of rows and set column widths
    columns = zip(*rows)
    column_widths = [max(len(item) for item in col)
                    for col in columns]

    # print tokens and attributes into table
    for row in rows:
        print(''.join(' {:{width}} '.format(
            row[i], width = column_widths[i]) 
                      for i in range(0, len(row))))

tabularize_with_parser(doc3)

 Chunk Text          Root         Dependency  Head    
 Friday              Friday       pobj        On      
 board members       members      nsubj       meet    
 senior managers     managers     nsubj       discuss 
 future development  development  dobj        discuss 
 the company         company      pobj        of      


In [14]:
# visualize dependencies with displacy
# more info: spacy.io/usage/visualizers

# this version stores the output to an external file

from spacy import displacy
from pathlib import Path

svg = displacy.render(doc, style = 'dep', jupyter = False)
file_name = '-'.join([w.text for w in doc if not w.is_punct]) + ".svg"
output_path = Path(file_name)
output_path.open("w", encoding = "utf-8").write(svg)

12222

In [15]:
# this version displays output in the jupyter notebook

from spacy import displacy

svg = displacy.render(doc, style = 'dep', jupyter = True)

In [21]:
# print info about head and dependents for each word in sentence

def tabularize_sentence(doc):
    # start to construct table output, add headers to columns
    rows = []
    rows.append(["Token Text", "Dependency", "Head", "Head POS", "Children"])

    # grab attributes of tokens in doc
    for token in doc:
        rows.append([token.text, token.dep_, token.head.text,
          token.head.pos_, str([child for child in token.children])])

    # make columns out of rows and set column widths
    columns = zip(*rows)
    column_widths = [max(len(item) for item in col)
                    for col in columns]

    # print tokens and attributes into table
    for row in rows:
        print(''.join(' {:{width}} '.format(
            row[i], width = column_widths[i]) 
                      for i in range(0, len(row))))
# for token in doc:
#     print(token.text, token.dep_, token.head.text, token.head.pos_,
#          [child for child in token.children])

tabularize_sentence(doc3)

 Token Text   Dependency  Head         Head POS  Children                    
 On           prep        meet         VERB      [Friday]                    
 Friday       pobj        On           ADP       []                          
 ,            punct       meet         VERB      []                          
 board        compound    members      NOUN      []                          
 members      nsubj       meet         VERB      [board]                     
 meet         ROOT        meet         VERB      [On, ,, members, with, .]   
 with         prep        meet         VERB      [discuss]                   
 senior       amod        managers     NOUN      []                          
 managers     nsubj       discuss      VERB      [senior]                    
 to           aux         discuss      VERB      []                          
 discuss      pcomp       with         ADP       [managers, to, development] 
 future       amod        development  NOUN      []             

# 1. Information Extractor Code

In [23]:
sentences = ["On Friday, board members meet with sernior managers " +
            "to discuss future development of the company.",
            "Boris Johnson met with the Queen last week.",
            "Donald Trump meets the Queen at Buckingham Palace.",
            "The two leaders also posed for photographs and " +
            "the President talked to reporters."]

def extract_information(doc):
    action = ""
    participant1 = ""
    participant2 = ""
    for token in doc:
        if (token.lemma_ == 'meet' and token.pos_ == 'VERB'
           and token.dep_ == 'ROOT'):
            action = token.text
            children = [child for child in token.children]
            for child1 in children:
                if child1.dep_ == 'nsubj':
                    participant1 = " ".join([attr.text for attr in child1.children]
                                           ) + " " + child1.text
                elif child1.text == 'with':
                    action += " " + child1.text
                    child1_children = [child for child in child1.children]
                    for child2 in child1_children:
                        if (child2.pos_ == 'NOUN' or child2.pos_ == 'PROPN'):
                            participant2 = " ".join([attr.text for attr in child2.children]
                                                   ) + " " + child2.text
                elif (child1.dep_ == 'dobj' and (child1.pos_ == 'NOUN' or child1.pos_ == 'PROPN')):
                    participant2 = " ".join([attr.text for attr in child1.children]
                                           ) + " " + child1.text
    print(f"Participant1 = {participant1}")
    print(f"Action = {action}")
    print(f"Participant2 = {participant2}")

for sent in sentences:
    print(f"\nSentence = {sent}")
    doc = nlp(sent)
    extract_information(doc)


Sentence = On Friday, board members meet with sernior managers to discuss future development of the company.
Participant1 = board members
Action = meet with
Participant2 = sernior managers

Sentence = Boris Johnson met with the Queen last week.
Participant1 = Boris Johnson
Action = met with
Participant2 = the Queen

Sentence = Donald Trump meets the Queen at Buckingham Palace.
Participant1 = Donald Trump
Action = meets
Participant2 = the at Queen

Sentence = The two leaders also posed for photographs and the President talked to reporters.
Participant1 = 
Action = 
Participant2 = 
