In [1]:
# https://www.analyticsvidhya.com/blog/2019/09/introduction-information-extraction-python-spacy/
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

# Let pandas display up to 200 characters per cell so long text values are not elided.
pd.set_option('display.max_colwidth', 200)

In [2]:
# Load the en_core_web_lg pipeline.
# NOTE(review): `nlp` is rebound by several later cells (including to en_core_web_sm),
# so which model a given cell uses depends on execution order.
nlp = spacy.load("en_core_web_lg")

In [3]:
# Sample input: a short poem about 'liking' posts on Mastodon.
# NOTE(review): the next cell rebinds `text`, so in top-to-bottom order nothing reads this value.
text = """When a toot is posted, from any lowly soul
That's aiming for attention in the digital role
It may be overlooked or seen with ire
But 'liking' it can help build a pyre

For 'likes' are nothing but simple approval
A whispery wave of love and revulsion;
To keep conversations alive and flowing like wine
Extending threads at each person's own time.

With people we like our bond grows stronger still,
Richer than what cities or wealth can fill;
So empowering Mammut once again,
We make Mastodon an aiding friend.
"""

In [37]:
# Sample input: a multi-paragraph AP news excerpt, used as the long text that the
# post-splitting cell further down breaks into chunks.
text = """
WASHINGTON (AP) — The Senate passed bipartisan legislation Tuesday to protect same-sex marriages, an extraordinary sign of shifting national politics on the issue and a measure of relief for the hundreds of thousands of same-sex couples who have married since the Supreme Court’s 2015 decision that legalized gay marriage nationwide.

The bill, which would ensure that same-sex and interracial marriages are enshrined in federal law, was approved 61-36 on Tuesday, including support from 12 Republicans. Senate Majority Leader Chuck Schumer said the legislation was “a long time coming” and part of America’s “difficult but inexorable march towards greater equality.”

Democrats are moving quickly, while the party still holds the majority in both chambers of Congress. The legislation now moves to the House for a final vote.

President Joe Biden praised the bipartisan vote and said he will sign the bill “promptly and proudly” if it is passed by the House. He said it will ensure that LGBTQ youth “will grow up knowing that they, too, can lead full, happy lives and build families of their own.”

The bill has gained steady momentum since the Supreme Court’s June decision that overturned the federal right to an abortion, a ruling that included a concurring opinion from Justice Clarence Thomas that suggested same-sex marriage could also come under threat. Bipartisan Senate negotiations got a kick-start this summer when 47 Republicans unexpectedly voted for a House bill and gave supporters new optimism. """

In [38]:
# Total number of characters in `text`.
len(text)

1513

In [46]:
# Switch to the en_core_web_sm pipeline and append a 'sentencizer' component to it.
# NOTE(review): the following cell repeats exactly these two statements, so this cell
# looks redundant — confirm and keep only one copy.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7f3a5c74e180>

In [55]:
import spacy 
# Reload the pipeline inside this cell so that re-running the cell starts from a
# fresh `nlp` rather than adding 'sentencizer' to an already-modified pipeline.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('sentencizer')

# Greedily pack consecutive sentences of `text` into posts of at most `maxlen`
# characters (not counting the one-character emoji marker prepended to each post).
maxlen = 350
truncation_marker = '...😱'
posts = []
doc = nlp(text)
post = ""
for sent in [str(sent) for sent in doc.sents]:
    if len(sent) > maxlen:
        # edge case, just truncate it. Reserve room for the whole marker so the
        # result is exactly `maxlen` long (slicing off only 3 characters for a
        # 4-character marker produced maxlen + 1).
        sent = sent[:maxlen - len(truncation_marker)] + truncation_marker
    if len(post) + len(sent) < maxlen:
        # The strict '<' leaves room for the joining space.
        post = f"{post} {sent}"
    else:
        # Flush what has been collected so far. Skip the flush while `post` is
        # still empty (the very first sentence already fills a post), otherwise
        # an empty post consisting only of the marker would be emitted.
        if post:
            posts.append('💓' + post)
        post = f"{sent}"

# '🙀' marks the final, leftover post.
if post:
    posts.append('🙀' + post)

In [56]:
# Print all posts back to back; the emoji prefix of each post makes the boundaries visible.
print(''.join(posts))

💓 
WASHINGTON (AP) — The Senate passed bipartisan legislation Tuesday to protect same-sex marriages, an extraordinary sign of shifting national politics on the issue and a measure of relief for the hundreds of thousands of same-sex couples who have married since the Supreme Court’s 2015 decision that legalized gay marriage nationwide.💓

The bill, which would ensure that same-sex and interracial marriages are enshrined in federal law, was approved 61-36 on Tuesday, including support from 12 Republicans. Senate Majority Leader Chuck Schumer said the legislation was “a long time coming” and part of America’s “difficult but inexorable march towards greater equality.”💓

Democrats are moving quickly, while the party still holds the majority in both chambers of Congress. The legislation now moves to the House for a final vote. 

President Joe Biden praised the bipartisan vote and said he will sign the bill “promptly and proudly” if it is passed by the House.💓He said it will ensure that LGBTQ 

In [3]:
# sample text: a single sentence with two coordinated person names
text = "Anna and Rob are excited to be part of the Fediverse and meet people from all over the world who share their interests." 

# run the current `nlp` pipeline over the text to obtain a parsed spaCy Doc
doc = nlp(text)

In [4]:
# Keep at most the first two sentences of `doc`.
# NOTE(review): `foo` is not read by any cell visible in this export — presumably scratch.
foo = list(doc.sents)[:2]

In [5]:
# Join the text of up to the first three sentences of `doc` with single spaces.
' '.join([d.text for d in doc.sents][:3])

'Anna and Rob are excited to be part of the Fediverse and meet people from all over the world who share their interests.'

In [6]:
# Dump one line per token: text --> dependency label --> part-of-speech tag.
for tok in doc:
    print(f"{tok.text} --> {tok.dep_} --> {tok.pos_}")

Anna --> nsubj --> PROPN
and --> cc --> CCONJ
Rob --> conj --> PROPN
are --> ROOT --> AUX
excited --> acomp --> ADJ
to --> aux --> PART
be --> xcomp --> AUX
part --> attr --> NOUN
of --> prep --> ADP
the --> det --> DET
Fediverse --> pobj --> PROPN
and --> cc --> CCONJ
meet --> conj --> VERB
people --> dobj --> NOUN
from --> prep --> ADP
all --> advmod --> ADV
over --> prep --> ADP
the --> det --> DET
world --> pobj --> NOUN
who --> nsubj --> PRON
share --> relcl --> VERB
their --> poss --> PRON
interests --> dobj --> NOUN
. --> punct --> PUNCT


In [8]:
# Token patterns for spaCy's Matcher. Each one looks for a lexical cue
# ("such as", "and/or other", "including", "especially") sitting between nouns.
patterns =[
    # NOUN "such as" PROPN
    [{'POS':'NOUN'}, {'LOWER': 'such'}, {'LOWER': 'as'}, {'POS': 'PROPN'}],
    # same, with an optional leading token whose dependency label is amod
    [{'DEP':'amod', 'OP':"?"}, {'POS':'NOUN'}, {'LOWER': 'such'}, {'LOWER': 'as'}, {'POS': 'PROPN'}],
    # [amod] NOUN [and] [or] "other" NOUN — both conjunctions are individually optional
    [{'DEP':'amod', 'OP':"?"}, {'POS':'NOUN'}, {'LOWER': 'and', 'OP':"?"}, {'LOWER': 'or', 'OP':"?"}, {'LOWER': 'other'}, {'POS': 'NOUN'}],
    # [nummod] [amod] NOUN <punctuation> "including" [nummod] [amod] NOUN
    [{'DEP':'nummod','OP':"?"}, {'DEP':'amod','OP':"?"}, {'POS':'NOUN'}, {'IS_PUNCT': True}, {'LOWER': 'including'}, {'DEP':'nummod','OP':"?"}, {'DEP':'amod','OP':"?"}, {'POS':'NOUN'}],
    # [nummod] [amod] NOUN <punctuation> "especially" [nummod] [amod] NOUN
    [{'DEP':'nummod','OP':"?"}, {'DEP':'amod','OP':"?"}, {'POS':'NOUN'}, {'IS_PUNCT':True}, {'LOWER': 'especially'}, {'DEP':'nummod','OP':"?"}, {'DEP':'amod','OP':"?"}, {'POS':'NOUN'}],
]

# One example sentence per cue, in the same order as the cues above.
texts = [
    "GDP in developing countries such as Vietnam will continue growing at a high rate.",
    "Here is how you can keep your car and other vehicles clean.",
    "Eight people, including two children, were injured in the explosion",
    "A healthy eating pattern includes fruits, especially whole fruits."
]

In [9]:
# Build a Matcher on the pipeline's vocabulary and register every pattern under the
# single rule name "all", so a hit on any of them is reported under that one key.
matcher = Matcher(nlp.vocab) 
matcher.add("all", patterns) 

In [10]:
# For every example sentence, print the first matcher hit and draw its dependency parse.
for text in texts:
    doc = nlp(text)
    matches = matcher(doc)
    if not matches:
        # No pattern fired for this sentence; indexing matches[0] would raise
        # IndexError and abort the whole loop, so report it and move on.
        print(f"(no match) {text}")
        continue

    # Each match is a (match_id, start, end) tuple; start/end are token indices into doc.
    match_id, start, end = matches[0]
    span = doc[start:end]

    print(span.text)
    displacy.render(span, style='dep',jupyter=True)

developing countries such as Vietnam


car and other vehicles


Eight people, including two children


fruits, especially whole fruits


In [11]:
# Short sentence containing a contraction ("That's") and a negation.
text = "That's not funny." 

# Parse it and draw the dependency graph inline in the notebook
doc = nlp(text) 
displacy.render(doc, style='dep',jupyter=True)

In [52]:
# Parse a sentence that starts with a speaker prefix ("Anna:") and dump, for each
# token, its text --> dependency label --> part-of-speech tag.
text = "Anna: My nephew Roderick will be in town next week."
doc = nlp(text)

for tok in doc:
    print(f"{tok.text} --> {tok.dep_} --> {tok.pos_}")

Anna --> npadvmod --> PROPN
: --> punct --> PUNCT
My --> poss --> PRON
nephew --> nsubj --> NOUN
Roderick --> appos --> PROPN
will --> aux --> AUX
be --> ROOT --> AUX
in --> prep --> ADP
town --> pobj --> NOUN
next --> amod --> ADP
week --> npadvmod --> NOUN
. --> punct --> PUNCT


In [53]:
def subtree_matcher(doc):
  """Return ``(x, y)``: the last object token and the last passive-subject token.

  Args:
    doc: iterable of tokens exposing ``dep_`` (dependency label) and ``text``.

  Returns:
    A tuple of two strings. ``x`` is the text of the last token whose dependency
    label ends with "obj"; ``y`` is the text of the last token whose label
    contains "subjpass". Each is ``''`` when no such token exists.
  """
  x = ''
  y = ''

  # iterate through all the tokens in the input sentence
  for tok in doc:
    # extract passive subject. A plain substring test replaces the former
    # `find("subjpass") == True`, which only succeeded when the substring
    # happened to start at index 1 (because True == 1).
    if "subjpass" in tok.dep_:
      y = tok.text

    # extract object
    if tok.dep_.endswith("obj"):
      x = tok.text

  return x,y

In [54]:
# Short copular sentence: parse it and run subtree_matcher on the result.
text_2 = "Rob is hungry." 

doc_2 = nlp(text_2) 
subtree_matcher(doc_2)

('', '')

In [55]:
# NOTE(review): `doc` is whatever the most recently executed cell bound to that name;
# many cells rebind it, so this result depends on execution order.
subtree_matcher(doc)

('town', '')

In [56]:
# Sentence with a contraction and a negation: parse it and run subtree_matcher.
text_3 = "That's not funny." 
doc_3 = nlp(text_3) 
subtree_matcher(doc_3)

('', '')

In [57]:
def subtree_matcher(doc):
  """Extract a ``(x, y)`` pair of token texts, taking passive voice into account.

  This redefinition supersedes the simpler ``subtree_matcher`` defined earlier in
  the notebook.

  Args:
    doc: iterable of tokens exposing ``dep_`` (dependency label) and ``text``.

  Returns:
    A tuple of two strings; each is ``''`` when no matching token exists.

    * If any token's label contains "subjpass" the sentence is treated as
      passive: ``x`` is the last token whose label ends with "obj" and ``y`` is
      the last token whose label contains "subjpass".
    * Otherwise: ``x`` is the last token whose label ends with "subj" and ``y``
      is the last token whose label ends with "obj".
  """
  # Materialise once: the tokens are scanned twice, and a one-shot iterator
  # would otherwise be exhausted after the passive-voice check.
  tokens = list(doc)

  # A plain substring test replaces the former `find("subjpass") == True`, which
  # only succeeded when the substring started at index 1 (because True == 1).
  is_passive = any("subjpass" in tok.dep_ for tok in tokens)

  x = ''
  y = ''

  for tok in tokens:
    if is_passive:
      if "subjpass" in tok.dep_:
        y = tok.text
      if tok.dep_.endswith("obj"):
        x = tok.text
    else:
      if tok.dep_.endswith("subj"):
        x = tok.text
      if tok.dep_.endswith("obj"):
        y = tok.text

  return x,y

In [58]:
# Sentence with a coordinated subject ("Anna and Rob"): parse it and run subtree_matcher.
text_4 = "Anna and Rob are old friends." 
doc_4 = nlp(text_4) 
subtree_matcher(doc_4)

('Anna', '')

In [60]:
# For every parsed example: draw the parse, then print the sentence, the (x, y)
# pair from subtree_matcher and the first ancestor of each agent/dobj token,
# followed by the named entities on a separate line.
for d in (doc, doc_2, doc_3, doc_4):
    displacy.render(d)
    pair = subtree_matcher(d)
    heads = [next(tok.ancestors) for tok in d if tok.dep_ in ('agent', 'dobj')]
    print(d, pair, heads)
    print(d.ents)

Anna: My nephew Roderick will be in town next week. ('nephew', 'town') []
(Anna, Roderick, next week)


Rob is hungry. ('Rob', '') []
(Rob,)


That's not funny. ('That', '') []
()


Anna and Rob are old friends. ('Anna', '') []
(Anna, Rob)


In [61]:
# List the named entities recognised in doc_4, one per line.
for ent in doc_4.ents:
    print(ent)

Anna
Rob


In [62]:
# First ancestor of every token in doc_4 whose dependency label is 'agent' or 'dobj'.
[next(tok.ancestors) for tok in doc_4 if tok.dep_ in ['agent','dobj'] ]

[]

In [65]:
# Lemma of the token at index 3 of doc_4.
doc_4[3].lemma_

'be'

In [110]:
nlp = spacy.load("en_core_web_lg")

# Merge multi-token entities and noun chunks into single tokens, so that e.g.
# "next week" or "old friends" is reported as one unit below.
nlp.add_pipe("merge_entities")
nlp.add_pipe("merge_noun_chunks")

# Money-related example sentences; not used by the loop below, which iterates `lines`.
TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

# Sentences analysed below. Defined here so this cell also runs on a fresh kernel;
# it previously relied on `lines` being bound by a cell further down the notebook.
lines = [
    "Anna: My nephew Roderick will be in town next week.",
    "That's not funny.",
    "Anna and Rob are old friends.",
    "A hacker cat wearing a hoodie.",
    "Let's be friends."
]

for doc in nlp.pipe(lines):
    for token in doc:
        # Every token is reported. To restrict the report to money amounts,
        # guard the body with `if token.ent_type_ == "MONEY":`.
        print(token, token.ent_type_)
        # We have an attribute and direct object, so check for subject
        if token.dep_ in ("attr", "dobj"):
            subj = [w for w in token.head.lefts if w.dep_ == "nsubj"]
            if subj:
                print(subj[0], "-->", token)
        # We have a prepositional object with a preposition
        elif token.dep_ == "pobj" and token.head.dep_ == "prep":
            print(token.head.head, "-->", token)

Anna PERSON
: 
My nephew 
Roderick PERSON
will 
be 
in 
town 
be --> town
next week DATE
. 
That 
's 
not 
funny 
. 
Anna PERSON
and 
Rob PERSON
are 
old friends 
Anna --> old friends
. 
A hacker cat 
wearing 
a hoodie 
. 
Let 
's 
be 
friends 
's --> friends
. 


In [115]:
# Example sentences fed to the analysis loops; four of the five contain a form of
# "to be", the fourth ("A hacker cat wearing a hoodie.") does not.
lines = [
    "Anna: My nephew Roderick will be in town next week.",
    "That's not funny.",
    "Anna and Rob are old friends.",
    "A hacker cat wearing a hoodie.",
    "Let's be friends."
]

In [116]:
def eprime(doc):
    """Report every form of "to be" found in ``doc``.

    For each token whose lemma is 'be', the whole doc is rendered with displacy
    and the doc, the token and the lemmas of the doc's noun chunks are printed.
    A doc without such a token produces no output. Returns ``None``.
    """
    for tok in doc:
        if tok.lemma_ != 'be':
            continue
        displacy.render(doc)
        chunk_lemmas = [chunk.lemma_ for chunk in doc.noun_chunks]
        print(doc, '\n', tok, '\n', chunk_lemmas, '\n')
            

In [120]:
# Reload a fresh en_core_web_lg pipeline — presumably to drop the merge_entities /
# merge_noun_chunks components an earlier cell added to `nlp`.
nlp = spacy.load("en_core_web_lg")

In [121]:
# Parse each example sentence and report the forms of "to be" it contains.
for line in lines:
    eprime(nlp(line))


Anna: My nephew Roderick will be in town next week. 
 be 
 ['my nephew', 'Roderick', 'town'] 



That's not funny. 
 's 
 ['that'] 



Anna and Rob are old friends. 
 are 
 ['Anna', 'Rob', 'old friend'] 



Let's be friends. 
 be 
 ['us', 'friend'] 

