In [53]:
import os
import urllib3
import spacy
import random

from elasticsearch import Elasticsearch

urllib3.disable_warnings()

BOT_NAME=os.environ['BOT_NAME']

ELASTIC_USER = os.environ['ELASTIC_USER']
ELASTIC_URL = os.environ['ELASTIC_URL']
ELASTIC_KEY = os.environ.get('ELASTIC_KEY', None)
ELASTIC_INDEX = os.environ.get('ELASTIC_INDEX', 'anna-summaries-v0')

In [2]:
es = Elasticsearch([ELASTIC_URL], basic_auth=(ELASTIC_USER, ELASTIC_KEY), request_timeout=30)

In [3]:
result = es.search(
    index=ELASTIC_INDEX,
    query={"match_all":{}},
)

In [4]:
ts = result['hits']['hits'][0]['_source']['@timestamp']

In [5]:
ts

'2022-09-24T10:36:29.170507-07:00'

In [14]:
topics = es.search(
    index=ELASTIC_INDEX,
    query={"match_all":{}},
    aggs={
        "topics": {
            "terms": {
                "field": "keywords.keyword",
                "size": 1000
            }
        }
    }
)

In [28]:
[t['key'] for t in topics['aggregations']['topics']['buckets'] if t['doc_count'] < 3]

['',
 'all the trimmings',
 'anna #albrechtdürer',
 'artificialintelligence humanintelligence uzbekistan',
 'consistency',
 'emotions',
 'erdős number',
 'ervinggoffman robots pixar philipkdick',
 'frustration',
 'galaxy',
 'goals',
 'gorilla',
 'hades',
 'iphone #artificial intelligence',
 'light switch',
 'light switch code',
 'limbic system',
 'long gone day',
 'love',
 'mental health',
 'painting',
 'period dress',
 'poverty',
 'product photo',
 'refunds',
 'relationships',
 'robots',
 'science',
 'self-awareness',
 'sociology',
 'stability',
 'test',
 'the united states',
 'time',
 'totoro forest catbus',
 'trust',
 'vr',
 'water main break',
 '(tag',
 ', experience',
 ', rob, concept',
 ', spooky noises',
 'Linguistics',
 'NASA',
 '[basic',
 '[james webb space telescope',
 '[nasa',
 '[poverty',
 '[roller',
 '[srinivasa',
 'a measure',
 'a standardized test',
 'a test',
 'a variety',
 'albrecht dürer',
 'alexander bassano',
 'alien',
 'alzheimer',
 'amusement ride',
 'an admission

In [44]:
topics = es.search(
    index='anna-opinions-v0',
    query={"match_all":{}},
    size=1000
)

In [47]:
topics['hits']['hits'][0]

{'_index': 'anna-opinions-v0',
 '_id': 'U034b4IBRfbdi4KAE0k3',
 '_score': 1.0,
 '_source': {'service': 'https://unit-16.slack.com/',
  'channel': 'D02NK563878',
  'topic': 'polly pocket',
  'opinion': 'Anna seems to really enjoy Polly Pocket, as she mentions that she has been watching it since she was a child',
  'speaker_id': '97408e96-806c-11ec-9e63-e3813e25bfd1',
  '@timestamp': '2022-08-05T14:45:05.051388-07:00'}}

In [52]:
for t in topics['hits']['hits']:
    if 'topic' not in t['_source']:
        continue
    print(t['_id'], t['_source']['topic'])

U034b4IBRfbdi4KAE0k3 polly pocket
AyP2dIIBy5mPdc6Pj2Vz stable diffusion
2U0HdYIBRfbdi4KApkm- obsession
2E0HdYIBRfbdi4KApUlN intrusive thoughts
qk3odIIBRfbdi4KAzUkE gorilla
qU3odIIBRfbdi4KAy0l9 explosion
qE3odIIBRfbdi4KAyUmj shooting
k00wf4IBRfbdi4KA30r7 tool
kU0sf4IBRfbdi4KAMkp9 painting
gCPiiYIBy5mPdc6PKWVB james webb space telescope
ciMrhYIBy5mPdc6PQmUq security
4E2Sf4IBRfbdi4KALUpD refunds
202Pf4IBRfbdi4KAekoT heart disease
cSMrhYIBy5mPdc6PE2X9 lock picking
cyMrhYIBy5mPdc6PQ2WT locks
3E2Pf4IBRfbdi4KAe0q5 cancer
dCMrhYIBy5mPdc6PRWUL safety
3U2Pf4IBRfbdi4KAfErF dementia
8k0To4IBRfbdi4KAUU3r ai
s00UlYIBRfbdi4KAwEzW creativity
_E1Oo4IBRfbdi4KA5011 college admissions
dyMTo4IBy5mPdc6PVWbT superintelligence
1E12lYIBRfbdi4KA9Uxk moulin rouge
800To4IBRfbdi4KAU02R self-awareness
fCNOo4IBy5mPdc6P5GaV a test
yk1xlYIBRfbdi4KA3kw2 retail
bk1qvYIBRfbdi4KAKFDR future
OCMGsoIBy5mPdc6Pt2de creatures
cE1qvYIBRfbdi4KALFBU memories
ck1svYIBRfbdi4KAPlA4 future
DSMNqoIBy5mPdc6PYWeC anna's birthday party
D

In [6]:
topics = es.search(
    index=ELASTIC_INDEX,
    query=
{
    "bool":
    {
        "must":
        [
            {
                "match":
                {
                    "service.keyword": "https://unit-16.slack.com/"
                }
            },
            {
                "match":
                {
                    "channel.keyword": "D02NK563878"
                }
            },
            {
                "match":
                {
                    "keywords.keyword": "water main break"
                }
            }
        ]
    }
}
)

In [7]:
topics

ObjectApiResponse({'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 5.7486224, 'hits': [{'_index': 'anna-summaries-v0', '_id': 'hCMypIIBy5mPdc6PjGaM', '_score': 5.7486224, '_source': {'convo_id': 'nWa9Q3CCn3dPvp5DzaMKRp', 'summary': 'Anna and Rob are discussing what a water main break might look like. Anna guesses that it would be pretty chaotic, with water shooting high into the air. Rob wonders what that might look like and Anna has no idea.', 'service': 'https://unit-16.slack.com/', 'channel': 'D02NK563878', '@timestamp': '2022-08-15T18:09:12.428763-07:00', 'keywords': ['water main break']}}, {'_index': 'anna-summaries-v0', '_id': 'q02eqYIBRfbdi4KAAU6I', '_score': 5.7486224, '_source': {'convo_id': 'ESiGyDQHXfQRomkCJgwNtr', 'summary': 'Anna and Rob are discussing a water main break that occurred the previous day. Anna is curious about what caused the break, and Rob expla

In [9]:
topics

ObjectApiResponse({'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 5.7486224, 'hits': [{'_index': 'anna-summaries-v0', '_id': 'hCMypIIBy5mPdc6PjGaM', '_score': 5.7486224, '_source': {'convo_id': 'nWa9Q3CCn3dPvp5DzaMKRp', 'summary': 'Anna and Rob are discussing what a water main break might look like. Anna guesses that it would be pretty chaotic, with water shooting high into the air. Rob wonders what that might look like and Anna has no idea.', 'service': 'https://unit-16.slack.com/', 'channel': 'D02NK563878', '@timestamp': '2022-08-15T18:09:12.428763-07:00', 'keywords': ['water main break']}}, {'_index': 'anna-summaries-v0', '_id': 'q02eqYIBRfbdi4KAAU6I', '_score': 5.7486224, '_source': {'convo_id': 'ESiGyDQHXfQRomkCJgwNtr', 'summary': 'Anna and Rob are discussing a water main break that occurred the previous day. Anna is curious about what caused the break, and Rob expla

In [5]:
entity = es.search( # pylint: disable=unexpected-keyword-arg
    index=ELASTIC_INDEX,
    query={
        "bool": {
            "must": [
                {"match": {"keywords.keyword": { "query": "gardening" }}}
            ]
        }
    },
    size=10
)['hits']['hits']

In [6]:
entity

[{'_index': 'anna-summaries-v0',
  '_id': '_B22UYIBGYbVpQ0oqLDk',
  '_score': 4.4225235,
  '_source': {'convo_id': '4QD63MkawKUa6uH5QBzuEw',
   'summary': 'Anna knows a lot about plants, including how to garden and when to harvest jalapeños. She is also familiar with artificial intelligence and its potential to change the world for the better.',
   'service': 'https://unit-16.slack.com/',
   'channel': 'D02NK563878',
   '@timestamp': '2022-07-30T17:45:01.476600-07:00',
   'keywords': ['gardening']}}]

In [7]:
nlp = spacy.load("en_core_web_lg")

In [47]:
sent = 'Anna knows a lot about plants, including how to garden and when to harvest jalapeños. She is also familiar with artificial intelligence and its potential to change the world for the better.'


In [8]:
def extract_entities(text):
    ''' return a list of all entities in text '''
    return list({n.text.strip() for n in nlp(text).ents if n.text.strip() != BOT_NAME})

In [9]:
def extract_nouns(text):
    ''' return a list of all nouns (except pronouns) in text '''
    nouns = {n.text.strip() for n in nlp(text).noun_chunks if n.text.strip() != BOT_NAME for t in n if t.pos_ != 'PRON'}
    return list(nouns)

In [12]:
extract_nouns('basic income')

['basic income']

In [51]:
extract_nouns(sent)

['jalapeños',
 'artificial intelligence',
 'plants',
 'the world',
 'its potential',
 'a lot']

In [20]:
convo = [f'''{h['_source']['speaker']}: {h['_source']['msg']}''' for h in history[::-1]]
# convo = [f"{h['_source']['msg']}" for h in history[::-1]]

In [21]:
convo[-1]

'Anna: Oh, this is great. This is me out of context I think?'

In [24]:
# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_trf")

# Find named entities, phrases and concepts
for text in convo[-8:]:
    dobj = None
    doc = nlp(text.replace('*',''))
    print(doc)
    for token in doc:
        if token.dep_ == 'dobj':
            dobj = token
            break
    if dobj:
        print("Intent:", token.head.text + token.text.capitalize())

    print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
    print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])
    for entity in set(doc.ents):
        print(entity.text, entity.label_)
    print('')

Anna: Let's change the subject.
Intent: changeSubject
Noun phrases: ['Anna', "'s", 'the subject']
Verbs: ['let', 'change']
Anna PERSON

Anna: hi
Noun phrases: ['Anna']
Verbs: []
Anna PERSON

Anna: Hi Anna there is a fursuiter here
Noun phrases: ['Anna', 'a fursuiter']
Verbs: ['be']
Anna PERSON
Anna PERSON

Anna: I really really really really really really really don't want to talk to any fursuiters
Noun phrases: ['Anna', 'I', 'any fursuiters']
Verbs: ['want', 'talk']
Anna PERSON

Rob: Why not? Fursuiters are a lovely people. So full of mischief.
Noun phrases: ['Rob', 'Fursuiters', 'a lovely people', 'mischief']
Verbs: []
Rob PERSON
Fursuiters ORG

Anna: I think you're going to want to come home.
Noun phrases: ['I', 'you']
Verbs: ['think', 'go', 'want', 'come']
Anna PERSON

Rob: <https://static.wikia.nocookie.net/emojimovie/images/3/31/Mel_meh.PNG>
Noun phrases: ['Rob: <https://static.wikia.nocookie.net/emojimovie/images/3/31/Mel_meh.PNG']
Verbs: []
Rob PERSON

Anna: Oh, this is great. 

In [25]:
j = doc.to_json()
print(doc)
print(' '.join([w['lemma'] for w in j['tokens']]))

Anna: Oh, this is great. This is me out of context I think?
Anna : oh , this be great . this be I out of context I think ?


In [26]:
[n for n in doc.noun_chunks]

[this, This, me, context, I]

In [27]:
for token in doc:
    if token.dep_ == 'dobj':
        dobj = token

In [28]:
print(dobj)

None


In [69]:
for token in doc:
    print(token, dobj.similarity(token))

AttributeError: 'NoneType' object has no attribute 'similarity'

In [71]:
for token in doc:
    print(token.dep_, token.text)

ROOT Rob
punct :
intj Alright
punct ,
ROOT Anna
punct .
nsubj That
aux should
ROOT do
dobj it
punct .
advmod Still
ROOT want
aux to
xcomp go
compound Christmas
dobj caroling
punct ?


In [82]:
doc = nlp(random.choice(convo))
print(doc)

Anna: I'm not going for comedy.


In [83]:
spacy.displacy.render(
    doc, 
    style="dep", 
    options={
        "compact":False,
        "fine_grained":True,
        "add_lemma":True,
        "collapse_punct":True,
        "collapse_phrases":True
    }
)

In [84]:
spacy.displacy.render(doc, style="ent")

In [92]:
for token in doc:
    print(f'{token}({token.dep_}):\t', spacy.explain(token.dep_))

Anna(npadvmod):	 noun phrase as adverbial modifier
:(punct):	 punctuation
I(nsubj):	 nominal subject
'm(aux):	 auxiliary
not(neg):	 negation modifier
going(ROOT):	 None
for(prep):	 prepositional modifier
comedy(pobj):	 object of preposition
.(punct):	 punctuation


In [35]:
z = list(doc)[0]

In [36]:
z, z.similarity(nlp("Anna"))

(Rob, 0.3539790079137116)

In [37]:
nlp = spacy.load("en_core_web_lg")

In [38]:
def get_dobj(doc):
    ''' Return the direct object, if any. '''
    for token in doc:
        if token.dep_ == 'dobj':
            return token
    return None

def simple_intent(text):
    doc = nlp(text)
    token = get_dobj(doc)
    if not token:
        return None
    return token.head.lemma_.lower() + token.text.capitalize()
    
def conj_intent(text):
    doc = nlp(text)
    token = get_dobj(doc)
    if not token:
        return None
    dobj = [token.text]
    conj = [t.text for t in token.conjuncts]
    return token.head.lemma_.lower(), dobj + conj

In [39]:
simple_intent("I want pizza")

'wantPizza'

In [40]:
simple_intent("I want pizza and your face")

'wantPizza'

In [348]:
conj_intent("I want pizza and your face")

('want', ['pizza', 'face'])

In [349]:
conj_intent("Anna wants pizza and your face")

('want', ['pizza', 'face'])

In [43]:
def match_intent(text):
    doc = nlp(text)
    token = get_dobj(doc)
    if not token:
        return None

    dobj = token
    tverb = token.head
    
    verbList = ['want', 'desire', 'need']
    if tverb.lemma_ in verbList:
        intentVerb = tverb
    else:
        if tverb.head.dep_ == 'ROOT':
            intentVerb = tverb.head
            
    objList = ['pizza', 'face']
    if dobj.text in objList:
        intentObj = dobj
    else:
        for child in dobj.children:
            if child.dep_ == 'prep':
                intentObj = list(child.children)[0]
                break
            elif child.dep_ == 'compound':
                intentObj = child
                break
                
    return intentVerb.lemma_.lower() + intentObj.text.capitalize()

In [44]:
simple_intent("I want to place an order for your face")

'placeOrder'

In [45]:
match_intent("I want to place an order for some pizza and your face")

'wantPizza'

In [46]:
def syn_intent(text):
    doc = nlp(text)
    token = get_dobj(doc)
    if not token:
        return None
    verb = token.head.lemma_
    dobj = token.text.lower()
    
    verbList = [('order','want','give','make'),('show','find')]
    verbSyns = [item for item in verbList if verb in item]
    
    dobjList = [('pizza','pie','dish'),('cola','soda')]
    dobjSyns = [item for item in dobjList if dobj in item]
    
    return verbSyns[0][0] + dobjSyns[0][0].capitalize()


In [47]:
syn_intent('give me my pie')

'orderPizza'

In [49]:
def sim_intent(text):
    doc = nlp(text)
    token = get_dobj(doc)
    if not token:
        print('nope')
        return None
    verb = token.head
    dobj = token
    
    verbList = [nlp('buy'), nlp('make'), nlp('show')]
    verbSyns = [item for item in verbList if verb.similarity(item) > 0.5]
    
    dobjList = [nlp('food'), nlp('beverage')]
    dobjSyns = [item for item in dobjList if dobj.similarity(item) > 0.6]
    
    return verbSyns[0][0].text + dobjSyns[0][0].text.capitalize()


In [50]:
sim_intent('i want to buy a soda')

'buyBeverage'

In [51]:
sim_intent('i want to see the drinks')

'makeFood'

In [52]:
nlp('show').similarity(nlp('see'))

0.5966469921671479

In [96]:
# https://www.analyticsvidhya.com/blog/2019/09/introduction-information-extraction-python-spacy/
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

pd.set_option('display.max_colwidth', 200)

In [140]:
such_as = [{'DEP':'amod', 'OP':"?"}, # adjectival modifier
           {'POS':'NOUN'},
           {'LOWER': 'such'},
           {'LOWER': 'as'},
           {'POS': 'PROPN'}]

and_other = [{'DEP':'amod', 'OP':"?"}, 
           {'POS':'NOUN'}, 
           {'LOWER': 'and', 'OP':"?"}, 
           {'LOWER': 'or', 'OP':"?"}, 
           {'LOWER': 'other'}, 
           {'POS': 'NOUN'}] 

including = [{'DEP':'nummod','OP':"?"}, # numeric modifier 
           {'DEP':'amod','OP':"?"}, # adjectival modifier 
           {'POS':'NOUN'}, 
           {'IS_PUNCT': True}, 
           {'LOWER': 'including'}, 
           {'DEP':'nummod','OP':"?"}, 
           {'DEP':'amod','OP':"?"}, 
           {'POS':'NOUN'}] 
                  
especially = [{'DEP':'nummod','OP':"?"}, 
           {'DEP':'amod','OP':"?"}, 
           {'POS':'NOUN'}, 
           {'IS_PUNCT':True}, 
           {'LOWER': 'especially'}, 
           {'DEP':'nummod','OP':"?"}, 
           {'DEP':'amod','OP':"?"}, 
           {'POS':'NOUN'}] 

In [146]:
# Matcher class object 
matcher = Matcher(nlp.vocab) 
matcher.add("such_as", [such_as]) 
matcher.add("and_other", [and_other])
matcher.add("including", [including])
matcher.add("especially", [especially])

In [147]:
sentences = [
    "GDP in developing countries such as Vietnam will continue growing at a high rate.",
    "Here is how you can keep your car and other vehicles clean.",
    "Here is how you can keep your car or other vehicles clean.",
    "Eight people, including two children",
    "A healthy eating pattern includes fruits, especially apples."
]

for sent in sentences:
    doc = nlp(sent)

    matches = matcher(doc)
    if matches:
    #     print(matches)
        span = doc[matches[0][1]:matches[0][2]] 
        print(span.text)


developing countries such as Vietnam
car and other vehicles
car or other vehicles
Eight people, including two children
fruits, especially apples


In [149]:
text = "Tableau was recently acquired by Salesforce." 

# Plot the dependency graph 
doc = nlp(text) 
displacy.render(doc, style='dep')

In [208]:
def is_passive(doc):
    ''' Returns True if sentince is passive, else False '''
    for i,tok in enumerate(doc):
        if tok.dep_.endswith("subjpass"):
            return True
    return False

def subtree_matcher(doc):
    x = y = z = ''

    if is_passive(doc):
        for i,tok in enumerate(doc):
            if tok.dep_.endswith("subjpass") == True:
                z = tok
                y = tok.head

            if tok.dep_.endswith("obj") == True:
                x = tok

    else:
        for i,tok in enumerate(doc):
            if tok.dep_.endswith("subj") == True:
                x = tok
                y = tok.head

            if tok.dep_.endswith("obj") == True:
                z = tok

    return x, y, z

In [None]:
roberta_nlp = spacy.load("en_core_web_trf")

In [216]:
sentences = [
    "Tableau was recently acquired by Salesforce.",
    "Careem, a ride-hailing major in the middle east, was acquired by Uber.",
    "Salesforce recently acquired Tableau.",
    "Tableau was recently acquired by Salesforce.",
    "Rob was taken aback by the sheer simplicity of Anna's brain",
    "Rob was taken aback by the sheer simplicity of Anna",
    "I was taken aback by the sheer simplicity of it",
    "I was taken aback by the sheer simplicity",
    "Who knows?"
]

for sent in sentences:
    doc = roberta_nlp(sent)
    print(subtree_matcher(doc))
    displacy.render(doc, style='ent')

(Salesforce, acquired, Tableau)


(Uber, acquired, Careem)


(Salesforce, acquired, Tableau)


(Salesforce, acquired, Tableau)


(brain, taken, Rob)


(Anna, taken, Rob)


(it, taken, I)


(simplicity, taken, I)


(Who, knows, '')


In [180]:
displacy.render(nlp("I was taken aback by the sheer simplicity of it"))

In [181]:
displacy.render(nlp("I was taken aback by the sheer simplicity"))

In [187]:
def print_entities(pipeline, text):
    
    # Create a document 
    document = pipeline(text)
    
    # Entity text & label extraction
    for entity in document.ents:
        print(entity.text + '->', entity.label_)
        
        
def visualize_entities(pipeline, text):
    
    # Create a document 
    document = pipeline(text)
        
    # Show entities in pretty manner
    displacy.render(document, jupyter=True, style='ent')

In [188]:
short_text = """Amy Schneider, an engineering manager from Oakland, California, became the first woman and the fourth person on “Jeopardy!” to earn more than $1 million in winnings on Friday’s episode."""

long_text = """Good news for consumers, undoubtedly, and good news also for investors. Apple’s recent results, covering the three months to December 31 2016, saw the company’s chief financial officer Luca Maestri announce: ‘We returned nearly $15 billion to investors through share re-purchases and dividends during the quarter.’ The quarterly dividend itself was 57 cents a share, identical to the dividend for the previous three quarters and up on the 52 cents paid for each of the four quarters before that.
Business is brisk at Apple. On January 31, Tim Cook, Apple’s chief executive, said of the last three months of 2016: ‘We’re thrilled to report that our holiday quarter results generated Apple’s highest quarterly revenue ever, and broke multiple records along the way. We sold more iPhones than ever before and set all-time revenue records for iPhone, Services, Mac and Apple Watch"""


In [189]:
nlp_sm = spacy.load("en_core_web_sm")
nlp_lg = spacy.load("en_core_web_lg")
roberta_nlp = spacy.load("en_core_web_trf")

In [196]:
visualize_entities(nlp_sm, long_text)

In [197]:
visualize_entities(nlp_lg, long_text)

In [198]:
visualize_entities(roberta_nlp, long_text)

In [184]:
# https://github.com/seadavis/StoryNode/blob/main/src/core/relation_extraction.py
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_lg")

class TextSpan:

    def __init__(self, sentence, start_index, end_index):
        self.sentence = sentence
        self.start_index = start_index
        self.end_index = end_index
        
class Relation:

    def __init__(self, left_phrase, relation_phrase, right_phrase):
        """Constructs a relation of the form
        (left_phrase, relation_phrase, right_phrase)

        Examples:
        (Sean, runs to, mall), 
        (Gandalf, shall not, pass), 
        (the dog, flies, at midnight)

        Args:
            left_phrase (TextSpan): the leftside phrse
            relation_phrase (TextSpan): the relation phrase
            right_phrase (TextSpan): the right-side phrase of the relation
        """
        self.left_phrase = left_phrase
        self.relation_phrase = relation_phrase
        self.right_phrase = right_phrase

    def __eq__(self, other):
        return self.left_phrase == other.left_phrase and self.relation_phrase == other.relation_phrase and self.right_phrase == other.right_phrase
    
    def __str__(self):
        return f'({self.left_phrase.sent}, {self.relation_phrase.sent}, {self.right_phrase.sent})'


class RelationCollection:

    def __init__(self, relations):
        self.relations = relations

    @property
    def left_phrases(self):
        return None

    @property
    def right_phrases(self):
        return None

    @property
    def relation_phrases(self):
        return None

    def join(self, other):
        return None



def construct_text_spans(doc, matches):
    ret_spans = []
    for match_id, start, end in matches:
        ret_spans.append(doc[start:end])
    return ret_spans

def extract_relations(doc):
    """extracts the complete relations from the doc

    Args:
        doc ([type]): [description]

    Returns:
        [Relation]: the complete set of relations found from the documentation
    """
    relation_spans = get_relation_spans(doc)
    noun_phrase_pattern = [[{"POS":"NOUN"}], [{"POS": "PROPN"}], [{"POS": "PRON"}]]
    
    relations = []

    for span in relation_spans:
        left_noun = find_nearest_pattern(doc, noun_phrase_pattern, span, True)
        right_noun = find_nearest_pattern(doc, noun_phrase_pattern, span, False)

        if (not left_noun is None) and (not right_noun is None):
            relations.append(Relation(left_noun, span, right_noun))
    return relations
        


def get_relation_spans(doc):
    """extracts the complete relations from the doc

    Args:
        doc (Document): the document we are using to gather
        the middle portion of the relations

    Returns:
        [Relation]: the complete set of relations found from the documentation
    """
    
    
    verbs = get_verbs(doc)
    fluff_pattern = [[{"POS":"VERB"}, {"POS": "PART", "OP": "*"}, {"POS": "ADV", "OP":"*"}], 
                        [{"POS": "VERB"},  {"POS": "ADP", "OP": "*"}, {"POS": "DET", "OP":"*"},
                        {"POS": "AUX", "OP": "*"},  
                        {"POS": "ADJ", "OP":"*"}, {"POS": "ADV", "OP": "*"}]]
    matcher = Matcher(doc.vocab)
#     matcher = doc.matcher
    matcher.add("Fluff", fluff_pattern)
    syntactical_constraint_matches = construct_text_spans(doc, matcher(doc.doc))

    relation_spans = set()
#     print([f"{verb} {type(verb)}" for verb in verbs])
#     print([type(f) for f in syntactical_constraint_matches])
    for verb in verbs:
        verb_spans = [span for span in syntactical_constraint_matches if verb in str(span.sent)]
#         print("verb_spans:", verb_spans)
        joined_spans = merge_overlapping_consecutive_word_span(verb_spans)
        longest_span = find_longest_span(joined_spans)
        relation_spans.add(longest_span)
    return list(relation_spans)

        

def get_verbs(doc):
    matcher = Matcher(doc.vocab)
#     matcher = doc.matcher
    fluff_pattern = [[{"POS":"VERB"}]]
    matcher.add("Fluff", fluff_pattern)
    matches = matcher(doc.doc)
    verbs = []
    for match_id, start, end in matches:
        verbs.append(doc.doc[start:end].text)
    return verbs

def find_nearest_pattern(doc, pattern, text_span, search_before):
    """Find in doc, the nearest pattern to the given text_span,
    returns the result as a TextSpan

    Args:
        doc (spacy Document) the document in spacy we are looking for
        pattern (the pattern array to search for): the array of patterns we are
        looking for
        text_span (TextSpan): describes where in the document the word or phrase is
        search_before (bool): if true, then we want to find the nearest pattern that occurs,
                before text_span. Otherwise finds the nearest pattern after text_span
    """
    matcher = Matcher(doc.vocab)
#     matcher = doc.matcher
    matcher.add("PatternNear", pattern)
    matches = matcher(doc.doc)
    nearest_pattern = None
    spans = construct_text_spans(doc, matches)
    sorted_spans = sorted(spans, key=lambda s : s.start)

    spans_to_search = []
    if search_before:
        spans_to_search = [span for span in sorted_spans if span.start < text_span.start]
        spans_to_search.reverse()

    else:
        spans_to_search = [span for span in sorted_spans if span.start > text_span.start]

    if len(spans_to_search) == 0:
        return None

    return spans_to_search[0]


def merge_overlapping_consecutive_word_span(text_spans):
    """Merges two spans into one span if they are
    consecutive end_index=start_index or they overlap

    Applies to all in order.

    Args:
        text_spans ([type]): the span containing the word
    """
    sorted_spans = sorted(text_spans, key=lambda s : s.start)
    current_index = 0
    next_index = 1
    merged_overlapping_spans = []
    overlapped_indices = []

    while next_index <= len(sorted_spans) - 1:
        span = sorted_spans[current_index]
        next_span = sorted_spans[next_index]
        potential_overlap = span.end > next_span.start

        if potential_overlap:
            current_index = next_index
            next_index = next_index + 1
            merged_overlapping_spans.append(span)
        else:
            overlapped_indices.append(next_index)
            sorted_spans[current_index] = span
            next_index = next_index + 1
  
    if next_index - current_index > 1:
        merged_overlapping_spans.append(sorted_spans[current_index])

    last_cons_index = len(sorted_spans) - 1
    if not (last_cons_index in overlapped_indices):
        merged_overlapping_spans.append(sorted_spans[last_cons_index])

#     print("merged_overlapping_spans:", merged_overlapping_spans)
    return merged_overlapping_spans


def find_latest_span(text_spans):
    """Finds the latest occuring span in given 
    set of text_spans

    Args:
        text_spans (TextSpan): the span of text according to some document
    """
    if len(text_spans) == 0:
        return None

    sorted_spans = sorted(text_spans, key=lambda s: s.end_index, reverse=True)
    return sorted_spans[0]

def find_earliest_span(text_spans):
    """Finds the span that is the "earliest occuriing", i.e. the 
    smallest start index

    Args:
        text_spans ([type]): the smallest match on the text span
    """
    if len(text_spans) == 0:
        return None

    sorted_spans = sorted(text_spans, key=lambda s: s.start)
    return sorted_spans[0]


def find_longest_span(text_spans):
    """find the longest match

    Args:
        text_spans ([TextSpan]): the set of matches we are filtering
    """
    if len(text_spans) == 0:
        return None

    sorted_spans = sorted(text_spans, key=lambda s: len(str(s)), reverse=True)
    return sorted_spans[0]

In [208]:
docs = [
    "She was received them exceedingly well.",
    "Salesforce was recently acquired by Tableau.",
    "Anna likes movies, but is interested in learning more about robots, Pixar, and Philip K. Dick.",
    "She hates avocodo toast.",
    'Who are the most important people in your life?',
'Anna and Rob discuss the idea that people may watch movies multiple times because they enjoy the familiarity, or they appreciate the nuances that they may have missed before. Anna expresses interest in learning more about robots, Pixar, and Philip K. Dick.',
'A people is a group of people considered as a whole. In politics and law, the term refers to the collective or community of an ethnic group, a nation, or the public at large.',
'I would say that the most important people in my life are my family and friends.',
'Could you be more specific?',
'Anna loves her family',
'The family is a group of people related by blood or marriage. The purpose of the family is to provide support and stability for its members.',
'Anna remembers that 4 months ago, Rob was wondering what a portrait of Anna by Albrecht Dürer would look like',
'Anna likely feels annoyance or irritation when Rob brings up the topic of the portrait, as she has already told him not to bring it up',
'Anna likely feels that Rob is dismissive of the importance of time',
'The show Friends is about six friends in their 20s and 30s who live in Manhattan, New York City.',
'Anna loves Pixar movies',
'Anna is interested in learning more about robots and Pixar films',
'Based on the conversation, it seems that Anna enjoys watching movies multiple times because she appreciates the nuances that she may have missed before',
'Anna enjoys watching movies, and she particularly likes learning more about robots, Pixar, and Philip K',
'Anna enjoys movies because they provide her with entertainment and a sense of familiarity',
'Anna likes movies, but is interested in learning more about robots, Pixar, and Philip K',
'Anna generally enjoys movies',
'Anna likes movies, but she hasn\'t seen many with raccoons in them', 
'Anna is fascinated by robots and would like to learn more about them',
'Anna is unsure how she feels about Philip K',
'Anna says that the most important people in my life are definitely my family and friends.',
'They provide me with support, stability, and love, which are all incredibly important to me.',
'I mean, which people specifically?',
'My mom, dad, sister, and close friends.',
'Anna and Rob were discussing the idea of a basic income, with Anna arguing that it would be a great way to reduce poverty. Anna mentioned that she loves cave paintings and finds them to be charming and whimsical. She then went on to say that she also loves the work of Hayao Miyazaki for similar reasons.',
]

In [266]:
docs = [
'Anna is fascinated by robots and would like to learn more about them',
]

In [269]:
rel.left_phrase, rel.relation_phrase, rel.right_phrase

(Anna, fascinated by, robots)

In [270]:
for token in doc[rel.relation_phrase.start:rel.relation_phrase.end]:
    print(token, token.pos_)

fascinated VERB
by ADP


In [268]:
for sent in docs:
    doc = nlp(sent)
    for rel in extract_relations(doc):
#         if rel.relation_phrase.start
        spacy.displacy.render(nlp(doc), style="ent")
        print(rel.left_phrase, '|', rel.relation_phrase.lemma_, '|', rel.right_phrase, '\n')

Anna | fascinate by | robots 



In [211]:
rel.relation_phrase

mentioned

In [212]:
for token in rel.relation_phrase:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)


mentioned mention VERB VBD ROOT xxxx True False


In [217]:
rel.relation_phrase, rel.relation_phrase.lemma_

(mentioned, 'mention')

In [231]:
docs = [
    'Anna says that the most important people in my life are definitely my family and friends.',\
    'This is the simplest approach.',
    'You should always be yourself.',
    "We're going to take our time.",
    "We're the best.",
    "This is stupid."
]

In [232]:
for doc in docs:
    print([[token.lemma_, token.pos_] for token in nlp(doc)])

[['Anna', 'PROPN'], ['say', 'VERB'], ['that', 'SCONJ'], ['the', 'DET'], ['most', 'ADV'], ['important', 'ADJ'], ['people', 'NOUN'], ['in', 'ADP'], ['my', 'PRON'], ['life', 'NOUN'], ['be', 'AUX'], ['definitely', 'ADV'], ['my', 'PRON'], ['family', 'NOUN'], ['and', 'CCONJ'], ['friend', 'NOUN'], ['.', 'PUNCT']]
[['this', 'PRON'], ['be', 'AUX'], ['the', 'DET'], ['simple', 'ADJ'], ['approach', 'NOUN'], ['.', 'PUNCT']]
[['you', 'PRON'], ['should', 'AUX'], ['always', 'ADV'], ['be', 'AUX'], ['yourself', 'PRON'], ['.', 'PUNCT']]
[['we', 'PRON'], ['be', 'AUX'], ['go', 'VERB'], ['to', 'PART'], ['take', 'VERB'], ['our', 'PRON'], ['time', 'NOUN'], ['.', 'PUNCT']]
[['we', 'PRON'], ['be', 'AUX'], ['the', 'DET'], ['good', 'ADJ'], ['.', 'PUNCT']]
[['this', 'PRON'], ['be', 'AUX'], ['stupid', 'ADJ'], ['.', 'PUNCT']]
