In [None]:
import nltk
from py2neo import Graph

import pandas as pd
from pprint import pprint

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import spacy
from spacy import displacy
import en_core_web_sm

from nltk import pos_tag
from itertools import groupby

import pickle

In [None]:
def connect():
    """Open the Neo4j connection used by the rest of the notebook.

    Binds a py2neo ``Graph`` to the module-level name ``graph``.

    The connection settings are read from the environment variables
    ``NEO4J_URI``, ``NEO4J_USER`` and ``NEO4J_PASSWORD``; when a variable is
    not set the previous hard-coded value is used, so existing local setups
    keep working unchanged.
    """
    global graph
    import os  # local import keeps this cell self-contained

    uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    user = os.environ.get("NEO4J_USER", "neo4j")
    password = os.environ.get("NEO4J_PASSWORD", "test")

    graph = Graph(uri, auth=(user, password))

    # A trivial round trip verifies the server is reachable. The previous
    # version called graph.begin() and never committed or rolled back the
    # resulting transaction, leaving it dangling.
    graph.run("RETURN 1").evaluate()
    print('Connected...')

In [None]:
# Open the database connection; connect() binds the global `graph` object.
connect()

Connected...


In [None]:
def ask_question():
    """Prompt the user for a question and store it in the global ``question``.

    Nothing is returned; after the prompt a blank separator is printed.
    """
    global question
    prompt = "INPUT: "
    question = input(prompt)
    # print() appends its own newline, so this emits two line breaks
    print("\n")

In [None]:
def tokenize(question):
    """Split ``question`` into word tokens and drop English stop words.

    Parameters
    ----------
    question : str
        The raw question text.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` that are not in NLTK's
        English stop-word list, in their original order.

    Notes
    -----
    The comparison is case-sensitive, so a capitalised token such as
    ``'Where'`` is kept even though ``'where'`` is a stop word.
    """
    stop_words = set(stopwords.words('english'))

    # Single pass; the earlier version built this list twice and threw the
    # first result away.
    return [token for token in word_tokenize(question) if token not in stop_words]

In [None]:
# Example question; tokenize() returns its tokens with NLTK English stop words removed.
question = "Where does Louisa Lim live?"
filtered_question = tokenize(question)

In [None]:
filtered_question

['Where', 'Louisa', 'Lim', 'live', '?']

In [None]:
# Company names in their proper-noun capitalisation.
companies = [
    "Lehman Brothers",
    "ABN Amro",
    "ING",
    "Sugar Foods",
    "Dresdner Kleinwort Wasserstein",
    "Calyon Securities",
    "Purvin",
    "Moet & Chandon",
    "Seton Healthcare",
    "EADS",
    "State Administration of Taxation",
    "Sunderland FC",
    "Bank of England",
    "AG Edwards & Sons",
    "SEC",
    "Stanley Gold",
    "Reliance",
    "Tyco International",
    "Santander Central Hispano",
    "Banco Central Hispano",
    "McDonald",
    "Nomura",
    "GM Europe",
    "Vivendi",
    "Barclays Capital",
    "Manchester United",
    "Barclay",
    "Shearman",
    "Bundesbank",
    "GM",
    "Fiat",
]

In [None]:
# Replace the list with its lower-cased form.
companies = list(map(str.lower, companies))

In [None]:
# Because we lower-case the text when we parse and clean an incoming query,
# we need a mapping from lower-cased company names back to their capitalised
# form, since BBC (and thus Janus) uses proper-noun capitalisation for the
# graph. Companies are tricky because of names like ING and
# AG Edwards & Sons; most people won't type the ampersand.

companies_map = {
    "lehman brothers": "Lehman Brothers",
    "abn amro": "ABN Amro",
    "ing": "ING",
    "sugar foods": "Sugar Foods",
    "dresdner kleinwort wasserstein": "Dresdner Kleinwort Wasserstein",
    "calyon securities": "Calyon Securities",
    "purvin": "Purvin",
    "moet & chandon": "Moet & Chandon",
    "seton healthcare": "Seton Healthcare",
    "eads": "EADS",
    "state administration of taxation": "State Administration of Taxation",
    "sunderland fc": "Sunderland FC",
    "bank of england": "Bank of England",
    "ag edwards & sons": "AG Edwards & Sons",
    "sec": "SEC",
    "stanley gold": "Stanley Gold",
    "reliance": "Reliance",
    "tyco international": "Tyco International",
    "santander central hispano": "Santander Central Hispano",
    "banco central hispano": "Banco Central Hispano",
    "mcdonald": "McDonald",
    "nomura": "Nomura",
    "gm europe": "GM Europe",
    "vivendi": "Vivendi",
    "barclays capital": "Barclays Capital",
    "manchester united": "Manchester United",
    "barclay": "Barclay",
    "shearman": "Shearman",
    # key was previously misspelled "bindesbank", so the lookup for the
    # lower-cased company name "bundesbank" raised KeyError
    "bundesbank": "Bundesbank",
    "gm": "GM",
    "fiat": "Fiat",
}

In [None]:
# Names used as keys for the people/location lookup, in proper-noun capitalisation.
people_locations = [
    "Greenspan",
    "Claridge",
    "Belo Horizonte",
    "Brian Taylor",
    "Gerhard Schroeder",
    "Lindemans",
    "Illva Saronno",
    "Volzhsky",
    "Nikolaev",
    "Bill",
    "Brad Wernle",
    "Nestor Kirchner",
    "Mangala",
    "Junya Tanase",
    "Nicolas Sarkozy",
    "Shanda",
    "Pernod Ricard",
    "Hall Green",
    "Perry Barr",
    "Suzuki",
    "Rachel Harvey",
    "Helen Carroll",
    "Isabelle Kronawitter",
    "David Wyss",
    "Manger Magazin",
    "Louisa Lim",
    "David Willey",
    "Sarah Rainsford",
    "Stefan Johannesson",
    "Mikoyan-Gurevich",
    "Emmanuel Gaillard",
]

In [None]:
# Lower-cased copy of people_locations, in the same order.
people_locations_lower = list(map(str.lower, people_locations))

In [None]:
# Lower-cased name -> original capitalisation (both lists share the same order).
people_location_map = dict(zip(people_locations_lower, people_locations))

In [None]:
people_locations_lower[-1]

'emmanuel gaillard'

In [None]:
# Because we lower-case the text when we parse and clean an incoming query,
# we need a mapping from lower-cased person names back to their capitalised
# form, since BBC (and thus Janus) uses proper-noun capitalisation for the graph.
people_companies = [
    "Alex Potter",
    "Eddie Wong",
    "Jason Kenney",
    "Michael Hannigan",
    "Mike Powell",
    "Ray Neidl",
    "Gertz",
    "Louis Vuitton",
    "Scholl",
    "Philippe Camus",
    "Xie Xuren",
    "Bob Murray",
    "Rachel Lomax",
    "Gary Thayer",
    "Reveta Bowers",
    "Mukesh",
    "Dennis Kozlowski",
    "Emilio Botin",
    "Santander",
    "Bell",
    "Erling Refsum",
    "Carl-Peter Forster",
    "Carl Peter Forster",
    "Jean-Rene Fourtou",
    "Jean Rene Fourtou",
    "Orin Middleton",
    "Eric Cantona",
    "Frederick",
    "Emmanuel Gaillard",
    "Hans Reckers",
    "Rick Wagoner",
]

In [None]:
# Lower-cased copy of people_companies, in the same order.
people_companies_lower = list(map(str.lower, people_companies))

In [None]:
# Map each lower-cased name back to its original capitalisation.
people_companies_map = dict(zip(people_companies_lower, people_companies))

In [None]:
# Persist the lower-case -> capitalised company mapping. The context manager
# closes the file handle, which the bare open() call never did.
with open("companies_map.p", "wb") as fh:
    pickle.dump(companies_map, fh)


In [None]:
# Persist the company-name list. The context manager closes the file handle,
# which the bare open() call never did.
with open("companies.p", "wb") as fh:
    pickle.dump(companies, fh)

In [None]:
# Persist the people/companies name mapping. The context manager closes the
# file handle, which the bare open() call never did.
with open("people_companies.p", "wb") as fh:
    pickle.dump(people_companies_map, fh)

In [None]:
# Persist the people/locations name mapping. The context manager closes the
# file handle, which the bare open() call never did.
with open("people_locations.p", "wb") as fh:
    pickle.dump(people_location_map, fh)

In [None]:
_NLP = None  # spaCy pipeline, loaded on first use and then reused


def _get_nlp():
    """Return the shared spaCy pipeline, loading it only once."""
    global _NLP
    if _NLP is None:
        _NLP = en_core_web_sm.load()
    return _NLP


def _contains_phrase(tokens, phrase_parts):
    """Return True when ``phrase_parts`` occurs as a contiguous run in ``tokens``."""
    span = len(phrase_parts)
    if span == 0:
        return False
    return any(tokens[i:i + span] == phrase_parts
               for i in range(len(tokens) - span + 1))


def _match_companies(question, filtered_question, known_companies):
    """Return the entries of ``known_companies`` mentioned in the question.

    Matching is case-insensitive and works on whole tokens, so "ING" matches
    the entry "ing" while "living" does not. The unfiltered question is
    tokenised as well because stop-word removal would break names such as
    "bank of england". When one hit is contained in a longer hit
    ("gm" inside "gm europe") only the longer one is kept.
    """
    token_runs = [
        [token.lower() for token in word_tokenize(question)],
        [token.lower() for token in filtered_question],
    ]

    hits = []
    for name in known_companies:
        parts = name.lower().split()
        if name not in hits and any(_contains_phrase(run, parts) for run in token_runs):
            hits.append(name)

    return [
        name for name in hits
        if not any(
            other != name
            and _contains_phrase(other.lower().split(), name.lower().split())
            for other in hits
        )
    ]


def tag(question, filtered_question):
    """Run entity recognition and POS tagging on a question.

    Parameters
    ----------
    question : str
        The raw question; it is passed to the spaCy pipeline.
    filtered_question : list of str
        The tokens of the question (as produced by ``tokenize``); they are
        passed to NLTK's ``pos_tag``.

    Returns
    -------
    (ner, tags)
        ``ner`` is a list of ``(text, label)`` pairs taken from spaCy's
        entities. When spaCy finds none, names from the global ``companies``
        list that occur in the question are added with the label ``"ORG"``.
        ``tags`` is the list of ``(token, tag)`` pairs returned by
        ``pos_tag``.

    The function also prints the question, the tokens, the per-token spaCy
    analysis and the POS tags, and renders the parse with displaCy.
    """
    doc = _get_nlp()(question)

    ner = [(ent.text, ent.label_) for ent in doc.ents]

    print(question)
    print(filtered_question)

    for token in doc:
        print((token.text, token.pos_, token.tag_, token.dep_))

    displacy.render(doc)

    tags = pos_tag(filtered_question)

    print('All tags: ', tags)
    print('Length of the list: ', len(tags))

    # consecutive tokens sharing the NNP tag are grouped into one name
    names_tagged = [
        [word for word, _ in run]
        for label, run in groupby(tags, key=lambda pair: pair[1])
        if label == "NNP"
    ]

    print('Tagged names are: ', names_tagged)

    if not ner:
        # Fallback for entities spaCy missed. The earlier check compared
        # original-case single tokens with the lower-cased company list, so
        # it could never match "ING" or any multi-word company.
        for name in _match_companies(question, filtered_question, companies):
            ner.append((name, "ORG"))

    return ner, tags

In [None]:
filtered_question

['Where', 'Louisa', 'Lim', 'live', '?']

In [None]:
# Run entity recognition and POS tagging on the example question.
ner,tags = tag(question, filtered_question)

Where does Louisa Lim live?
['Where', 'Louisa', 'Lim', 'live', '?']
('Where', 'ADV', 'WRB', 'advmod')
('does', 'AUX', 'VBZ', 'aux')
('Louisa', 'PROPN', 'NNP', 'compound')
('Lim', 'PROPN', 'NNP', 'nsubj')
('live', 'VERB', 'VB', 'ROOT')
('?', 'PUNCT', '.', 'punct')


All tags:  [('Where', 'WRB'), ('Louisa', 'NNP'), ('Lim', 'NNP'), ('live', 'NN'), ('?', '.')]
Length of the list:  5
Tagged names are:  [['Louisa', 'Lim']]


In [None]:
tags

[('Where', 'WRB'),
 ('Louisa', 'NNP'),
 ('Lim', 'NNP'),
 ('live', 'NN'),
 ('?', '.')]

In [None]:
# RESTAURANTS NEAR DALLAS <- 
##### <- code that kinda work 
##### <- towards 
### Code that makes query work

In [None]:
# Short country names mapped to the full names used as the query parameter.
_COUNTRY_ALIASES = {
    "US": "United States",
    "USA": "United States",
    "UK": "United Kingdom",
}


def params_builder(ner):
    """Turn recognised entities into query parameter dictionaries.

    Parameters
    ----------
    ner : list of (text, label) pairs
        The entities returned by ``tag``.

    Returns
    -------
    (params, params_2)
        With exactly one entity, ``params`` holds a single key chosen by the
        entity label: ``"country"`` for GPE/LOC, ``"org"`` for ORG,
        ``"person"`` for PERSON (any other label leaves it empty) and
        ``params_2`` is empty.
        With two or more entities, ``params`` is empty and ``params_2`` is
        ``{"name1": ..., "name2": ...}`` built from the first two entities.
        With no entities both dictionaries are empty.

    Each dictionary that gets filled is also printed.
    """
    params, params_2 = {}, {}

    if len(ner) == 1:
        text, label = ner[0][0], ner[0][1]

        if label in ('GPE', 'LOC'):
            # Previously the US/USA/UK branches computed the full name but
            # never stored it, so those questions produced empty params.
            params["country"] = _COUNTRY_ALIASES.get(text, text)
            print(params)

        elif label == 'ORG':
            # Prefer the mapped proper-noun spelling; try the text as given,
            # then its lower-cased form.
            canonical = companies_map.get(text)
            if canonical is None:
                canonical = companies_map.get(text.lower())
            if canonical is None:
                # Unknown company: capitalise the first letter. The old code
                # called the non-existent str.toupper() here and left params
                # empty for names that were already capitalised.
                canonical = text[:1].upper() + text[1:]
            params["org"] = canonical
            print(params)

        elif label == 'PERSON':
            params["person"] = text
            print(params)

    elif len(ner) > 1:
        # two names, e.g. for a relationship query between two people
        params_2 = {"name1": ner[0][0], "name2": ner[1][0]}
        print(params_2)

    return params, params_2

In [2]:
# Build the query parameter dictionaries from the recognised entities.
params, params_2 =  params_builder(ner)

In [3]:
params

In [105]:
def _as_list(value):
    """Wrap a scalar in a list so it can be bound to a Cypher ``IN $param`` test."""
    if value is None:
        return []
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return [value]


# Finally, we pick the right query based on the words of the question.
# The query + params will be sent to Neo4j.
def query_picker(tags, params, params_2=None):
    """Choose and run a Cypher query based on keywords found in ``tags``.

    Parameters
    ----------
    tags : list of (word, tag) pairs
        POS-tagged tokens of the question; the words are scanned in order.
    params : dict
        Parameters for single-entity queries (keys ``"person"`` / ``"org"``).
    params_2 : dict, optional
        Parameters for the two-name relation query (``"name1"``, ``"name2"``).
        When omitted, a global named ``params_2`` is used if one exists,
        which is what this function relied on before the argument was added.

    Returns
    -------
    The rows of the first matching "live/reside/stay", "lives" or
    "article(s)" query (``.data()`` or ``.to_table()`` of the result).
    All other keywords only print their result and scanning continues;
    ``None`` is returned when no returning keyword is found.
    """
    if params_2 is None:
        params_2 = globals().get("params_2") or {}

    # The queries with ``IN $person`` / ``IN $org`` / ``IN $names`` need list
    # parameters, while params_builder produces plain strings.
    person_list = _as_list(params.get("person"))
    org_list = _as_list(params.get("org"))

    query1 = '''
    match (p:Person) -[r:WORKS_FOR]- (o:Company)
    where p.value IN $person
    return p.value as Name,r.value as Works_as,o.value as at
    '''

    query1_1 = '''
    match (p:Person) -[r:WORKS_FOR]- (o:Company)
    where o.value IN $org
    return o.value as Organization, collect(distinct p.value) as Person, r.value as Position
    '''

    query2 = '''
    match (p:Person {value:$person})-[r:LIVES_IN]-(l:Location)
    return p.value as Person, l.value as STAYS_AT
    '''

    query3 = '''
    MATCH (p1:NER_Person:Tag{ value: $name1 }),(p2:NER_Person:Tag{ value: $name2 }), p = shortestPath((p1)-[*..15]-(p2))
    RETURN p1.value as Person1, p2.value as Person2, p as Relation
    '''

    query4 = '''
    MATCH (p:NER_Person)-[w:LIVES_IN]->(o:NER_Location)
    RETURN p.value as Person ,o.value as Lives_in
    '''

    query5 = '''
    MATCH (p:NER_Person)-[:LIVES_IN]->(l:NER_Location), (p)-[w:WORKS_AT]-(o:NER_Organization)
    RETURN p.value as Person, l.value as Lives_in, o.value as Works_at, w.AS as Position
    '''

    # NOTE(review): the relationship variable ``h`` is used twice in this
    # pattern; confirm against the graph that this returns the intended rows.
    query6 = '''
    MATCH (s:Sentence)-[st:SENTENCE_TAG_OCCURRENCE]->(n:TagOccurrence), (s)-[h:HAS_TAG]-(p:NER_Person), (s)-[h]-(o:NER_Organization)
    where n.value IN ["said","says","think","thinks"] AND (p.value in $names OR o.value in $org)
    return s.text as Sentence, p.value as Person
    '''

    query7 = '''
    match (a:Article)-[:HAS_ANNOTATED_TEXT]-(at:AnnotatedText)-[:CONTAINS_SENTENCE]-(s:Sentence)-[:SENTENCE_TAG_OCCURRENCE]-(t:TagOccurrence)-[:TAG_OCCURRENCE_TAG]-(o:NER_Organization)
    where o.value = $org
    return distinct a.Text

    '''

    query8 = '''
    match (a:Article)-[:HAS_ANNOTATED_TEXT]-(at:AnnotatedText)-[:CONTAINS_SENTENCE]-(s:Sentence)-[:SENTENCE_TAG_OCCURRENCE]-(t:TagOccurrence)-[:TAG_OCCURRENCE_TAG]-(p:NER_Person)
    where p.value = $person
    return a.Text

    '''

    for word, _pos in tags:

        if word in ('work', 'do'):
            print('(q1) We have a verb: ', word)
            print(query1)
            print(graph.run(query1, {"person": person_list}).data())

        elif word in ('works', 'at'):
            print('(q1_1) We have an org: ', word)
            print(graph.run(query1_1, {"org": org_list}).to_table())

        elif word in ('live', 'reside', 'stay'):
            print('(q2) We have a verb: ', word)
            return graph.run(query2, params).data()

        elif word in ('related', 'relation'):
            print('(q3) We have a verb: ', word)
            print(graph.run(query3, params_2).data())

        elif word == 'lives':
            # 'live' was also listed here but is always caught by the q2 branch
            print('(q4) We have a verb: ', word)
            return graph.run(query4).to_table()

        elif word == 'everyone':
            print('(q5) We have a verb (s): ', word)
            print(graph.run(query5).to_table())

        elif word in ('think', 'says', 'say'):
            print('(q6) We have a verb: ', word)
            # query6 expects $names and $org; params only ever carries
            # "person"/"org", so both are supplied explicitly as lists.
            print(graph.run(query6, {"names": person_list, "org": org_list}).to_table())

        elif word in ('article', 'articles'):

            if params.get('org'):
                print('(q7) We have an article: ', word)
                return graph.run(query7, params).data()

            # no organisation given: use the proper nouns of the question as the person name
            proper_nouns = [token for token, pos in tags if pos == "NNP"]
            print('(q8) We have an article: ', word)
            return graph.run(query8, {"person": " ".join(proper_nouns)}).data()
                
           
               

In [106]:
# connect -> tokenize -> tag -> params_builder -> query_picker
tags

[('Where', 'WRB'),
 ('Louisa', 'NNP'),
 ('Lim', 'NNP'),
 ('live', 'NN'),
 ('?', '.')]

In [107]:
# Pick and run the query for the tagged question; the returned rows are kept in `results`.
results = query_picker(tags, params)

(q2) We have a verb:  live


In [108]:
#params
params

{'person': 'Louisa Lim'}

In [109]:
# response from Neo4j
results

[{'Person': 'Louisa Lim', 'STAYS_AT': 'Beijing'}]

In [287]:
tags

[('articles', 'NNS'), ('Lisa', 'NNP'), ('Lim', 'NNP')]