# Entity extraction with polyglot

Use the Polyglot Python library to extract named entities (people, organizations, and locations) from tweet text, then enrich the tweet graph in Neo4j with them.

In [1]:
#!pip install polyglot

In [2]:
#!polyglot download embeddings2.en ner2.en

In [7]:
from neo4j.v1 import GraphDatabase
import json
import pprint
#from polyglot.text import Text

In [None]:


from neo4j.v1 import GraphDatabase

# Connect to the local Neo4j instance over Bolt.
driver = GraphDatabase.driver("bolt://localhost:7687")

# Fetch the text and id of every Tweet node.
# The records are read while the session is still open: a result belongs to
# the session that produced it, so iterating it after the `with` block has
# closed the session is not reliable across driver versions.
with driver.session() as session:
    results = session.run("MATCH (t:Tweet) RETURN t.text AS text, t.tweet_id AS tweet_id")
    # One plain dict per tweet: {'id': <tweet_id>, 'text': <text>}.
    tweetObjArr = [{'id': r['tweet_id'], 'text': r['text']} for r in results]

             


In [None]:
# Sanity check: number of tweets fetched from Neo4j.
len(tweetObjArr)

In [None]:
# Run polyglot entity extraction over every fetched tweet and keep only the
# tweets in which at least one entity was found.
#
# NOTE: `Text` is expected to be polyglot.text.Text. Its import in the imports
# cell is commented out, so it has to be re-enabled before this cell can run.

# Parsed tweets: {'id', 'text', 'entities': [{'tag', 'entity'}, ...]}.
entityArr = []
# (tweet id, error repr) for every tweet that could not be processed.
skippedTweets = []

for t in tweetObjArr:
    try:
        # Accessing `.entities` and `.tag` stays inside the try block so that
        # any per-tweet failure there is handled below.
        parsedEntities = [{'tag': e.tag, 'entity': e} for e in Text(t['text']).entities]
    except NameError:
        # A missing import is a setup problem, not a bad tweet. Fail loudly
        # instead of silently producing (and later saving) an empty result.
        raise
    except Exception as exc:
        # Best effort: skip tweets that cannot be parsed, but keep a record
        # of them instead of discarding the error.
        skippedTweets.append((t['id'], repr(exc)))
        continue

    if parsedEntities:
        entityArr.append({
            'id': t['id'],
            'text': t['text'],
            'entities': parsedEntities,
        })

print("Skipped {} of {} tweets due to errors".format(len(skippedTweets), len(tweetObjArr)))

In [None]:
import json

# Persist the extracted entities so the import section can load them from disk.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default, which may not
# be able to represent every character found in tweet text.
with open('parsed_tweets_scraped.json', 'w', encoding='utf-8') as f:
    json.dump(entityArr, f, ensure_ascii=False, sort_keys=True, indent=4)

In [None]:
# Number of tweets in which at least one entity was found.
len(entityArr)

In [None]:
# Spot-check one parsed tweet (raises IndexError if fewer than six tweets had entities).
entityArr[5]

{'entities': [{'entity': I-PER(['Hillary', 'Clinton']), 'tag': 'I-PER'}],
 'id': 773585101489922048,
 'text': '@realDonaldTrump "Hillary Clinton has zero record to run on - unless you call corruption positive.." - @IngrahamAngle'}

# Import into Neo4j

In [8]:
# Load the previously extracted entities from disk.
# JSON files are UTF-8 by convention; read with an explicit encoding rather
# than the platform default so non-ASCII tweet text decodes correctly.
with open("parsed_tweets_scraped.json", encoding="utf-8") as f:
    parsed_tweets = json.load(f)

In [9]:
# Number of parsed tweets loaded from the JSON file.
len(parsed_tweets)

589

In [16]:
# Spot-check one loaded record to see the structure sent to Neo4j
# (keys: 'entities', 'id', 'text').
pprint.pprint(parsed_tweets[4])

{'entities': [{'entity': ['Hillary'], 'tag': 'I-PER'}],
 'id': '588771323289030657',
 'text': 'Emails investigation made Hillary a dubious candidate #DemsWontPass'}


In [26]:
# Open a driver against the local Neo4j instance.
driver = GraphDatabase.driver("bolt://localhost:7687")

# One uniqueness constraint on `name` per entity label that the import creates.
constraint_statements = (
    'CREATE CONSTRAINT ON (p:Person) ASSERT p.name IS UNIQUE;',
    'CREATE CONSTRAINT ON (l:Location) ASSERT l.name IS UNIQUE;',
    'CREATE CONSTRAINT ON (o:Organization) ASSERT o.name IS UNIQUE;',
)

with driver.session() as session:
    for statement in constraint_statements:
        session.run(statement)

In [27]:
# Cypher statement that attaches extracted entities to existing Tweet nodes.
#
# Parameter:
#   $parsedTweets - list of maps shaped like {id, text, entities: [{tag, entity}]},
#                   where `entity` is a list of word strings.
#
# For each parsed tweet the matching (:Tweet) is looked up by tweet_id. Cypher
# equality is type-strict, so parsedTweet.id must have the same type as the
# stored t.tweet_id property. For every entity, a node labelled by its tag
# (I-PER -> Person, I-ORG -> Organization, I-LOC -> Location) is MERGEd on
# `name` and linked to the tweet with CONTAINS_ENTITY.
#
# The FOREACH over `CASE ... THEN [1] ELSE [] END` acts as a conditional: the
# inner MERGEs run once when the tag matches and not at all otherwise.
# `name` is the entity's words joined by single spaces: reduce() appends
# "<word> " for each word and trim() removes the trailing space.
entity_import_query = '''
WITH $parsedTweets AS parsedTweets
UNWIND parsedTweets AS parsedTweet
MATCH (t:Tweet) WHERE t.tweet_id = parsedTweet.id


FOREACH(entity IN parsedTweet.entities |
    // Person
    FOREACH(_ IN CASE WHEN entity.tag = 'I-PER' THEN [1] ELSE [] END | 
        MERGE (p:Person {name: trim(reduce(s = "", x IN entity.entity | s + x + " "))})
        MERGE (p)<-[:CONTAINS_ENTITY]-(t)
    )
    
    // Organization
    FOREACH(_ IN CASE WHEN entity.tag = 'I-ORG' THEN [1] ELSE [] END | 
        MERGE (o:Organization {name: trim(reduce(s = "", x IN entity.entity | s + x + " "))})
        MERGE (o)<-[:CONTAINS_ENTITY]-(t)
    )
    
    // Location
    FOREACH(_ IN CASE WHEN entity.tag = 'I-LOC' THEN [1] ELSE [] END | 
        MERGE (l:Location {name: trim(reduce(s = "", x IN entity.entity | s + x + " "))})
        MERGE (l)<-[:CONTAINS_ENTITY]-(t)
    )
)

'''

In [28]:
# Send all parsed tweets to Neo4j in a single parameterised statement.
# The result is consumed inside the session so the statement is fully executed
# and any Cypher error is raised here, rather than being deferred to session
# close or lost.
with driver.session() as session:
    import_summary = session.run(entity_import_query, parsedTweets=parsed_tweets).consume()

# Show what the import changed (nodes/relationships created, etc.).
import_summary.counters