This notebook demonstrates how to fetch URLs shared on Twitter and insert them into Neo4j. It is a companion to [this blog post]().

In [None]:
import csv
import os
import pprint
import random
from queue import Queue
from threading import Thread

import newspaper
import tweepy
from newspaper import Article, fulltext


In [None]:
# Twitter API credentials. Each value is taken from an environment variable so
# real secrets do not have to be saved inside the notebook; when a variable is
# unset the original 'XXX' placeholder is used.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'XXX')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'XXX')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'XXX')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'XXX')

In [None]:
# Handle to the Neo4j database; the host part of the URL is a placeholder that
# has to be replaced with the real server address.
# NOTE(review): `Graph` is not imported in any visible cell. Presumably it is
# `py2neo.Graph` (the later `graphdb.cypher.execute(...)` call looks like the
# py2neo v2 API) -- confirm and add the import to the first cell.
graphdb = Graph('http://NEO4J_SERVER_URL_HERE/db/data')

In [None]:
# Cypher statement executed once per (user, URL) pair.
# Parameters: {username}, {url}, {keywords} (iterated with FOREACH) and
# {authors} (iterated with FOREACH).
#   1. MERGE the User and URL nodes so repeated runs do not duplicate them.
#   2. Link them with a single (user)-[:SHARED]->(url) relationship.
#   3. For every keyword, MERGE a Keyword node and link (url)-[:IS_ABOUT]->(k).
#   4. For every author, MERGE an Author node and link (url)-[:WRITTEN_BY]->(a).
# NOTE(review): the `{param}` placeholder syntax and CREATE UNIQUE are legacy
# Cypher -- confirm the target Neo4j version still accepts them.
INSERT_USER_URL_QUERY = '''
    MERGE (user:User {username: {username}})
    MERGE (url:URL {url: {url}})
    CREATE UNIQUE (user)-[:SHARED]->(url)
    FOREACH (kw in {keywords} | MERGE (k:Keyword {text: kw}) CREATE UNIQUE (k)<-[:IS_ABOUT]-(url))
    FOREACH (author in {authors} | MERGE (a:Author {name: author}) CREATE UNIQUE(a)<-[:WRITTEN_BY]-(url))
'''

In [None]:
# Build the OAuth handler from the credentials defined above, then create the
# API client with both rate-limit options switched on.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

In [None]:
# Accumulator for the (expanded_url, screen_name) pairs gathered by the crawl.
urls = []
# Result of friends_ids() for the authenticated account; each entry is later
# passed to user_timeline() as the `id` argument.
ids = api.friends_ids()

In [None]:
# Walk the timeline of every friend id and record each shared link as an
# (expanded_url, screen_name) pair in `urls`.
for friend in ids:
    try:
        # Only a single page of up to 200 statuses is requested per friend.
        statuses = api.user_timeline(id=friend, count=200)
    except tweepy.TweepError as err:
        # A timeline that cannot be read must not abort the crawl of the
        # remaining friends, so report it and move on.
        print('Skipping friend %s: %s' % (friend, err))
        continue
    for status in statuses:
        # Tolerate statuses whose entities are empty or carry no 'urls' list.
        for link in (status.entities or {}).get('urls') or []:
            urls.append((link['expanded_url'], status.author.screen_name))

In [None]:
# Persist the collected pairs as one "url,user" row per line.
# csv.writer quotes any field that contains a comma or a quote character, so
# such URLs stay in a single column instead of silently shifting the user
# name. lineterminator='\n' keeps the one-line-per-row layout the reader
# expects. The `with` block closes the file, so no explicit close() is needed.
with open('urls.csv', 'w', newline='') as f:
    csv.writer(f, lineterminator='\n').writerows(urls)

In [None]:
def parseURL(url):
    """Fetch the article at *url* and extract its authors and keywords.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(authors, keywords)`` tuple read from the processed article, or
        ``(None, None)`` when building, downloading, parsing or analysing the
        article raised an exception.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        # `keywords` is read only after the nlp() step has run.
        article.nlp()
        return (article.authors, article.keywords)
    except Exception:
        # Best effort: a page that cannot be processed is simply skipped by
        # the caller. Catching Exception (not a bare except) still lets
        # KeyboardInterrupt and SystemExit stop the notebook.
        return (None, None)

In [None]:
def insertUserURL(user, url):
    """Store a shared URL, with its authors and keywords, in the graph.

    The URL is run through ``parseURL`` first; nothing is written unless both
    the author list and the keyword list come back non-empty.

    Args:
        user: Value bound to the query's ``username`` parameter.
        url: Value bound to the query's ``url`` parameter; also the page
            that is parsed.
    """
    authors, keywords = parseURL(url)
    if not (authors and keywords):
        return
    graphdb.cypher.execute(
        INSERT_USER_URL_QUERY,
        {
            'username': user,
            'url': url,
            'authors': authors,
            'keywords': keywords,
        },
    )

In [None]:
def doWork():
    """Worker loop: take (user, url) tuples off ``q`` and insert them forever.

    Every item taken from the queue is acknowledged with ``q.task_done()``
    even when the insert fails; otherwise a single exception would end the
    thread with the item unacknowledged and ``q.join()`` would never return.
    """
    while True:
        item = q.get()
        try:
            user, url = item
            insertUserURL(user, url)
        except Exception as err:
            # Report and keep the worker alive for the remaining items.
            print('Failed to insert %r: %s' % (item, err))
        finally:
            q.task_done()
        

In [None]:
# number of threads / maximum concurrent requests
concurrent = 200

# init the work queue; the size bound makes q.put() block while the workers
# catch up instead of loading the whole file into memory
q = Queue(concurrent * 2)

# Start the worker pool. The threads are daemons, so the endless doWork loops
# do not keep the interpreter alive.
for _ in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    with open('urls.csv', 'r', newline='') as f:
        for row in csv.reader(f):
            if len(row) < 2:
                # Blank or malformed line: skip it rather than abort the load.
                continue
            # The user name is the last column. Any extra columns come from
            # unquoted commas inside the URL, so they are joined back together.
            user = row[-1]
            url = ','.join(row[:-1])
            q.put((user, url))
    # Wait until every queued pair has been acknowledged by a worker.
    q.join()
except KeyboardInterrupt:
    print('Interrupted before all URLs were processed')
except Exception as err:
    # Keep the cell from crashing, but do not hide what went wrong.
    print('Stopped loading urls.csv: %s' % err)