A simple Python notebook that uses the Twitter Streaming API to collect tweets and save them to a Cloudant database.


In [None]:
# load required libraries
import tweepy
import json
import cloudant
from cloudant.client import Cloudant
from cloudant.document import Document

#Import the necessary methods from tweepy library
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

In [None]:
# Set parmsFile to the complete path of the JSON file that holds all
# required parameters (a sample, example_parms.json, is included).
parmsFile = 'PATH to your parms file including '
parms = ''
with open(parmsFile) as parmFile:
    parms = json.load(parmFile)

# Twitter API credentials read from the parameter file
twitter_consumer_key = parms['twitter_consumer_key']
twitter_consumer_secret = parms['twitter_consumer_secret']
twitter_access_token = parms['twitter_access_token']
twitter_access_token_secret = parms['twitter_access_token_secret']

# Cloudant account details and target database name
cloudant_username = parms['cloudant_username']
cloudant_password = parms['cloudant_password']
cloudant_url = parms['cloudant_url']
cloudant_database_name = parms['cloudant_database_name']

In [None]:
# Cloudant client built from the credentials loaded above.
# NOTE(review): connect=True presumably opens the session at construction
# time -- confirm against the python-cloudant client documentation.
client = Cloudant(
    cloudant_username,
    cloudant_password,
    url=cloudant_url,
    connect=True
)

In [None]:
# Create a Cloudant database with the given name.
# If create_database raises (the expected cause being that the database
# already exists), fall back to fetching a reference to it from the client.
# NOTE(review): the broad except also hides unrelated failures such as bad
# credentials; those will then surface from the lookup below instead.
try:
    tweets_db = client.create_database(cloudant_database_name)
except Exception:
    tweets_db = client[cloudant_database_name]
if tweets_db.exists():
    # Print one pre-formatted string so the output is the same whether
    # print is the Python 2 statement or the Python 3 function.
    print("Database  {0}  ready".format(cloudant_database_name))

## Twitter's Streaming API

For more details on Twitter's Streaming API and the tweet data format, see:
https://dev.twitter.com/overview/api/tweets


In [None]:
# Helper that deletes every occurrence of a substring from a string
def remove_substring(orgstr, substr):
    """Return a copy of ``orgstr`` with all occurrences of ``substr`` removed."""
    return orgstr.replace(substr, "")

In [None]:
# Listener that keeps English tweets from the stream, extracts the fields
# used by this notebook and stores each tweet as a document in Cloudant.
# For reference: http://adilmoujahid.com/posts/2014/07/twitter-analytics/
# https://blog.gnip.com/tag/data-streaming/

class StdOutListener(StreamListener):
    """Store English tweets received from the stream in ``tweets_db``.

    Progress and errors are reported on stdout. The listener relies on the
    module-level ``tweets_db`` database and ``remove_substring`` helper.
    """

    # Entities stripped from the tweet text, in removal order:
    # (key under 'entities', field holding the entity text, prefix the
    # entity carries inside the tweet text). The entity 'text' fields do
    # not include the leading '#' / '$', so the prefix is added back here.
    _ENTITY_MARKERS = (
        ('urls', 'url', ''),
        ('hashtags', 'text', '#'),
        ('symbols', 'text', '$'),
        ('user_mentions', 'screen_name', '@'),
    )

    def _clean_text(self, text, entities):
        """Return ``text`` with URLs, hashtags, symbols and mentions removed.

        ``entities`` is the tweet's 'entities' mapping; entity lists that
        are missing or None are simply skipped.
        """
        cleaned = text
        for key, field, prefix in self._ENTITY_MARKERS:
            for entity in entities.get(key) or []:
                cleaned = remove_substring(cleaned, prefix + entity[field])
        return cleaned

    def on_data(self, data):
        """Handle one raw message from the stream.

        ``data`` is the JSON text of the message. English tweets are reduced
        to the fields of interest and written to ``tweets_db``; everything
        else is skipped. Any exception is printed rather than raised so one
        bad message does not interrupt collection.

        Always returns True.
        NOTE(review): tweepy is understood to stop the stream only when
        on_data returns False -- confirm for the installed tweepy version.
        """
        try:
            # Decode the JSON from Twitter
            datajson = json.loads(data)

            # The stream also carries messages that are not tweets (they
            # have no 'text'); skip them instead of failing on a KeyError.
            if not isinstance(datajson, dict) or 'text' not in datajson:
                return True

            # 'created_at' is kept for storage and for the progress message
            created_at = datajson['created_at']

            # Skip tweets not in English for purposes of this notebook
            msg_lang = datajson['lang']
            if msg_lang != "en":
                return True

            msg_text = datajson['text']
            entities = datajson['entities']
            user = datajson['user']

            # User object with useful user information
            usr = {
                'id': user['id'],
                'screen_name': user['screen_name'],
                'followers_count': user['followers_count'],
                'friends_count': user['friends_count'],
                'favourites_count': user['favourites_count'],
                'statuses_count': user['statuses_count'],
                'location': user['location']
            }

            # All relevant fields for the message object; 'text_clean' is
            # the tweet text with referenced entities stripped out.
            msg = {
                'id': datajson['id'],
                'created_at': created_at,
                'text': msg_text,
                'text_clean': self._clean_text(msg_text, entities),
                'user': usr,
                'lang': msg_lang,
                'place': datajson['place'],
                'favorite_count': datajson['favorite_count'],
                'hashtags': entities.get('hashtags'),
                'urls': entities.get('urls'),
                'symbols': entities.get('symbols'),
                'user_mentions': entities.get('user_mentions'),
                'coordinates': datajson['coordinates']
            }

            print("Tweet collected at " + str(created_at))

            # Add tweet to Cloudant
            tweet = tweets_db.create_document(msg)
            if tweet.exists():
                print('Tweet added')

        except Exception as e:
            # Best effort: report the problem and keep the stream running.
            print(e)

        return True

    def on_error(self, status):
        """Print the status value reported by the stream on an error.

        Returns None, so reconnect/disconnect behaviour is left to tweepy.
        """
        print(status)

In [None]:
# Terms to track; replace the placeholders with the keywords of interest
keywords = ['keyword1', 'keyword2']

# Handle Twitter authentication and the connection to the Twitter Streaming API
l = StdOutListener()
auth = OAuthHandler(twitter_consumer_key, twitter_consumer_secret)
auth.set_access_token(twitter_access_token, twitter_access_token_secret)
stream = Stream(auth, l)

# Start collecting tweets that match the tracked keywords
stream.filter(track=keywords)