In [4]:
import tweepy
from tweepy import *
from tweepy.auth import OAuthHandler
from tweepy.streaming import StreamListener
 
import numpy as np
import pandas as pd

from datetime import datetime, timedelta
from textblob import TextBlob

from googlemaps import *

import re
import csv

import matplotlib.pyplot as plt
import json
plt.close('all')

In [None]:
### Setting up the API authentication.


class Authenticator():
    # This class will handle authentication for the twitter API.
    
    def auth(self):
        auth = OAuthHandler(twitter_auth.consumer_key, twitter_auth.consumer_secret)
        auth.set_access_token(twitter_auth.access_token, twitter_auth.access_token_secret)
        return auth
    
class MyT():
    def __init__(self, user=None):
        # Iniatiate the user, auth, and my_t or my twitter is the API pointer
        
        self.user = user
        self.auth = Authenticator().auth()
        self.my_t = API(self.auth)

    def user_timeline_head(self, n):
        # Like pandas, returns n tweets from the top of the users home timeline
        
        tweets = []
        for tweet in Cursor(self.my_t.user_timeline, id=self.user).items(n):
            tweets.append(tweet)
        return tweets

    def get_my_friends(self, n):
        # Get n number of friends of myt.
        
        friend_list = []
        for friend in Cursor(self.my_t.friends, id=self.user).items(n):
            friend_list.append(friend)
        return friend_list

    def home_timeline_head(self, n):
        # Like pandas, returns n tweets from the top of the users home timeline
        
        timeline = []
        for tweet in Cursor(self.my_t.home_timeline, id=self.user).items(n):
            timeline.append(tweet)
        return timeline
    
    def get_my_t_api(self):
        # Gets the twiter api pointer
        
        return self.my_t

class TwitterStreamer():
    
    def stream_tweets(self, fetched_tweets_filename, search_hashtags):
        # A function that will stream and process tweets given hashtag parameter
        
        listener = StdOutListener(fetched_tweets_filename)
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        
        stream = Stream(auth, listener)
        
        stream.filter(track=search_hashtags)

class StdOutListener(StreamListener):
    # Basic listener class that will handle good data and errors
    
    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename
    
    def on_data(self, data):
        try:
            print(data)
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except BaseException as e:
            print("Error on_data: %s" % str(e))
        return True
    
    def on_error(self, status):
        print(status)
        
if __name__ == "__main__": 
    
    hash_tag_list = ['donald trump']
    fetched_tweets_filename = 'tweets.json'
    my_twitter = MyT('POTUS')
    print(my_twitter.user_timeline_head(5).filter(track=hash_tag_list))

In [None]:
class TweetAnalyzer():
    # This class will contain basic tweet analyzation tools 
    def arrTweetToDf(self, tweets):
        # covert tweet to a dataFrame so we can take full advantage of pandas and the numpy liberaries
        # neat
        
        df = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])

        df['id'] = np.array([tweet.id for tweet in tweets])
        df['len'] = np.array([len(tweet.text) for tweet in tweets])
        df['date'] = np.array([tweet.created_at for tweet in tweets])
        df['source'] = np.array([tweet.source for tweet in tweets])
        df['likes'] = np.array([tweet.favorite_count for tweet in tweets])
        df['retweets'] = np.array([tweet.retweet_count for tweet in tweets])

        return df

In [None]:
myt = MyT()
tanal = TweetAnalyzer()
api = myt.get_my_t_api()

tweets = api.home_timeline(20)

df = tanal.arrTweetToDf(tweets)

In [None]:
# Steps, collect 
#
#
#



In [None]:
lookback = 5
last_day = datetime.now()
start_date = last_day - timedelta(days = lookback)
end_date = last_day
day = timedelta(days = 1)
search_words = "quarentine -filter:retweets"


tweets = tweepy.Cursor(api.search,
                  q=search_words,
                  lang="en",
                  since=str(start_date)[:10],
                  until=str(start_date + day + day)[:10]).items(3)

users_locs = [[
        tweet.user.screen_name,
        tweet.user.location,
        tweet.created_at,
        tweet.favorite_count] for tweet in tweets]

df = pd.DataFrame(data=users_locs, columns=['user', "location", "created", "likes"])
df

In [None]:
search_words = "#quarentine -filter:retweets"

#tweets per day in range
resolution = 50

# Collect tweets
def get_tweet_df(date_since, date_until):
    tweets = tweepy.Cursor(api.search,
                  q=search_words,
                  lang="en",
                  since=date_since,
                  until=date_until).items(300)

    users_locs = [[
        tweet.user.screen_name,
        tweet.user.location,
        tweet.created_at,
        tweet.favorite_count] for tweet in tweets]

    df = pd.DataFrame(data=users_locs, columns=['user', "location", "created", "likes"])

    return df

def get_tweet_range_df(lookback = 90, last_day = datetime.now(), resolution = 1):
    # lookback = days to look back from
    # last day = end of looking period, set to today
    # resultion = tweets per day that will come up
    
    start_date = last_day - timedelta(days = lookback)
    end_date = last_day
    day = timedelta(days = 1)
    
    df = pd.DataFrame()
    
    days = 0
    
    while start_date < end_date:
        days += 1
        
        #print(str(start_date)[:10])
        tweets = tweepy.Cursor(api.search,
                      q=search_words,
                      lang="en",
                      since=str(start_date)[:10],
                      until=str(start_date + day)[:10]).items(resolution)

        start_date += day
        
        users_locs = [[
            tweet.text,
            tweet.user.screen_name,
            tweet.user.location,
            tweet.created_at,
            tweet.favorite_count] for tweet in tweets]

        dfAppend = pd.DataFrame(data=users_locs, columns=['tweet', 'user', "location", "created", "likes"])

        #print(dfAppend)
        
        df = df.append(dfAppend, ignore_index=True)
        
    #print(days)
    return df

df = get_tweet_range_df(lookback = 10, resolution = 500)

In [107]:
gmaps = Client(key='AIzaSyCdBA39xq1V7E7olkINdWijGe7bRX9UZkg')
geo = gmaps.geocode('Central Canada')

In [232]:
lats = []
lons = []

count = 0
for item in geo[0]['geometry']['bounds']:
    for latlon in geo[0]['geometry']['bounds'][item]:
        if count%2 == 0:
            lats.append(geo[0]['geometry']['bounds'][item][latlon])
        else:
            lons.append(geo[0]['geometry']['bounds'][item][latlon])
        count += 1
        
lat = sum(lats)/len(lats)
lon = sum(lons)/len(lons)

return [round(lat, 3), round(lon, 3)]

62.657 -95.989


In [104]:
class TwitterSentimentAnalysis():
    # This class will handle everything that is part getting a sentiment analysis
    
    def __init__(self):
        
        consumer_key = "3LntKM9D0jXZbKhE9G0ek63Ar"
        consumer_secret = "QNXlkRAUrCRiiEZacnSEnRW4Oeze3h5romq0YG48IPsb62BuoA"

        access_token = "1252051453474111491-MLxtYXJAicSFnWI0pfJiFxOd6OhICJ"
        access_token_secret = "nMjIIjMyqsMxnRUlpiOGCGOPsWCuRgByNTH3E5LS1AAS0"
        
        self.search_words = "#quarentine -filter:retweets"
        self.today = datetime.now()
        
        try:
            # handle authentication and possible issues
            
            self.auth = OAuthHandler(consumer_key, consumer_secret)
            self.auth.set_access_token(access_token, access_token_secret)
            self.api = API(self.auth, wait_on_rate_limit=True)
            
        except:
            print("Twitter Authentication Failed! Check API")
            raise Exception()
            
        try:
            self.gmaps = Client(key='AIzaSyCdBA39xq1V7E7olkINdWijGe7bRX9UZkg')
        except:
            
            print("GMAPS Authentication Failed! Check API")
            raise Exception()
    
    def parse_tweet(self, tweet):
        # A regular expression, to clean tweets
        # This will ultimatly remove links or any special character that may interfere with NLP
        ### REMEMBER: we want a sentiment of the text, not anything linked to it
        
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
    

    def tweet_sentiment_score(self, tweet):
        # This will score the tweets sentiment based on textblob's sentiment method
        # more information can be found out about textblob in the documentation
        # link here: https://textblob.readthedocs.io/en/dev/index.html
        
        # TextBlob object <-- a
        a = TextBlob(self.parse_tweet(tweet))
        
        '''
        #for debugging
        
        if a.sentiment.polarity > 0:
            print('positive')
        elif a.sentiment.polarity == 0:
            print('neutral')
        else:
            print('negative')
        '''
            
        return a.sentiment.polarity
    
    def get_df_sentiments(self, dataframe):

        df = dataframe.copy()

        try:        
            df['sentiment'] = [self.tweet_sentiment_score(tweet) for tweet in df['tweet']]
        except:
            print("error collection sentiment (is dataframe intitialized?)")
            raise Exception()

        return df
    
    def get_tweet_range_df(self, lookback, tweets_per_day, senti=True, cleanloc=True, getCentroid=True):
        # lookback = days to look back from
        # last day = end of looking period, set to today
        # resultion = tweets per day that will come up
        df = pd.DataFrame()
        
        ed = self.today
        sd = self.today - timedelta(days=lookback)
        day = timedelta(days = 1)

        days = 0
        
        # This will get us a 'random' sample of tweets each day from all across the globe
        # CONSTRAINT: MUST BE ENGLISH AND HAVING TO DO WITH QUARENTINE
        while sd < ed:
            days += 1
    
            #print(str(start_date)[:10])
            tweets = tweepy.Cursor(self.api.search,
                          q=self.search_words,
                          lang="en",
                          since=str(sd)[:10],
                          until=str(sd + day)[:10]).items(tweets_per_day)

            sd += day

            users_locs = [[
                tweet.text,
                tweet.user.screen_name,
                tweet.user.location,
                tweet.created_at,
                tweet.favorite_count] for tweet in tweets]

            dfAppend = pd.DataFrame(data=users_locs, columns=['tweet', 'user', "location", "created", "likes"])

            #print(dfAppend)

            df = df.append(dfAppend, ignore_index=True)
        #print(days)
        
        if senti:
            # add sentiment to our dataframe
            
            df = self.get_df_sentiments(df)
            
        #country_list = []
        lats = []
        lons = []
            
        if cleanloc:
            # This condition will get a lat/lon pair for the location proiveded if google maps can process it
            # if not it will remain Nan
            c = 0
            #coordSet = False
            for loc in df['location']:
                #coordSet = False
                c += 1
                #print("itter")
                    #we can use the tweet parser to also parse the addresses
                 #   print(loc)

                if loc == '': 
                    #country_list.append(np.nan)
                    lats.append(np.nan)
                    #print("adding nan to lat, itter: " + str(c))
                    lons.append(np.nan)
                    continue
                    
                gmapsRETURN = self.gmaps.geocode(loc)

                la = []
                lo = []

                count = 0

                try:
                    #x = gmapsRETURN[0]['address_components']
                    y = gmapsRETURN[0]['geometry']['bounds']
                    
                except:
                    #print("adding nan to lat, itter: " + str(c))
                    lats.append(np.nan)
                    lons.append(np.nan)
                    #country_list.append(np.nan)
                    #print('nanE')
                    continue
                
                '''   
                for item in gmapsRETURN[0]['address_components']:
                    if item['types'] == ['country', 'political']:
                        print(item['long_name'])
                        country_list.append(item['long_name'])
                '''
                for item in gmapsRETURN[0]['geometry']['bounds']:
                    for latlon in gmapsRETURN[0]['geometry']['bounds'][item]:
                        if count%2 == 0:
                            la.append(gmapsRETURN[0]['geometry']['bounds'][item][latlon])
                        else:
                            lo.append(gmapsRETURN[0]['geometry']['bounds'][item][latlon])
                        count += 1

                la = round(sum(la)/len(la), 3)
                lo = round(sum(lo)/len(lo), 3)

                #print("adding "+ str(la) +" to lat, itter: " + str(c))
                lats.append(la)
                lons.append(lo)
                continue

        #print(len(df))
        #print(len(lats), len(lons))
        #df['country'] = country_list
        df['centroid_lats'] = lats
        df['centroid_lons'] = lons
        
        return df
    
    def update(self, df, filepath='csvFiles/tweets.csv'):
        return df.to_csv(filepath, index=False)
    
    def read_csv(self, fname='csvFiles/tweets.csv'):
        df = pd.read_csv(fname)
        df['created'] = pd.to_datetime(df['created'])
        return df
    
    def get_by_date(self, df, lookback):
        firstd = self.today - timedelta(days=lookback)
        df = df[(df['created']>firstd) & (df['created']<self.today)]
        df = df.resample('D', on='created').mean()
        del df['likes']
        
        return df
        
    def get_by_country(self, df):
        df = df[df['country'].notna()]
        df = df.reset_index(drop=True)
        df = df.groupby('country').agg({'sentiment':'mean'}).reset_index()
        
        return df
    
    def gets_dates_as_list(self, df):
        return [str(item)[:10] for item in df.index.to_series()]
    
    def get_lat_lons_and_senti(self, df, lookback):
        df = df[['sentiment', 'centroid_lats', 'centroid_lons', 'created']]
        df = df[df['centroid_lats'].notna()]
        df = df.reset_index(drop=True)
        df['heatmap_weight'] = [(2.5*senti+2.5) for senti in df['sentiment']]
        
        firstd = self.today - timedelta(days=lookback)
        df = df[(df['created']>firstd) & (df['created']<self.today)]
        df = df[['sentiment', 'centroid_lats', 'centroid_lons', 'created']]
        return df

    def gets_sentiment_as_list(self, df):
        return [item for item in df['sentiment']]

In [105]:
a = TwitterSentimentAnalysis()
df = a.get_tweet_range_df(7, 5)

In [113]:
a.get_by_date(df, 8)

Unnamed: 0_level_0,sentiment,centroid_lats,centroid_lons
created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-05-27,0.078929,14.031,1.646667
2020-05-28,-0.097576,-14.409,-51.317
2020-05-29,0.166667,29.7205,-51.4795
2020-05-30,0.293636,33.55025,-94.26725
2020-05-31,0.1055,-5.4625,-112.5165
2020-06-01,0.183571,26.64,-89.18525
2020-06-02,0.106667,9.8795,-81.6235


In [72]:
df.to_csv('tweets.csv', index=False)

In [103]:
a.get_by_date(df, 10)

TypeError: '>' not supported between instances of 'str' and 'datetime.datetime'

In [102]:
dfc = df.copy()
dfc = dfc.resample('D', on='created').mean()
dfc

TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'

In [147]:
dfc

Unnamed: 0_level_0,sentiment
created,Unnamed: 1_level_1
2020-05-21,0.094861
2020-05-22,0.094624
2020-05-23,0.07154
2020-05-24,0.044306
2020-05-25,0.159545
2020-05-26,0.126188
2020-05-27,0.131134


In [176]:
df

Unnamed: 0,tweet,user,location,created,likes,sentiment,country
0,"Stories vibe, for more.\n🌪️😈\n\n.\n#quarantine...",vivilimasantana,EARTH!!,2020-05-21 23:28:22,0,0.65,United States
1,Help us identify views on #COVID19. If you're ...,covidprisk,Central New York,2020-05-21 23:01:00,1,0.0,United States
2,Quote of the day\n\nPlease follow us on-\nYout...,Learnado8,"Dhaka, Bangladesh",2020-05-22 22:45:50,0,0.0,Bangladesh
3,"Coronavirus Chronicles... you have the power, ...",guayilandia,washington dc,2020-05-22 22:15:33,0,0.25,United States
4,How yall coming out of Quarentine cuz this how...,ApolloSadeek,"los angeles, CA",2020-05-22 22:02:04,1,0.0,United States
5,look and smile\n.\n.\n.\n.\n.\n.\n.\n.\n.\n#in...,ingridcorrea03,Brasil,2020-05-23 23:28:12,1,0.144444,Brazil
6,The weather : __/\/\/\/\__\n\nMe : ? ? ? ?\n#T...,heavenlySkyee,"Montgomery, AL",2020-05-23 23:16:55,0,-0.5,United States
7,Therapy session scene. Short film I Just Look ...,JeffersonStil,"São Paulo, Brasil",2020-05-24 23:43:04,0,0.175,Brazil
8,https://t.co/eXR0yu1D0T\n\nSLVDE - OFF BALANCE...,OnliShak,"London, England",2020-05-24 23:25:23,0,0.0,United Kingdom
9,Girl from the North Country - 🎶🎸bobdylan &amp;...,LeninR22,"Quito, Ecuador",2020-05-24 23:23:10,1,0.0,Ecuador


In [None]:
str(datetime.now())[:10]

In [None]:
str(datetime.now() - timedelta(days = 90))[:10]

In [None]:
for i in range(15):
    print(df.iloc[i]['Tweets'])