In [91]:
import pandas as pd
from datetime import datetime
from matplotlib import pyplot as plt
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [92]:
%matplotlib notebook

## Data Source

* 2 primary data sets for tweets are csv files found on Kaggle
* 1 secondary daa set for VIX volatility index price is also a csv file found on Kaggle
* All sources are linked on the repo for reference and acknowledgement

## Tweet Data Preparation & Summary






In [136]:
rdt = pd.read_csv('realdonaldtrump.csv')
bo = pd.read_csv('Tweets-BarackObama.csv')

In [98]:
print(f"Obama tweets dataset has {bo['Date'].nunique()} entries and Trump tweets dataset has {rdt['date'].nunique()} entries")

Obama tweets dataset has 6768 entries and Trump tweets dataset has 42979 entries


#### Tweet Dataframes
* Limited to 2013-2019 to make sure the two data sets are aligned in time
* Standardized the column names
* Joined the two data sets

In [99]:
def time_limits(date):
    oldest = datetime.strptime('01/01/13', '%d/%m/%y').date()
    newest = datetime.strptime('01/01/19', '%d/%m/%y').date()
    return ((date <= newest) & (date >= oldest))


def tweets_df(file_name, date_column, date_format, tweet_column):
    data = pd.read_csv(file_name)
    data['Datetime'] = data[date_column].apply(lambda x: datetime.strptime(x[:10], date_format).date() )
    data['Words'] = data[tweet_column].apply(lambda x: x.split(" "))
    data = data[time_limits(data['Datetime'])]
    return data

In [100]:
bo_tweets = tweets_df('Tweets-BarackObama.csv',  'Date', '%Y/%m/%d', 'Tweet-text' )
rdt_tweets = tweets_df('realdonaldtrump.csv', 'date', '%Y-%m-%d',  'content' )

In [103]:
# A consolidated dataframe that includes both Trump and Obama tweets
d = {'Author': [], 'Date': [], 'Text': [], 'Words': [], 'Words': [], 'Retweets': [], 'Likes': []}
tweets = pd.DataFrame(data=d)

d1 = {'Author': 'Obama', 'Date': bo_tweets['Datetime'], 'Text': bo_tweets['Tweet-text'], 
      'Words': bo_tweets['Words'], 'Retweets': bo_tweets['Retweets'], 'Likes': bo_tweets['Likes']}
df1 = pd.DataFrame(data=d1)
d2 = {'Author': 'Trump', 'Date': rdt_tweets['Datetime'], 'Text': rdt_tweets['content'], 
      'Words': rdt_tweets['Words'], 'Retweets': rdt_tweets['retweets'], 'Likes': rdt_tweets['favorites']}
df2 = pd.DataFrame(data=d2)
tweets = tweets.append(df1)
tweets = tweets.append(df2)
tweets

Unnamed: 0,Author,Date,Text,Words,Retweets,Likes
35,Obama,2019-01-01,In 2018 people stepped up and showed up like n...,"[In, 2018, people, stepped, up, and, showed, u...",111380.0,836624.0
36,Obama,2018-12-29,I hope you find inspiration in the stories of ...,"[I, hope, you, find, inspiration, in, the, sto...",11579.0,92906.0
37,Obama,2018-12-29,Leaders like Jonny Boucher a Chicago native wh...,"[Leaders, like, Jonny, Boucher, a, Chicago, na...",19658.0,113752.0
38,Obama,2018-12-29,Leaders like Hong Hoang who mobilized a youth-...,"[Leaders, like, Hong, Hoang, who, mobilized, a...",3791.0,29283.0
39,Obama,2018-12-29,Leaders like Moussa Kondo and Sandor Lederer w...,"[Leaders, like, Moussa, Kondo, and, Sandor, Le...",3841.0,27248.0
...,...,...,...,...,...,...
36357,Trump,2019-01-01,Gas prices are low and expected to go down thi...,"[Gas, prices, are, low, and, expected, to, go,...",26266.0,185018.0
36358,Trump,2019-01-01,Washington Examiner - “MAGA list: 205 ‘histori...,"[Washington, Examiner, -, “MAGA, list:, 205, ‘...",14833.0,70501.0
36359,Trump,2019-01-01,“Kim Jong Un says North Korea will not make or...,"[“Kim, Jong, Un, says, North, Korea, will, not...",21478.0,105630.0
36360,Trump,2019-01-01,Do you think it’s just luck that gas prices ar...,"[Do, you, think, it’s, just, luck, that, gas, ...",26403.0,146789.0


In [104]:
#This function is useful for zooming into specific time periods
def filter_by_dates(tweets= tweets, start = '12/31/12', end ='01/01/19'):
    #Dates
    start_date = datetime.strptime(start, '%m/%d/%y').date()
    end_date = datetime.strptime(end, '%m/%d/%y').date()

    tweets_filtered = tweets[(tweets['Date']>start_date) & (tweets['Date']< end_date)].copy()
    return tweets_filtered

In [105]:
#This function is useful for zooming into specific time periods for each president
def tweets_by_date_and_president(president, start = '12/31/12', end ='01/01/19'):
    tweets_filtered = filter_by_dates(tweets,start,end)
    tweets_filtered = tweets_filtered[tweets_filtered['Author']==president].copy()
    return tweets_filtered

In [109]:
def show_frequency(tweets= tweets, start = '12/31/12', end ='01/01/19'):
    ## Filter the tweets based on the desired time period
    ## If no period is indicated when calling the function, it will default to 2013-2019
    tweets_filtered = filter_by_dates(tweets, start,end).copy()

    # Prepare the appropriate data series of Tweet counts by president & associated dates
    y1 = tweets_filtered[(tweets_filtered['Author']=='Obama' )].groupby(['Date']).Text.count()
    x1 = pd.Series(tweets_filtered[(tweets_filtered['Author']=='Obama')]['Date'].unique())
    
    y2 = tweets_filtered[(tweets_filtered['Author']=='Trump' )].groupby(['Date']).Text.count()
    x2 = pd.Series(tweets_filtered[(tweets_filtered['Author']=='Trump')]['Date'].unique())

    
    #Plot the data
    fig = make_subplots()
    fig.add_trace(
        go.Scatter(x=x1, y=y1, name="Obama")
    )

    fig.add_trace(
        go.Scatter(x=x2, y=y2, name="Trump")
    )

    fig.update_layout(
        title_text="Activity"
    )

    fig.update_yaxes(title_text="# of Daily Tweets")
    fig.update_xaxes(title_text="Time")

    fig.show()

In [112]:
show_frequency()

In [194]:
#Function takes a look at the quick stats of presidents' daily activity for a given time period
def tweet_style(president, start = '12/31/12', end ='01/01/19' ):
    #Filter by the time period and president
    
    tweets = tweets_by_date_and_president(president, start, end)
    
    #Caclulate quick stats
    mx = tweets.groupby(['Date']).Text.count().max()
    mn = tweets.groupby(['Date']).Text.count().min()
    avg = tweets.groupby(['Date']).Text.count().mean()
    sm = tweets.groupby(['Date']).Text.count().sum()
    
    #Organize quick stats in a dataframe for a table display
    d = {'Max': [mx], 'Min': [mn], 'Average': [avg], 'Total': [sm]}
    summary = pd.DataFrame(data=d)
    return summary

In [195]:
tweet_style('Obama', '01/01/15','01/01/16')

Unnamed: 0,Max,Min,Average,Total
0,51,1,4.14557,1310


In [191]:
tweet_style('Trump', '01/01/15','01/01/16')

Unnamed: 0,Max,Min,Average,Total
0,155,1,21.535014,7688


In [137]:
show_frequency(tweets, '01/01/16', '01/01/17')

#### Word Dataframes
* Created two dictionaries as dataframes to give us ideas about the presidents' buzzwords
* Filtered out conjunctions, articles, prepositions, pronouns, conjunctions, and presidents' names for simplicity
* Combed over capitalizations and extra symbols (e.g., both 'president' and '-president' grouped under 'president)

In [138]:
exclude = ['realdonaldtrump', 'trump', 'obama', 'donald', 'barack','barackobama', 'to', 'the', 'a', 'an', 'of', 'for', 'and', 'is', 'in', 'on', 'are', 'that', 'this', 'have', 'at', 'be', 'about', 
           'from', 'it', "it's", 'its', 'than', 'then', 'from', 'with', 'as', 'who', 'do', 'has', 'not', 'by', 'what', '@', '"', 
           '#', '&', '…', '-', 'so', 'but', 'i', 'you', 'he', 'she', 'we', 'they', 'my', 'your', 'her', 'his', 'our', 'their',
          'yes', 'no', '.', 'should', 'could', 'would']

In [139]:
## Function to display most used words for each president
def words_df(president,start, end):
    ## Filter the tweets based on the desired time period
    ## If no period is indicated when calling the function, it will default to 2013-2019
    tweets_filtered = tweets_by_date_and_president(president, start,end).copy()
    
    ##Create a consolidated list of words
    words_list = []
    for i in range (0, len(tweets_filtered)):
        words_list += tweets_filtered['Words'].iloc[i]
    
    ##Use the list to create a dataframe
    words_df = pd.DataFrame({'Word': words_list})
    words_df['word'] = words_df['Word'].apply(lambda x: x.lower())
    words_df['Exclude'] = words_df['word'].apply(lambda x: x in exclude)
    words_df = words_df[words_df['Exclude'] == False]
    words_df['word'] = words_df['word'].apply(lambda x: x.replace('—', ''))
    words_df = words_df[words_df['Exclude'] == False]
    return words_df

In [140]:
words_df('Trump', '01/01/17', '01/01/18' )['word'].value_counts()[0:15]

will      475
great     458
was       221
all       187
very      172
fake      169
just      161
thank     159
news      144
people    144
big       142
tax       134
u.s.      117
many      110
now       109
Name: word, dtype: int64

In [141]:
words_df('Obama', '01/01/16', '01/01/17' )['word'].value_counts()[0:15]

president        237
senate           130
judge             93
supreme           86
#doyourjob        78
fair              70
court             68
leaders           68
climate           67
garland           64
#actonclimate     52
more              50
if                48
up                44
hearing           43
Name: word, dtype: int64

In [142]:
#Function displays the daily frequency of the presidents using a specific word

def show_word_frequency(word, start = '12/31/12', end ='01/01/19'):
    ## Filter the tweets based on the desired time period
    ## If no period is indicated when calling the function, it will default to 2013-2019
    tweets_filtered = filter_by_dates(tweets, start, end).copy()
    tweets_filtered['Count'] = tweets_filtered['Text'].apply(lambda x: word in x)
    tweets_filtered = tweets_filtered[tweets_filtered['Count'] == True]
    
    # Prepare the appropriate data series of Tweet counts by president & associated dates
    y1 = tweets_filtered[(tweets_filtered['Author']=='Obama' )].groupby(['Date']).Text.count()
    x1 = pd.Series(tweets_filtered[(tweets_filtered['Author']=='Obama')]['Date'].unique())
    
    y2 = tweets_filtered[(tweets_filtered['Author']=='Trump' )].groupby(['Date']).Text.count()
    x2 = pd.Series(tweets_filtered[(tweets_filtered['Author']=='Trump')]['Date'].unique())

    
    #PLOT
    fig = make_subplots()
    fig.add_trace(
        go.Scatter(x=x1, y=y1, name="Obama")
    )

    fig.add_trace(
        go.Scatter(x=x2, y=y2, name="Trump")
    )

    fig.update_layout(
        title_text="Word Frequency"
    )

    fig.update_yaxes(title_text="# of Times the Word is Used")
    fig.update_xaxes(title_text="Time")

    fig.show()

In [143]:
show_word_frequency('great')

In [144]:
show_word_frequency('climate')

## Economic Data Preparation & Summary

In [145]:
vix = pd.read_csv('Jan20_vixcurrent_Jan20.csv', skiprows=[0])
vix['Date'] = vix['Date'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').date())

In [146]:
vix = filter_by_dates(vix, start = '12/31/12', end ='01/01/19')
vix

Unnamed: 0,Date,VIX Open,VIX High,VIX Low,VIX Close
2265,2013-01-02,15.24,15.93,14.60,14.68
2266,2013-01-03,14.77,14.92,14.24,14.56
2267,2013-01-04,14.23,14.31,13.64,13.83
2268,2013-01-07,14.53,14.53,13.71,13.79
2269,2013-01-08,13.88,14.29,13.62,13.62
...,...,...,...,...,...
3532,2018-01-12,9.74,10.31,9.54,10.16
3533,2018-01-16,10.42,12.41,10.40,11.66
3534,2018-01-17,11.35,12.81,11.18,11.91
3535,2018-01-18,12.01,12.40,11.62,12.22


In [147]:
def words_stocks(word, start = '12/31/12', end ='01/01/19'):
    #Filter
    tweets_filtered = filter_by_dates(tweets, start, end).copy()
    tweets_filtered['Count'] = tweets_filtered['Text'].apply(lambda x: word in x)
    tweets_filtered = tweets_filtered[tweets_filtered['Count'] == True]
    vix_filtered = filter_by_dates(vix, start, end)
    
    #Organize
    y1 = tweets_filtered[(tweets_filtered['Author']=='Obama' )].groupby(['Date']).Text.count()
    x1 = pd.Series(tweets_filtered[(tweets_filtered['Author']=='Obama')]['Date'].unique())
    
    y2 = tweets_filtered[(tweets_filtered['Author']=='Trump' )].groupby(['Date']).Text.count()
    x2 = pd.Series(tweets_filtered[(tweets_filtered['Author']=='Trump')]['Date'].unique())
    
    x3 = vix_filtered['Date']
    y3 = vix_filtered['VIX Close']


    
    #PLOT
    fig = make_subplots()
    fig.add_trace(
        go.Scatter(x=x1, y=y1, name="Obama")
    );

    fig.add_trace(
        go.Scatter(x=x2, y=y2, name="Trump")
    );
    
    fig.add_trace(
        go.Scatter(x=x3, y=y3, name="Vix")
    );
        

    fig.update_layout(
        title_text="Activity"
    )

    fig.update_yaxes(title_text="# of Daily Tweets")
    fig.update_xaxes(title_text="Time")

    fig.show()

In [148]:
words_stocks('tax', '01/01/15','01/01/16' )

In [149]:
words_stocks('climate', '01/01/15','01/01/16' )