In [97]:
# import pandas
import pandas as pd

In [98]:
# read in working CSV file
# note: I could've built my own scraper for this, but to save time, I paid $5 for docteur-tweety.com. 
# Totally lazy, totally worth it.

tweets = pd.read_table('tweetscopy.csv', sep=',')

In [99]:
# check to make sure it read in properly
tweets.head(5)

Unnamed: 0,date,text,retweets,favorites
0,3/18/16 22:55,Everybody should boycott the @megynkelly show....,3517,10453
1,3/18/16 21:41,Thank you Arizona- I love you! #MakeAmericaGre...,3222,9352
2,3/18/16 21:39,Join me tomorrow! #Trump2016 #MakeAmericaGreat...,1742,5283
3,3/18/16 21:37,Mitt Romney is a mixed up man who doesn't have...,2755,9199
4,3/18/16 21:21,Failed Presidential Candidate Mitt Romney was ...,2770,8348


In [100]:
# quick conversion of date format
tweets.date = tweets.date.apply(pd.to_datetime)

In [101]:
# checking to make sure it worked; looks good
tweets.head(5)

Unnamed: 0,date,text,retweets,favorites
0,2016-03-18 22:55:00,Everybody should boycott the @megynkelly show....,3517,10453
1,2016-03-18 21:41:00,Thank you Arizona- I love you! #MakeAmericaGre...,3222,9352
2,2016-03-18 21:39:00,Join me tomorrow! #Trump2016 #MakeAmericaGreat...,1742,5283
3,2016-03-18 21:37:00,Mitt Romney is a mixed up man who doesn't have...,2755,9199
4,2016-03-18 21:21:00,Failed Presidential Candidate Mitt Romney was ...,2770,8348


In [102]:
# some quick analytics
tweets.describe()

Unnamed: 0,retweets,favorites
count,3174.0,3174.0
mean,2193.762445,5515.55419
std,2093.820027,4835.761654
min,322.0,0.0
25%,908.5,2262.25
50%,1489.0,3886.0
75%,2746.0,7080.0
max,25548.0,46501.0


In [103]:
# total tweets in database: 3,174
# max retweets: 25,548
# max favorites: 46,501
# who are these people?!

In [104]:
from pandas.tseries.resample import TimeGrouper
from pandas.tseries.offsets import DateOffset
tweets['date'] = pd.to_datetime(pd.Series(tweets['date']))
tweets.set_index('date', drop=False, inplace=True)
tweets.index = tweets.index.tz_localize('GMT').tz_convert('EST')
tweets.index = tweets.index - DateOffset(hours = 0)
tweets.index

DatetimeIndex(['2016-03-18 17:55:00-05:00', '2016-03-18 16:41:00-05:00',
               '2016-03-18 16:39:00-05:00', '2016-03-18 16:37:00-05:00',
               '2016-03-18 16:21:00-05:00', '2016-03-18 16:18:00-05:00',
               '2016-03-18 16:07:00-05:00', '2016-03-18 16:05:00-05:00',
               '2016-03-18 15:57:00-05:00', '2016-03-18 15:44:00-05:00', 
               ...
               '2015-10-07 13:19:00-05:00', '2015-10-07 12:47:00-05:00',
               '2015-10-07 12:46:00-05:00', '2015-10-07 11:39:00-05:00',
               '2015-10-07 11:34:00-05:00', '2015-10-07 08:51:00-05:00',
               '2015-10-07 00:13:00-05:00', '2015-10-07 00:12:00-05:00',
               '2015-10-07 00:06:00-05:00', '2015-10-06 21:06:00-05:00'],
              dtype='datetime64[ns]', name=u'date', length=3174, freq=None, tz='EST')

In [105]:
tweets.head(5)

Unnamed: 0_level_0,date,text,retweets,favorites
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-03-18 17:55:00-05:00,2016-03-18 22:55:00,Everybody should boycott the @megynkelly show....,3517,10453
2016-03-18 16:41:00-05:00,2016-03-18 21:41:00,Thank you Arizona- I love you! #MakeAmericaGre...,3222,9352
2016-03-18 16:39:00-05:00,2016-03-18 21:39:00,Join me tomorrow! #Trump2016 #MakeAmericaGreat...,1742,5283
2016-03-18 16:37:00-05:00,2016-03-18 21:37:00,Mitt Romney is a mixed up man who doesn't have...,2755,9199
2016-03-18 16:21:00-05:00,2016-03-18 21:21:00,Failed Presidential Candidate Mitt Romney was ...,2770,8348


In [106]:
tweets1t = tweets['date'].resample('750t', how='count')
tweets1t.head(10)

date
2015-10-06 12:30:00-05:00     4
2015-10-07 01:00:00-05:00     6
2015-10-07 13:30:00-05:00    12
2015-10-08 02:00:00-05:00    13
2015-10-08 14:30:00-05:00    11
2015-10-09 03:00:00-05:00     9
2015-10-09 15:30:00-05:00    23
2015-10-10 04:00:00-05:00    10
2015-10-10 16:30:00-05:00     3
2015-10-11 05:00:00-05:00     1
Freq: 750T, Name: date, dtype: int64

In [107]:
# Graph time series of tweets
import vincent
vincent.core.initialize_notebook()
area = vincent.Area(tweets1t)
area.colors(brew='Spectral')
area.display()

In [108]:
# Separate each tweet into separate words (tokens)

import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
stop = stopwords.words('english')
text = tweets['text']

tokens = []
for txt in text.values:
    tokens.extend([t.lower().strip(":,.&- ") for t in txt.split()])

filtered_tokens = [w for w in tokens if w not in stop]



In [109]:
# Run frequency distribution to see most frequently used words.

freq_dist = nltk.FreqDist(filtered_tokens)

freq_dist

FreqDist({'@realdonaldtrump': 696, '': 558, 'trump': 483, 'great': 447, 'thank': 384, '#trump2016': 324, 'new': 224, '#makeamericagreatagain': 201, 'poll': 194, 'america': 192, ...})

In [110]:
# Adding a different method to text analysis

import re
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False

In [157]:
# Note: the numbers produced by this method differ from the numbers produced above by the frequency distribution; 
# this is because the two methods parse text differently.
# This method will count an instance of "Trump" in "#Trump2016", for instance;
# the frequency distribution will only count isolated instances of "trump".
# Note also: I realize there's a much more succinct way of doing this, refactored, with cleaner (more complex) syntax. 
# I'm lazy. I'm sorry.

tweets['America'] = tweets['text'].apply(lambda tweet: word_in_text('america', tweet))
tweets['boycott'] = tweets['text'].apply(lambda tweet: word_in_text('boycott', tweet))
tweets['boring'] = tweets['text'].apply(lambda tweet: word_in_text('boring', tweet))
tweets['Trump'] = tweets['text'].apply(lambda tweet: word_in_text('Trump', tweet))
tweets['veterans'] = tweets['text'].apply(lambda tweet: word_in_text('veterans', tweet))
tweets['wall'] = tweets['text'].apply(lambda tweet: word_in_text('wall', tweet))

print "Times mentioned America:"
print tweets['America'].value_counts()[True]
print 'Times mentioned self:'
print tweets['Trump'].value_counts()[True]
print 'Times mentioned veterans:'
print tweets['veterans'].value_counts()[True]
print 'Times mentioned walls:'
print tweets['wall'].value_counts()[True]
print 'Times wanted to boycott things:'
print tweets['boycott'].value_counts()[True]
print 'Times was bored:'
print tweets['boring'].value_counts()[True]


Times mentioned America:
534
Times mentioned self:
1679
Times mentioned veterans:
17
Times mentioned walls:
48
Times wanted to boycott things:
7
Times was bored:
18


In [158]:
tweets['Romney'] = tweets['text'].apply(lambda tweet: word_in_text('Romney', tweet))
tweets['Megyn'] = tweets['text'].apply(lambda tweet: word_in_text('Megyn', tweet))
tweets['Marco'] = tweets['text'].apply(lambda tweet: word_in_text('Marco', tweet))

print 'Times mentioned Mitt Romney:'
print tweets['Romney'].value_counts()[True]
print 'Times mentioned Megyn Kelly:'
print tweets['Megyn'].value_counts()[True]
print 'Times mentioned Marco Rubio:'
print tweets['Marco'].value_counts()[True]

Times mentioned Mitt Romney:
48
Times mentioned Megyn Kelly:
53
Times mentioned Marco Rubio:
80


In [159]:
# Not *that* in love with America, are we now...
# Also, cute that we care more about walls than we do veterans. #MakeAmericaIngrateAgain

In [160]:
tweets.head(5)

Unnamed: 0_level_0,date,text,retweets,favorites,America,boycott,boring,Trump,veterans,wall,Romney,Megyn,Marco
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-03-18 17:55:00-05:00,2016-03-18 22:55:00,Everybody should boycott the @megynkelly show....,3517,10453,False,True,False,True,False,False,False,True,False
2016-03-18 16:41:00-05:00,2016-03-18 21:41:00,Thank you Arizona- I love you! #MakeAmericaGre...,3222,9352,True,False,False,True,False,False,False,False,False
2016-03-18 16:39:00-05:00,2016-03-18 21:39:00,Join me tomorrow! #Trump2016 #MakeAmericaGreat...,1742,5283,True,False,False,True,False,False,False,False,False
2016-03-18 16:37:00-05:00,2016-03-18 21:37:00,Mitt Romney is a mixed up man who doesn't have...,2755,9199,False,False,False,False,False,False,True,False,False
2016-03-18 16:21:00-05:00,2016-03-18 21:21:00,Failed Presidential Candidate Mitt Romney was ...,2770,8348,False,False,False,False,False,False,True,False,True
