In [1]:
# Leslie Huang
# Network Analysis

from datetime import datetime
import json
import math
import matplotlib.pyplot as plt
import nltk
import numpy as np
from odo import odo, discover, resource
import pandas as pd
import pytz
from sklearn import linear_model
import statsmodels.formula.api as sm
from statsmodels.iolib.summary2 import summary_col
import time

In [2]:
# Load the tweets addressed by the MongoDB URI below into a pandas DataFrame via odo.
# The dshape string declares the fields (and their types) that are read.
data = odo("mongodb://localhost/my_mongo::wm_tweets", 
            pd.DataFrame, 
            dshape = "var * {id_str: string, created_at: string, lang: string, user_statuses_count: int32, user_friends_count: int32, text: string, user_verified: bool, user_followers_count: int32, num_retweets: int32, hashtags: var * string, includes_url: bool, user_created_at: string,user_id_str: string, is_a_retweet: bool, includes_media: bool}")

# id_str is the tweet ID; user_id_str is the ID of the account that posted it
# MongoDB's _id field is not listed in the dshape (the intent is to leave it out of the frame)
# Note: If tweet is a retweet:
# user_id_str, user_statuses_count, user_friends_count, user_followers_count will be for the user retweeting (NOT the original user)
# id_str will be for the ORIGINAL tweet

In [3]:
# Exclude low-activity accounts: keep a tweet only when its account has MORE than
# 5 followers AND MORE than 5 statuses (accounts with exactly 5 are dropped too).
# Account age is not checked in this cell.

has_enough_followers = data["user_followers_count"] > 5
has_enough_statuses = data["user_statuses_count"] > 5
data = data[has_enough_followers & has_enough_statuses]

In [4]:
# Convert the timestamp strings in both time columns to timezone-aware UTC datetimes.

time_columns = ["created_at", "user_created_at"]

# The offset is the literal text "+0000" in the format, so the parsed values carry
# no timezone of their own; utc=True marks them as UTC.
timestamp_format = '%a %b %d %H:%M:%S +0000 %Y'

for column in time_columns:
    # Vectorised parse of the whole column instead of one datetime.strptime call per row.
    data[column] = pd.to_datetime(data[column], format = timestamp_format, utc = True)

In [5]:
# Exclude recently created accounts (roughly the week before the march):
# keep only rows whose account was created strictly before 2017-01-13 00:00:00 UTC.

cutoff_date = datetime(2017, 1, 13, tzinfo = pytz.UTC)

created_before_cutoff = data["user_created_at"] < cutoff_date
data = data[created_before_cutoff]

In [28]:
def _lowercase_tags(tags):
    """Return a new list holding each hashtag in `tags` converted to lower case."""
    return [tag.lower() for tag in tags]


# Lower-cased copy of each tweet's hashtag list, for case-insensitive counting.
data["hashtags_lc"] = data["hashtags"].apply(_lowercase_tags)

In [6]:
# Number of hashtags attached to each tweet (length of its hashtag list).
data["num_hashtags"] = data["hashtags"].map(len)

In [7]:
columns_to_log = ["user_followers_count", "num_retweets", "user_statuses_count", "num_hashtags"]

# Smoothing: take log(1 + x) of each count column, so that zero counts are
# defined (log(0) is not) and map to 0.

for column in columns_to_log:
    new_col_name = "{}_log".format(column)
    # np.log1p works on the whole column at once (no per-row Python call) and
    # computes in floating point, so the "+ 1" cannot overflow the int32 columns.
    data[new_col_name] = np.log1p(data[column])

In [8]:
# Do not drop rows that appear to be duplicates

# data = data.drop_duplicates(subset = ['id_str', 'created_at', 'lang', 'statuses_count', 'friends_count', 'text', 'in_reply_to_status_id', 'verified', 'followers_count', 'num_retweets', 'followers_count_log', 'num_retweets_log', 'statuses_count_log'])

In [9]:
# Sanity check: list every column's dtype after the conversions and derived columns above.
print(data.dtypes)

id_str                                   object
created_at                  datetime64[ns, UTC]
lang                                     object
user_statuses_count                       int32
user_friends_count                        int32
text                                     object
user_verified                              bool
user_followers_count                      int32
num_retweets                              int32
hashtags                                 object
includes_url                               bool
user_created_at             datetime64[ns, UTC]
user_id_str                              object
is_a_retweet                               bool
includes_media                             bool
num_hashtags                              int64
user_followers_count_log                float64
num_retweets_log                        float64
user_statuses_count_log                 float64
num_hashtags_log                        float64
dtype: object


In [10]:
# Print the column names as a plain array.
print(data.columns.values)

['id_str' 'created_at' 'lang' 'user_statuses_count' 'user_friends_count'
 'text' 'user_verified' 'user_followers_count' 'num_retweets' 'hashtags'
 'includes_url' 'user_created_at' 'user_id_str' 'is_a_retweet'
 'includes_media' 'num_hashtags' 'user_followers_count_log'
 'num_retweets_log' 'user_statuses_count_log' 'num_hashtags_log']


In [11]:
# Show the last five rows as a spot check of the cleaned frame.
data.tail()

Unnamed: 0,id_str,created_at,lang,user_statuses_count,user_friends_count,text,user_verified,user_followers_count,num_retweets,hashtags,includes_url,user_created_at,user_id_str,is_a_retweet,includes_media,num_hashtags,user_followers_count_log,num_retweets_log,user_statuses_count_log,num_hashtags_log
9553807,823394839303778304,2017-01-23 05:00:02+00:00,en,987,229,Steve Buscemi poses with a man who turned his ...,False,101,3605,[WomensMarch],False,2015-05-24 19:12:32+00:00,3296981266,True,True,1,4.624973,8.190354,6.895683,0.693147
9553808,823394838909288448,2017-01-23 05:00:02+00:00,en,41837,155,RT @sydnerain: Alright. Here is one indigenous...,False,100,0,[WomensMarch],False,2014-08-18 02:30:50+00:00,2741039840,False,False,1,4.615121,0.0,10.64156,0.693147
9553809,823394839496654848,2017-01-23 05:00:02+00:00,en,948,127,RT @highlightcult: Rihanna is marching at the ...,False,177,0,[WomensMarch],False,2011-07-16 02:19:42+00:00,336302350,False,True,1,5.181784,0.0,6.855409,0.693147
9553810,823394838909288448,2017-01-23 05:00:02+00:00,en,41837,155,RT @sydnerain: Alright. Here is one indigenous...,False,100,0,[WomensMarch],False,2014-08-18 02:30:50+00:00,2741039840,False,False,1,4.615121,0.0,10.64156,0.693147
9553811,823394839496654848,2017-01-23 05:00:02+00:00,en,948,127,RT @highlightcult: Rihanna is marching at the ...,False,177,0,[WomensMarch],False,2011-07-16 02:19:42+00:00,336302350,False,True,1,5.181784,0.0,6.855409,0.693147


In [12]:
# for column in data.columns.values.tolist():
#     print(data[column].describe())

In [13]:
# Distribution of hashtags per tweet: how many tweets carry 1, 2, 3, ... hashtags.
data["num_hashtags"].value_counts()

1     7183244
2     1458366
3      434964
4      151088
5       67297
6       34229
7       30283
8        8275
9        2771
10       1863
11        416
12        192
13         95
15         72
14         24
16          6
20          6
19          2
17          2
Name: num_hashtags, dtype: int64

In [29]:
# Find out the most popular hashtags (case insensitive)
# Flatten the per-tweet lists of lower-cased hashtags into one long Series with a
# single entry per hashtag occurrence, in a single pass over the column.
all_hashtags = pd.Series([tag for tag_list in data["hashtags_lc"] for tag in tag_list])

In [30]:
# Count occurrences of each hashtag; value_counts() orders the result from most to least frequent.
hashtag_freq = all_hashtags.value_counts()

In [32]:
# Most frequent hashtags, ranks 1-50. The slice end is exclusive, so 0:50 is
# needed to include the 50th entry; .iloc makes the positional slicing explicit.
hashtag_freq.iloc[0:50]

womensmarch                   8938327
womensmarchonwashington        451702
whyimarch                      205358
womensrights                   139814
icantkeepquiet                  88776
womensmarchlondon               69135
theresistance                   66389
trump                           66211
inauguration                    61877
womensmarchla                   56446
womensmarchnyc                  46220
maga                            30169
resist                          28664
themarchcontinues               28619
nastywoman                      27630
sobadevenintrovertsarehere      26981
womensmarchatx                  22953
womensmarchonaustin             22341
spicerfacts                     21312
notmypresident                  21068
equality                        20564
amjoy                           20413
hijab                           19946
lovetrumpshate                  19324
madonna                         17940
whywemarch                      17091
trumpleaks  

In [33]:
# Hashtags ranked 51-100. The slice end is exclusive, so 50:100 is needed to
# include the 100th entry; .iloc makes the positional slicing explicit.
hashtag_freq.iloc[50:100]

marchonwashington        8435
marchonmain              8412
solidarity               8243
nyc                      8203
bestfanarmy              8196
arabamerican             8133
palestinianamerican      8101
feministtodolist         7950
model                    7765
nataliaborges            7739
suit                     7739
malanbreton              7739
leahsnow                 7739
shanelavancher           7739
세계여성_공동행진_서울             7687
iwillgoout               7520
blacklivesmatter         7495
sundance                 7488
resisttrump              7314
hearourvoice             7211
presidenttrump           7170
wmnyc                    7155
westandunited            7114
nycwomensmarch           7108
paris                    7052
wmla                     6995
edinburgh                6808
losangeles               6761
wmwcanada                6705
sistermarch              6654
inaugurationday          6624
hh4pp                    6522
sundaymorning            6431
alaska    

In [37]:
# Hashtags ranked 101-150. The slice end is exclusive, so 100:150 is needed to
# include the 150th entry; .iloc makes the positional slicing explicit.
hashtag_freq.iloc[100:150]

feminist                         5064
love                             5049
breaking                         5021
prolife                          4954
lovatics                         4855
womensmarchglobal                4830
cnn                              4816
equalitynow                      4814
womensmarchparis                 4766
protest                          4751
boswomensmarch                   4729
nbc4dc                           4607
heretostay                       4539
msnbc                            4511
stopenslavingsaudiwomen          4492
womensmarchto                    4464
bychubbz                         4456
goddesses                        4436
unified                          4420
everything                       4410
fairbanks                        4407
imwithher                        4394
atlantamarch                     4350
wethepeople                      4328
mybodymychoice                   4287
womanpower                       4254
trumppreside

In [38]:
# Hashtags ranked 151-200. The slice end is exclusive, so 150:200 is needed to
# include the 200th entry; .iloc makes the positional slicing explicit.
hashtag_freq.iloc[150:200]

maddow                  3513
boston                  3502
tupacshakur             3495
wmw                     3463
la                      3424
trumpinauguration       3412
oakland                 3411
saudiarabia             3371
seanspicer              3311
hillaryclinton          3238
riseup                  3223
notmymarch              3180
jointheresistance       3138
1u                      3133
islam                   3085
peace                   3084
video                   3072
it                      3070
berlin                  3057
uscapitol               3052
washingtondc            3007
loveislove              2992
cnnsotu                 2971
trump45                 2946
dublin                  2923
whyimarched             2897
bostonwomensmarch       2882
cair                    2827
meetthepress            2824
hillary                 2821
snapchat                2817
sophiecruz              2815
indivisible             2798
p2                      2783
draintheswamp 

In [39]:
# Hashtags ranked 201-250. The slice end is exclusive, so 200:250 is needed to
# include the 250th entry; .iloc makes the positional slicing explicit.
hashtag_freq.iloc[200:250]

rt                         2439
pussypower                 2386
lgbtq                      2368
ashleyjudd                 2359
kellyanne                  2342
trumpprotest               2303
respect                    2300
womensmarchams             2271
bridgesnotwalls            2257
wearehere                  2256
protesttrump               2242
thefutureisfemale          2236
addhername                 2233
photos                     2217
womensmarch2017            2211
unity                      2208
supergirl                  2206
stillwithher               2201
womensmarchrome            2195
womensmarchphiladelphia    2193
obama                      2191
womanfortrump              2191
wmnewzealand               2190
vancouver                  2168
readytorun                 2167
marchedesfemmes            2148
stateofwomen               2146
philly                     2143
uniteblue                  2141
womensmarchdenver          2114
mexican                    2106
wmata   

In [35]:
# (rows, columns) of the frame after filtering and adding derived columns.
# NOTE(review): the recorded output reports 21 columns, one more than the dtypes
# listing printed earlier -- presumably because hashtags_lc was added (cell In [28])
# after that listing was produced; confirm with a fresh Restart & Run All.
data.shape

(9373195, 21)

In [17]:
# Ordinary least squares: regress the raw retweet count on the raw number of hashtags.
retweet_model = sm.ols(formula = "num_retweets ~ num_hashtags", data = data)
result = retweet_model.fit()

# Display the full regression table as the cell output.
result.summary()

0,1,2,3
Dep. Variable:,num_retweets,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,6609.0
Date:,"Sun, 14 May 2017",Prob (F-statistic):,0.0
Time:,19:02:14,Log-Likelihood:,-102530000.0
No. Observations:,9373195,AIC:,205100000.0
Df Residuals:,9373193,BIC:,205100000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,2336.7701,8.314,281.065,0.000,2320.475 2353.065
num_hashtags,-415.4462,5.110,-81.294,0.000,-425.462 -405.430

0,1,2,3
Omnibus:,14173129.665,Durbin-Watson:,1.447
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4134163202.197
Skew:,9.781,Prob(JB):,0.0
Kurtosis:,104.009,Cond. No.,3.94


In [18]:
#dfoutput = summary_col([result],stars=True)
#print(dfoutput.as_latex())