In [1]:
# Leslie Huang
# Network Analysis

from causalinference import CausalModel
from datetime import datetime
import json
import math
import matplotlib.pyplot as plt
import nltk
import numpy as np
from odo import odo, discover, resource
import pandas as pd
import pytz
from sklearn import linear_model
import statsmodels.formula.api as sm
from statsmodels.iolib.summary2 import summary_col
import time

In [41]:
data = odo("mongodb://localhost/my_mongo::wm_tweets", 
            pd.DataFrame, 
            dshape = "var * {id_str: string, created_at: string, lang: string, user_statuses_count: int32, user_friends_count: int32, text: string, user_verified: bool, user_followers_count: int32, num_retweets: int32, hashtags: var * string, includes_url: bool, user_created_at: string,user_id_str: string, is_a_retweet: bool, includes_media: bool}")

# id_str is user ID
# drop MongoDB _id var
# Note: If tweet is a retweet:
# user_id_str, user_statuses_count, user_friends_count, user_followers_count will be for the user retweeting (NOT the original user)
# id_str will be for the ORIGINAL tweet

In [42]:
# Exclude low activity and new accounts: Those that have fewer than 5 followers or 5 statuses or were created within the last week

data = data[(data["user_followers_count"] > 5) & (data["user_statuses_count"] > 5)]

In [43]:
# Convert timestamps

time_columns = ["created_at", "user_created_at"]

for column in time_columns:
    data[column] = data[column].apply(lambda x: datetime.strptime(x,'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo = pytz.UTC))

In [44]:
# Exclude accounts created 1 week before the march

cutoff_date = datetime.strptime("Fri Jan 13 00:00:00 +0000 2017",'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo = pytz.UTC)

data = data[data["user_created_at"] < cutoff_date]

In [45]:
data["hashtags_lc"] = data["hashtags"].apply(lambda hashtaglist: tuple(tag.lower() for tag in hashtaglist))

In [46]:
data["num_hashtags"] = data["hashtags"].apply(lambda x: len(x))

In [47]:
# Calculate the log age of the account in weeks (+1 for smoothing)
data["log_acct_age_weeks"] = (data["user_created_at"] - data["created_at"]).apply(lambda x: math.log((x.days / -7) + 1))

In [48]:
# Smoothing: Take log of each numerical val + 1 (to accommodate with zeroes)
columns_to_log = ["user_followers_count", "num_retweets", "user_statuses_count", "num_hashtags"]

for column in columns_to_log:
    new_col_name = "{}_log".format(column)
    data[new_col_name] = data[column].apply(lambda x: math.log(x + 1))

In [49]:
# Find out the most popular hashtags (case insensitive)
all_hashtags = []
all_hashtags.append([hashtag_list for hashtag_list in data["hashtags_lc"]])
all_hashtags = [val for sublist in all_hashtags for val in sublist]
all_hashtags = [val for sublist in all_hashtags for val in sublist]
all_hashtags = pd.Series(all_hashtags)

hashtag_freq = all_hashtags.value_counts()

In [50]:
# Count number of hashtags not in the official list

wm_hashtags = ["whyimarch", "womensmarch", "womensmarchonwashington", "imarchfor"]

data["num_unofficial_hashtags"] = data.apply(lambda row: len(set(row.hashtags_lc) - set(wm_hashtags)), axis = 1)

In [51]:
# Create dummy variables for political issues

indig_rights = pd.DataFrame(pd.Series(("indigenouswomenrise" in hashtags_lc or "indigenousrising" in hashtags_lc for hashtags_lc in data["hashtags_lc"]), index = data.index), columns = ["indig_rights"])
marriage_eq = pd.DataFrame(pd.Series(("lovetrumpshate" in hashtags_lc or "loveislove" in hashtags_lc for hashtags_lc in data["hashtags_lc"]), index = data.index), columns = ["marriage_eq"])
anti_abortion = pd.DataFrame(pd.Series(("prolife" in hashtags_lc or "prolifefeminist" in hashtags_lc for hashtags_lc in data["hashtags_lc"]), index = data.index), columns = ["anti_abortion"])
blm = pd.DataFrame(pd.Series(("blacklivesmatter" in hashtags_lc for hashtags_lc in data["hashtags_lc"]), index = data.index), columns = ["blm"])
gender_eq = pd.DataFrame(pd.Series(("womensrightsarehumanrights" in hashtags_lc or "equality" in hashtags_lc or "equalrights" in hashtags_lc for hashtags_lc in data["hashtags_lc"]), index = data.index), columns = ["gender_eq"])
repro_rights = pd.DataFrame(pd.Series(("istandwithpp" in hashtags_lc or "mybodymychoice" in hashtags_lc or "plannedparenthood" in hashtags_lc for hashtags_lc in data["hashtags_lc"]), index = data.index), columns = ["repro_rights"])
anti_trump = pd.DataFrame(pd.Series(("theresistance" in hashtags_lc or "resist" in hashtags_lc or "trump" in hashtags_lc for hashtags_lc in data["hashtags_lc"]), index = data.index), columns = ["anti_trump"])

In [52]:
# Then merge each of them with the main DF

dummies = [indig_rights, marriage_eq, anti_abortion, blm, gender_eq, repro_rights, anti_trump]

for dummy in dummies:
    data = data.merge(dummy, left_index = True, right_index = True)

In [53]:
data.head()

Unnamed: 0,id_str,created_at,lang,user_statuses_count,user_friends_count,text,user_verified,user_followers_count,num_retweets,hashtags,...,user_statuses_count_log,num_hashtags_log,num_unofficial_hashtags,indig_rights,marriage_eq,anti_abortion,blm,gender_eq,repro_rights,anti_trump
0,822670319219576832,2017-01-21 05:01:03+00:00,en,96414,2083,RT @LaurenJauregui: Less than 24 hours and the...,False,1602,0,[whyimarch],...,11.476417,0.693147,0,False,False,False,False,False,False,False
1,822670319219576832,2017-01-21 05:01:03+00:00,en,96414,2083,RT @LaurenJauregui: Less than 24 hours and the...,False,1602,0,[whyimarch],...,11.476417,0.693147,0,False,False,False,False,False,False,False
2,822670322126069760,2017-01-21 05:01:03+00:00,ko,26655,178,#세계여성_공동행진_서울 \n#womensmarch 현재 강남역 10번 출구에서는 ...,False,88,49,"[세계여성_공동행진_서울, womensmarch]",...,10.19077,1.098612,1,False,False,False,False,False,False,False
3,822670322126069760,2017-01-21 05:01:03+00:00,ko,26655,178,#세계여성_공동행진_서울 \n#womensmarch 현재 강남역 10번 출구에서는 ...,False,88,49,"[세계여성_공동행진_서울, womensmarch]",...,10.19077,1.098612,1,False,False,False,False,False,False,False
4,822670327704469504,2017-01-21 05:01:05+00:00,en,10509,913,RT @womensmarch: Together we fight for equity....,False,375,0,"[WomensMarch, WhyIMarch]",...,9.260082,1.098612,0,False,False,False,False,False,False,False


In [61]:
data.dtypes

id_str                                   object
created_at                  datetime64[ns, UTC]
lang                                     object
user_statuses_count                       int32
user_friends_count                        int32
text                                     object
user_verified                              bool
user_followers_count                      int32
num_retweets                              int32
hashtags                                 object
includes_url                               bool
user_created_at             datetime64[ns, UTC]
user_id_str                              object
is_a_retweet                               bool
includes_media                             bool
hashtags_lc                              object
num_hashtags                              int64
log_acct_age_weeks                      float64
user_followers_count_log                float64
num_retweets_log                        float64
user_statuses_count_log                 

In [54]:
result = sm.ols(formula = "num_retweets_log ~ np.power(num_hashtags, 2) + num_hashtags + user_followers_count_log + user_statuses_count_log + includes_url + includes_media + log_acct_age_weeks ", data = data).fit()
result.summary()

0,1,2,3
Dep. Variable:,num_retweets_log,R-squared:,0.062
Model:,OLS,Adj. R-squared:,0.062
Method:,Least Squares,F-statistic:,88950.0
Date:,"Thu, 18 May 2017",Prob (F-statistic):,0.0
Time:,14:05:18,Log-Likelihood:,-20788000.0
No. Observations:,9373195,AIC:,41580000.0
Df Residuals:,9373187,BIC:,41580000.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.0148,0.005,-2.850,0.004,-0.025 -0.005
includes_url[T.True],-0.2067,0.002,-87.335,0.000,-0.211 -0.202
includes_media[T.True],1.1199,0.002,705.216,0.000,1.117 1.123
"np.power(num_hashtags, 2)",-0.0182,0.000,-51.495,0.000,-0.019 -0.018
num_hashtags,0.2478,0.002,109.874,0.000,0.243 0.252
user_followers_count_log,-0.0555,0.001,-84.989,0.000,-0.057 -0.054
user_statuses_count_log,-0.0020,0.001,-3.498,0.000,-0.003 -0.001
log_acct_age_weeks,0.0231,0.001,28.840,0.000,0.022 0.025

0,1,2,3
Omnibus:,5761610.489,Durbin-Watson:,1.567
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45675427.988
Skew:,3.012,Prob(JB):,0.0
Kurtosis:,11.981,Cond. No.,93.0


In [None]:
#dfoutput = summary_col([result],stars=True)
#print(dfoutput.as_latex())

In [55]:
# Use Propensity Score Matching

# Create the "control" group
data_single_hashtag = data[data["num_hashtags"] == 1]

In [103]:
# Create subset of the covariates to match on

X_vec = [ 
         "includes_media", 
         "log_acct_age_weeks"]

In [114]:
# Create sample of "control" group + marriage_eq "treatment" group

def create_subset_df(dummy_name):
    return pd.concat([
                         data[data[dummy_name] == True],
                         data_single_hashtag
                        ])

In [118]:
gender_eq_subset = create_subset_df("gender_eq")
blm_subset = create_subset_df("blm")
indig_rights_subset = create_subset_df("indig_rights")
marriage_eq_subset = create_subset_df("marriage_eq")
repro_rights_subset = create_subset_df("repro_rights")
anti_trump_subset = create_subset_df("anti_trump")
anti_abortion_subset = create_subset_df("anti_abortion")

In [119]:
print(gender_eq_subset.shape, blm_subset.shape, indig_rights_subset.shape, marriage_eq_subset.shape, repro_rights_subset.shape, anti_trump_subset.shape, anti_abortion_subset.shape)

(7228148, 30) (7190739, 30) (7194617, 30) (7205234, 30) (7193945, 30) (7328213, 30) (7190614, 30)


In [120]:
# Run the causal model

gender_eq_causal = CausalModel(
                                 np.array(marriage_eq_subset["num_retweets_log"]), 
                                 np.array(marriage_eq_subset["gender_eq"]), 
                                 marriage_eq_subset[X_vec]
                                )

In [121]:
print(gender_eq_causal.summary_stats)


Summary Statistics

                   Controls (N_c=7204556)         Treated (N_t=678)             
       Variable         Mean         S.d.         Mean         S.d.     Raw-diff
--------------------------------------------------------------------------------
              Y        0.646        2.199        0.073        0.331       -0.574

                   Controls (N_c=7204556)         Treated (N_t=678)             
       Variable         Mean         S.d.         Mean         S.d.     Nor-diff
--------------------------------------------------------------------------------
             X0        0.646        0.478        0.355        0.479       -0.608
             X1        5.052        0.954        5.146        1.031        0.095



In [122]:
#marriage_eq_causal.est_via_ols()
#print(marriage_eq_causal.estimates)

In [123]:
gender_eq_causal.est_propensity_s() # estimate propensity score


TypeError: unhashable type: 'slice'

In [None]:
print(gender_eq_causal.propensity)

In [None]:
gender_eq_causal.trim() # trim anything outside of a 0.1 cutoff
print(gender_eq_causal.summary_stats) # look at covariate balance after trimming

In [None]:
gender_eq_causal.est_via_ols()
print(gender_eq_causal.estimates)