# Gender dynamics

## Prepare the tweet data

### Load the tweets

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt


# Root logger at DEBUG level, so INFO and DEBUG records are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Render floats as fixed-point with thousands separators and two decimals
# (width 20) instead of scientific notation.
pd.set_option('display.float_format', '{:20,.2f}'.format)

def tweet_transform(tweet):
    """Flatten one raw tweet dict into the record used for the tweet frame.

    Reads ``id_str``, ``created_at`` and the nested ``user`` object
    (``id_str``, ``screen_name``) from ``tweet``. The creation time is parsed
    with ``date_parse`` and the tweet type is classified by ``tweet_type``.

    Returns a dict with the keys ``tweet_id``, ``tweet_created_at``,
    ``user_id``, ``screen_name`` and ``tweet_type``.
    """
    record = {}
    record['tweet_id'] = tweet['id_str']
    record['tweet_created_at'] = date_parse(tweet['created_at'])
    author = tweet['user']
    record['user_id'] = author['id_str']
    record['screen_name'] = author['screen_name']
    record['tweet_type'] = tweet_type(tweet)
    return record

# Keys produced by tweet_transform; presumably the columns (and their order)
# of the frame load_tweet_df builds -- verify against utils.load_tweet_df.
tweet_columns = ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at', 'tweet_type']
tweet_df = load_tweet_df(tweet_transform, tweet_columns)
# Non-null count per column as a quick sanity check of the load.
tweet_df.count()

INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000


tweet_id            817136
user_id             817136
screen_name         817136
tweet_created_at    817136
tweet_type          817136
dtype: int64

In [2]:
# Peek at the first five tweet records.
tweet_df.head(n=5)

Unnamed: 0,tweet_id,user_id,screen_name,tweet_created_at,tweet_type
0,875028934177542144,15731368,HowardKurtz,2017-06-14 16:35:29+00:00,original
1,875012559228874752,15731368,HowardKurtz,2017-06-14 15:30:25+00:00,retweet
2,875012374343929862,15731368,HowardKurtz,2017-06-14 15:29:41+00:00,original
3,875010759759101952,15731368,HowardKurtz,2017-06-14 15:23:16+00:00,retweet
4,875010163496939520,15731368,HowardKurtz,2017-06-14 15:20:54+00:00,retweet


## Prepare the tweeter data
The tweeter data is assembled from the following sources:
1. User lookup: These are lists of users exported from SFM. These are the final set of beltway journalists.
2. Tweets in the dataset: Used to generate tweet counts per tweeter. However, since some beltway journalists may not have tweeted, this may be a subset of the user lookup.
3. User info lookup: Information on users that was manually coded in the beltway journalist spreadsheet or looked up from Twitter's API. This includes some accounts that were excluded from data collection for various reasons such as working for a foreign news organization or no longer working as a beltway journalist. Thus, these are a superset of the user lookup.


### Load user lookup

In [25]:
user_lookup_filepaths = ('lookups/senate_press_lookup.csv',
                         'lookups/periodical_press_lookup.csv',
                         'lookups/radio_and_television_lookup.csv')

# Read only the id and screen-name columns of each lookup file; ids stay
# strings so they are not coerced to integers.
lookup_frames = [
    pd.read_csv(lookup_path, usecols=['Uid', 'Token'], dtype={'Uid': str})
    for lookup_path in user_lookup_filepaths
]
user_lookup_df = (
    pd.concat(lookup_frames)
    .rename(columns={'Uid': 'user_id', 'Token': 'screen_name'})
    .set_index('user_id')
)
# Some users may be in multiple lists, so keep only the first row per user id.
is_repeat = user_lookup_df.index.duplicated()
user_lookup_df = user_lookup_df[~is_repeat]

user_lookup_df.count()

screen_name    2487
dtype: int64

In [4]:
# Peek at the first five lookup rows (indexed by user_id).
user_lookup_df.head(n=5)

Unnamed: 0_level_0,screen_name
user_id,Unnamed: 1_level_1
23455653,abettel
33919343,AshleyRParker
18580432,b_fung
399225358,b_muzz
18834692,becca_milfeld


### Tweets in dataset per tweeter

In [5]:
# Tweet types that are summed into a user's total.
tweet_type_columns = ['original', 'quote', 'reply', 'retweet']

# One row per user, one column per tweet type. A user with no tweets of a
# given type gets NaN from unstack(), which is replaced by 0.
user_tweet_count_df = (
    tweet_df.groupby(['user_id', 'tweet_type'])
    .size()
    .unstack()
    .fillna(0)
)
# A tweet type that never occurs in the loaded tweets has no column at all
# after unstack(); add it as zeros so the total below cannot fail on a
# missing column (e.g. when running against a small sample).
for tweet_type_column in tweet_type_columns:
    if tweet_type_column not in user_tweet_count_df.columns:
        user_tweet_count_df[tweet_type_column] = 0.0
user_tweet_count_df['tweets_in_dataset'] = user_tweet_count_df[tweet_type_columns].sum(axis=1)
user_tweet_count_df.count()

tweet_type
original             2292
quote                2292
reply                2292
retweet              2292
tweets_in_dataset    2292
dtype: int64

In [6]:
# Peek at the first five per-user tweet counts.
user_tweet_count_df.head(n=5)

tweet_type,original,quote,reply,retweet,tweets_in_dataset
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001991865,13.0,3.0,1.0,31.0,48.0
1002229862,48.0,20.0,3.0,118.0,189.0
100270054,1.0,0.0,0.0,0.0,1.0
100802089,4.0,7.0,12.0,17.0,40.0
100860790,102.0,26.0,4.0,166.0,298.0


### Load user info

In [27]:
# Column names applied to the user info file, in file order.
user_info_columns = ['user_id', 'name', 'organization', 'position',
                     'gender', 'followers_count', 'following_count', 'tweet_count',
                     'user_created_at', 'verified', 'protected']
# NOTE(review): names= without header= makes pandas read the first line as
# data, so this assumes the CSV has no header row -- confirm.
user_info_df = pd.read_csv(
    'source_data/user_info_lookup.csv',
    names=user_info_columns,
    dtype={'user_id': str},  # keep ids as strings
)
user_info_df = user_info_df.set_index(['user_id'])
user_info_df.count()

name               2506
organization       2477
position           2503
gender             2505
followers_count    2506
following_count    2506
tweet_count        2506
user_created_at    2506
verified           2506
protected          2506
dtype: int64

In [13]:
# Peek at the first five user info rows.
user_info_df.head(n=5)

Unnamed: 0_level_0,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20711445,"Glinski, Nina",,Freelance Reporter,F,963,507,909,Thu Feb 12 20:00:53 +0000 2009,False,False
258917371,"Enders, David",,Journalist,M,1444,484,6296,Mon Feb 28 19:52:03 +0000 2011,True,False
297046834,"Barakat, Matthew",Associated Press,Northern Virginia Correspondent,M,759,352,631,Wed May 11 20:55:24 +0000 2011,True,False
455585786,"Atkins, Kimberly",Boston Herald,Chief Washington Reporter/Columnist,F,2944,2691,6277,Thu Jan 05 08:26:46 +0000 2012,True,False
42584840,"Vlahou, Toula",CQ Roll Call,Editor & Podcast Producer,F,2703,201,6366,Tue May 26 07:41:38 +0000 2009,False,False


In [89]:
# Start from the final set of users (the lookup) and left-join the coded user
# info and the per-user tweet counts on the user_id index.
user_summary_df = user_lookup_df.join((user_info_df, user_tweet_count_df), how='left')
# Fill NaNs: users with no organization get '', users without tweets in the
# dataset get zero counts. One frame-level fillna assigned back is used
# instead of `df[col].fillna(..., inplace=True)`, which is chained assignment
# and stops updating the frame under pandas Copy-on-Write.
user_summary_df = user_summary_df.fillna({
    'organization': '',
    'original': 0,
    'quote': 0,
    'reply': 0,
    'retweet': 0,
    'tweets_in_dataset': 0,
})
# Add reach: tweets in the dataset multiplied by follower count.
user_summary_df['reach'] = user_summary_df['tweets_in_dataset'] * user_summary_df['followers_count']
user_summary_df.count()

screen_name          2487
name                 2487
organization         2487
position             2484
gender               2486
followers_count      2487
following_count      2487
tweet_count          2487
user_created_at      2487
verified             2487
protected            2487
original             2487
quote                2487
reply                2487
retweet              2487
tweets_in_dataset    2487
reach                2487
dtype: int64

In [30]:
# Peek at the first five rows of the combined user summary.
user_summary_df.head(n=5)

Unnamed: 0_level_0,screen_name,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected,original,quote,reply,retweet,tweets_in_dataset
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
23455653,abettel,"Bettelheim, Adriel",Politico,Health Care Editor,F,2664,1055,15990,Mon Mar 09 16:32:20 +0000 2009,True,False,289.0,12.0,6.0,52.0,359.0
33919343,AshleyRParker,"Parker, Ashley",Washington Post,White House Reporter,F,122382,2342,12433,Tue Apr 21 14:28:57 +0000 2009,True,False,172.0,67.0,11.0,120.0,370.0
18580432,b_fung,"Fung, Brian",Washington Post,Tech Reporter,M,16558,2062,44799,Sat Jan 03 15:15:57 +0000 2009,True,False,257.0,85.0,205.0,82.0,629.0
399225358,b_muzz,"Murray, Brendan",Bloomberg News,"Managing Editor, U.S. Economy",M,624,382,360,Thu Oct 27 05:34:05 +0000 2011,True,False,3.0,0.0,0.0,5.0,8.0
18834692,becca_milfeld,"Milfeld, Becca",Agence France-Presse,English Desk Editor and Journalist,F,483,993,1484,Sat Jan 10 13:58:43 +0000 2009,False,False,3.0,14.0,0.0,7.0,24.0


In [None]:
### Split by gender

In [90]:
# Users whose coded gender is 'F'.
is_female = user_summary_df['gender'] == 'F'
female_user_summary_df = user_summary_df.loc[is_female]
female_user_summary_df.count()

screen_name          1090
name                 1090
organization         1090
position             1089
gender               1090
followers_count      1090
following_count      1090
tweet_count          1090
user_created_at      1090
verified             1090
protected            1090
original             1090
quote                1090
reply                1090
retweet              1090
tweets_in_dataset    1090
reach                1090
dtype: int64

In [91]:
# Users whose coded gender is 'M'.
is_male = user_summary_df['gender'] == 'M'
male_user_summary_df = user_summary_df.loc[is_male]
male_user_summary_df.count()

screen_name          1396
name                 1396
organization         1396
position             1394
gender               1396
followers_count      1396
following_count      1396
tweet_count          1396
user_created_at      1396
verified             1396
protected            1396
original             1396
quote                1396
reply                1396
retweet              1396
tweets_in_dataset    1396
reach                1396
dtype: int64

### Add groups

In [95]:
# Order users by reach, breaking ties on followers count, and number them
# 1..N so that rank 1 is the largest reach.
user_summary_df = user_summary_df.sort_values(['reach', 'followers_count'], ascending=False)
user_count = len(user_summary_df)
user_summary_df['reach_rank'] = list(range(1, user_count + 1))
# TODO: a per-gender rank (male_reach_rank) was sketched here but never finished.

# Membership flags by absolute rank ...
user_summary_df['top_10'] = user_summary_df['reach_rank'] <= 10
user_summary_df['top_20'] = user_summary_df['reach_rank'] <= 20
# ... and by rank as a fraction of all users.
rank_fraction = user_summary_df['reach_rank'] / user_count
user_summary_df['top_1%'] = rank_fraction <= .01
user_summary_df['top_10%'] = rank_fraction <= .1
user_summary_df['bottom_90%'] = rank_fraction > .1

# Show the 25 users with the largest reach together with their group flags.
group_columns = ['screen_name',
                 'gender',
                 'followers_count',
                 'tweets_in_dataset',
                 'reach',
                 'reach_rank',
                 'top_10',
                 'top_20',
                 'top_1%',
                 'top_10%',
                 'bottom_90%']
user_summary_df[group_columns].sort_values(['reach'], ascending=False).head(25)


Unnamed: 0_level_0,screen_name,gender,followers_count,tweets_in_dataset,reach,reach_rank,top_10,top_20,top_1%,top_10%,bottom_90%
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
14529929,jaketapper,M,1305680,5078.0,6630243040.0,1,True,True,True,True,False
16031927,greta,F,1186850,4792.0,5687385200.0,2,True,True,True,True,False
18646108,BretBaier,M,1095184,2379.0,2605442736.0,3,True,True,True,True,False
89820928,mitchellreports,F,1388543,1423.0,1975896689.0,4,True,True,True,True,False
13524182,daveweigel,M,332344,4564.0,1516818016.0,5,True,True,True,True,False
61734492,Fahrenthold,M,451778,2871.0,1297054638.0,6,True,True,True,True,False
4119741,jdickerson,M,2176578,445.0,968577210.0,7,True,True,True,True,False
259395895,JohnJHarwood,M,149040,6377.0,950428080.0,8,True,True,True,True,False
50325797,chucktodd,M,1781247,522.0,929810934.0,9,True,True,True,True,False
3817401,ericgeller,M,58173,11432.0,665033736.0,10,True,True,True,True,False


In [38]:
# The 100 users with the smallest reach.
reach_columns = ['screen_name', 'followers_count', 'tweets_in_dataset', 'reach', 'reach_rank']
user_summary_df[reach_columns].sort_values(['reach'], ascending=True).head(100)

Unnamed: 0_level_0,screen_name,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected,original,quote,reply,retweet,tweets_in_dataset,reach,reach_rank
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
23455653,abettel,"Bettelheim, Adriel",Politico,Health Care Editor,F,2664,1055,15990,Mon Mar 09 16:32:20 +0000 2009,True,False,289.0,12.0,6.0,52.0,359.0,956376.0,1690.0
33919343,AshleyRParker,"Parker, Ashley",Washington Post,White House Reporter,F,122382,2342,12433,Tue Apr 21 14:28:57 +0000 2009,True,False,172.0,67.0,11.0,120.0,370.0,45281340.0,2391.0
18580432,b_fung,"Fung, Brian",Washington Post,Tech Reporter,M,16558,2062,44799,Sat Jan 03 15:15:57 +0000 2009,True,False,257.0,85.0,205.0,82.0,629.0,10414982.0,2212.0
399225358,b_muzz,"Murray, Brendan",Bloomberg News,"Managing Editor, U.S. Economy",M,624,382,360,Thu Oct 27 05:34:05 +0000 2011,True,False,3.0,0.0,0.0,5.0,8.0,4992.0,441.0
18834692,becca_milfeld,"Milfeld, Becca",Agence France-Presse,English Desk Editor and Journalist,F,483,993,1484,Sat Jan 10 13:58:43 +0000 2009,False,False,3.0,14.0,0.0,7.0,24.0,11592.0,556.0
