# Gender dynamics

## Tweeter data prep

### Load the tweets

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt


# Show floats in fixed-point notation with thousands separators (no scientific notation)
pd.set_option('display.float_format', '{:20,.2f}'.format)

# Let DEBUG-and-above records through the root logger
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

def tweet_transform(tweet):
    """Reduce one raw tweet to the fields kept in the tweet DataFrame.

    `tweet` is indexed like a dict and must provide 'id_str', 'created_at'
    and a 'user' entry holding 'id_str' and 'screen_name'.

    Returns a dict with keys tweet_id, tweet_created_at (the parsed
    'created_at' value), user_id, screen_name and tweet_type (whatever
    the imported tweet_type() helper returns for this tweet).
    """
    record = {}
    record['tweet_id'] = tweet['id_str']
    record['tweet_created_at'] = date_parse(tweet['created_at'])
    author = tweet['user']
    record['user_id'] = author['id_str']
    record['screen_name'] = author['screen_name']
    record['tweet_type'] = tweet_type(tweet)
    return record

# Build the tweet DataFrame via the project loader, one record per tweet_transform() result.
# dedupe_columns presumably drops repeated tweet_ids -- confirm against utils.load_tweet_df.
tweet_df = load_tweet_df(
    tweet_transform,
    [
        'tweet_id',
        'user_id',
        'screen_name',
        'tweet_created_at',
        'tweet_type',
    ],
    dedupe_columns=['tweet_id'],
)
tweet_df.count()

INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000


tweet_id            817136
user_id             817136
screen_name         817136
tweet_created_at    817136
tweet_type          817136
dtype: int64

In [2]:
# Peek at the first five tweets
tweet_df.head(n=5)

Unnamed: 0,tweet_id,user_id,screen_name,tweet_created_at,tweet_type
0,875028934177542144,15731368,HowardKurtz,2017-06-14 16:35:29+00:00,original
1,875012559228874752,15731368,HowardKurtz,2017-06-14 15:30:25+00:00,retweet
2,875012374343929862,15731368,HowardKurtz,2017-06-14 15:29:41+00:00,original
3,875010759759101952,15731368,HowardKurtz,2017-06-14 15:23:16+00:00,retweet
4,875010163496939520,15731368,HowardKurtz,2017-06-14 15:20:54+00:00,retweet


## Prepare the tweeter data
This comes from the following sources:
1. User lookup: These are lists of users exported from SFM. These are the final set of beltway journalists. Accounts that were suspended or deleted have been removed from this list. Also, this list will include users that did not tweet (i.e., have no tweets in dataset).
2. Tweets in the dataset: Used to generate tweet counts per tweeter. However, since some beltway journalists may not have tweeted, this may cover only a subset of the user lookup. Also, it may include the tweets of some users that were later excluded, either because their accounts were suspended or deleted or because they were determined not to be beltway journalists.
3. User info lookup: Information on users that was manually coded in the beltway journalist spreadsheet or looked up from Twitter's API. This includes some accounts that were excluded from data collection for various reasons such as working for a foreign news organization or no longer working as a beltway journalist. Thus, these are a superset of the user lookup.

Thus, the tweeter data should include tweet and user info data only from users in the user lookup.

### Load user lookup

In [3]:
user_lookup_filepaths = (
    'lookups/senate_press_lookup.csv',
    'lookups/periodical_press_lookup.csv',
    'lookups/radio_and_television_lookup.csv',
)
# Stack the lookup files; Uid is read as a string so it stays an identifier, not a number
user_lookup_df = pd.concat(
    [pd.read_csv(path, usecols=['Uid', 'Token'], dtype={'Uid': str})
     for path in user_lookup_filepaths]
)
user_lookup_df = user_lookup_df.rename(columns={'Token': 'screen_name'})
user_lookup_df = user_lookup_df.set_index('Uid')
user_lookup_df.index.name = 'user_id'
# A user can appear in more than one list; keep only the first row per user_id
user_lookup_df = user_lookup_df.loc[~user_lookup_df.index.duplicated()]

user_lookup_df.count()

screen_name    2487
dtype: int64

In [4]:
# Peek at the first five looked-up users
user_lookup_df.head(n=5)

Unnamed: 0_level_0,screen_name
user_id,Unnamed: 1_level_1
23455653,abettel
33919343,AshleyRParker
18580432,b_fung
399225358,b_muzz
18834692,becca_milfeld


### Tweets in dataset per tweeter

In [5]:
# Tweet types that are summed into the per-user total below
TWEET_TYPES = ['original', 'quote', 'reply', 'retweet']

# One row per tweeter, one column per tweet type, holding the number of tweets of that type
user_tweet_count_df = tweet_df[['user_id', 'tweet_type']].groupby(['user_id', 'tweet_type']).size().unstack()
# unstack() only creates a column for a type that occurs at least once in the data.
# Add any missing type as an (empty) column so the total and later per-type lookups cannot fail.
missing_types = [t for t in TWEET_TYPES if t not in user_tweet_count_df.columns]
user_tweet_count_df = user_tweet_count_df.reindex(columns=list(user_tweet_count_df.columns) + missing_types)
# NaN means the tweeter has no tweets of that type, i.e. a count of zero
user_tweet_count_df = user_tweet_count_df.fillna(0)
user_tweet_count_df['tweets_in_dataset'] = user_tweet_count_df[TWEET_TYPES].sum(axis=1)
user_tweet_count_df.count()

tweet_type
original             2292
quote                2292
reply                2292
retweet              2292
tweets_in_dataset    2292
dtype: int64

In [6]:
# Peek at the first five per-user tweet counts
user_tweet_count_df.head(n=5)

tweet_type,original,quote,reply,retweet,tweets_in_dataset
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001991865,13.0,3.0,1.0,31.0,48.0
1002229862,48.0,20.0,3.0,118.0,189.0
100270054,1.0,0.0,0.0,0.0,1.0
100802089,4.0,7.0,12.0,17.0,40.0
100860790,102.0,26.0,4.0,166.0,298.0


### Load user info

In [7]:
# Column names are supplied here through `names` instead of being taken from the file.
# user_id is read as a string and then becomes the index.
user_info_df = pd.read_csv(
    'source_data/user_info_lookup.csv',
    names=[
        'user_id', 'name', 'organization', 'position', 'gender',
        'followers_count', 'following_count', 'tweet_count',
        'user_created_at', 'verified', 'protected',
    ],
    dtype={'user_id': str},
)
user_info_df = user_info_df.set_index('user_id')
user_info_df.count()

name               2506
organization       2477
position           2503
gender             2505
followers_count    2506
following_count    2506
tweet_count        2506
user_created_at    2506
verified           2506
protected          2506
dtype: int64

In [8]:
# Peek at the first five user info rows
user_info_df.head(n=5)

Unnamed: 0_level_0,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20711445,"Glinski, Nina",,Freelance Reporter,F,963,507,909,Thu Feb 12 20:00:53 +0000 2009,False,False
258917371,"Enders, David",,Journalist,M,1444,484,6296,Mon Feb 28 19:52:03 +0000 2011,True,False
297046834,"Barakat, Matthew",Associated Press,Northern Virginia Correspondent,M,759,352,631,Wed May 11 20:55:24 +0000 2011,True,False
455585786,"Atkins, Kimberly",Boston Herald,Chief Washington Reporter/Columnist,F,2944,2691,6277,Thu Jan 05 08:26:46 +0000 2012,True,False
42584840,"Vlahou, Toula",CQ Roll Call,Editor & Podcast Producer,F,2703,201,6366,Tue May 26 07:41:38 +0000 2009,False,False


In [9]:
# Left-join user info and tweet counts onto the user lookup, so only users in the lookup are kept
user_summary_df = user_lookup_df.join([user_info_df, user_tweet_count_df], how='left')
# Fill NaNs: blank organization, and zero counts for users with no tweets in the dataset.
# The result is assigned back in one call. A column-level `df[col].fillna(..., inplace=True)`
# acts on a temporary selection: it warns on recent pandas and leaves the frame unchanged
# under Copy-on-Write.
user_summary_df = user_summary_df.fillna({
    'organization': '',
    'original': 0,
    'quote': 0,
    'reply': 0,
    'retweet': 0,
    'tweets_in_dataset': 0,
})
user_summary_df.count()

screen_name          2487
name                 2487
organization         2487
position             2484
gender               2486
followers_count      2487
following_count      2487
tweet_count          2487
user_created_at      2487
verified             2487
protected            2487
original             2487
quote                2487
reply                2487
retweet              2487
tweets_in_dataset    2487
dtype: int64

In [10]:
# Peek at the first five summarized users
user_summary_df.head(n=5)

Unnamed: 0_level_0,screen_name,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected,original,quote,reply,retweet,tweets_in_dataset
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
23455653,abettel,"Bettelheim, Adriel",Politico,Health Care Editor,F,2664,1055,15990,Mon Mar 09 16:32:20 +0000 2009,True,False,289.0,12.0,6.0,52.0,359.0
33919343,AshleyRParker,"Parker, Ashley",Washington Post,White House Reporter,F,122382,2342,12433,Tue Apr 21 14:28:57 +0000 2009,True,False,172.0,67.0,11.0,120.0,370.0
18580432,b_fung,"Fung, Brian",Washington Post,Tech Reporter,M,16558,2062,44799,Sat Jan 03 15:15:57 +0000 2009,True,False,257.0,85.0,205.0,82.0,629.0
399225358,b_muzz,"Murray, Brendan",Bloomberg News,"Managing Editor, U.S. Economy",M,624,382,360,Thu Oct 27 05:34:05 +0000 2011,True,False,3.0,0.0,0.0,5.0,8.0
18834692,becca_milfeld,"Milfeld, Becca",Agence France-Presse,English Desk Editor and Journalist,F,483,993,1484,Sat Jan 10 13:58:43 +0000 2009,False,False,3.0,14.0,0.0,7.0,24.0


### Remove users with no tweets in dataset

In [11]:
# Non-null counts per column for users with zero tweets in the dataset
user_summary_df.loc[user_summary_df['tweets_in_dataset'].eq(0)].count()

screen_name          195
name                 195
organization         195
position             195
gender               194
followers_count      195
following_count      195
tweet_count          195
user_created_at      195
verified             195
protected            195
original             195
quote                195
reply                195
retweet              195
tweets_in_dataset    195
dtype: int64

In [12]:
# Keep only users whose tweets_in_dataset is not zero
user_summary_df = user_summary_df.loc[user_summary_df['tweets_in_dataset'].ne(0)]
user_summary_df.count()

screen_name          2292
name                 2292
organization         2292
position             2289
gender               2292
followers_count      2292
following_count      2292
tweet_count          2292
user_created_at      2292
verified             2292
protected            2292
original             2292
quote                2292
reply                2292
retweet              2292
tweets_in_dataset    2292
dtype: int64

## Tweeter analysis

### Gender

In [13]:
# Tweeters per gender: absolute count next to the percentage share (one decimal)
pd.DataFrame({
    'count': user_summary_df['gender'].value_counts(),
    'percentage': (user_summary_df['gender'].value_counts(normalize=True) * 100).round(1).astype(str) + '%',
})

Unnamed: 0,count,percentage
M,1299,56.7%
F,993,43.3%


### Summary

#### All

In [14]:
# Descriptive statistics of the numeric account and tweet-count columns, all tweeters
user_summary_df.loc[:, [
    'followers_count', 'following_count', 'tweet_count',
    'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset',
]].describe()

Unnamed: 0,followers_count,following_count,tweet_count,original,quote,reply,retweet,tweets_in_dataset
count,2292.0,2292.0,2292.0,2292.0,2292.0,2292.0,2292.0,2292.0
mean,16467.62,1444.83,9619.69,102.06,48.73,55.08,150.64,356.52
std,91886.9,3003.0,16618.09,169.43,135.9,249.18,585.08,833.76
min,6.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,831.75,505.75,1449.5,10.0,1.0,1.0,8.0,32.0
50%,2419.5,998.5,4211.5,41.0,9.0,5.0,39.0,122.0
75%,7348.75,1713.5,10817.25,124.25,43.0,30.0,129.0,375.0
max,2176578.0,96194.0,208763.0,2693.0,3069.0,9033.0,21524.0,21547.0


#### Female

In [15]:
# Same descriptive statistics, restricted to tweeters coded 'F'
user_summary_df.loc[
    user_summary_df['gender'] == 'F',
    ['followers_count', 'following_count', 'tweet_count',
     'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset'],
].describe()

Unnamed: 0,followers_count,following_count,tweet_count,original,quote,reply,retweet,tweets_in_dataset
count,993.0,993.0,993.0,993.0,993.0,993.0,993.0,993.0
mean,11609.53,1314.07,7498.74,83.84,39.27,32.06,135.55,290.72
std,65563.72,1250.56,11312.72,124.86,135.05,94.73,724.92,833.07
min,6.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,825.0,567.0,1393.0,8.0,1.0,1.0,9.0,32.0
50%,2327.0,1034.0,4055.0,39.0,9.0,4.0,37.0,111.0
75%,6340.0,1659.0,8983.0,111.0,33.0,21.0,115.0,314.0
max,1388543.0,18197.0,118713.0,1440.0,3069.0,1458.0,21524.0,21547.0


#### Male

In [16]:
# Same descriptive statistics, restricted to tweeters coded 'M'
user_summary_df.loc[
    user_summary_df['gender'] == 'M',
    ['followers_count', 'following_count', 'tweet_count',
     'original', 'quote', 'reply', 'retweet', 'tweets_in_dataset'],
].describe()

Unnamed: 0,followers_count,following_count,tweet_count,original,quote,reply,retweet,tweets_in_dataset
count,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0
mean,20181.31,1544.78,11241.02,115.99,55.96,72.69,162.17,406.81
std,107635.37,3833.89,19584.46,195.72,136.16,319.41,449.75,831.1
min,10.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0
25%,857.5,472.0,1477.0,12.0,0.0,1.0,6.0,33.0
50%,2498.0,953.0,4401.0,44.0,9.0,6.0,40.0,131.0
75%,8341.5,1763.0,12584.5,140.0,50.5,38.5,142.0,428.0
max,2176578.0,96194.0,208763.0,2693.0,1955.0,9033.0,7528.0,11432.0


### Verified

#### All

In [17]:
# Verified vs. unverified accounts: absolute count next to the percentage share (one decimal)
pd.DataFrame({
    'count': user_summary_df['verified'].value_counts(),
    'percentage': (user_summary_df['verified'].value_counts(normalize=True) * 100).round(1).astype(str) + '%',
})

Unnamed: 0,count,percentage
True,1240,54.1%
False,1052,45.9%


#### Female

In [18]:
# Verified vs. unverified accounts among tweeters coded 'F'
pd.DataFrame({
    'count': user_summary_df.loc[user_summary_df['gender'] == 'F', 'verified'].value_counts(),
    'percentage': (user_summary_df.loc[user_summary_df['gender'] == 'F', 'verified'].value_counts(normalize=True) * 100).round(1).astype(str) + '%',
})

Unnamed: 0,count,percentage
True,512,51.6%
False,481,48.4%


#### Male

In [19]:
# Verified vs. unverified accounts among tweeters coded 'M'
pd.DataFrame({
    'count': user_summary_df.loc[user_summary_df['gender'] == 'M', 'verified'].value_counts(),
    'percentage': (user_summary_df.loc[user_summary_df['gender'] == 'M', 'verified'].value_counts(normalize=True) * 100).round(1).astype(str) + '%',
})

Unnamed: 0,count,percentage
True,728,56.0%
False,571,44.0%


## Mention data prep

### Load mentions from tweets
Including original tweets only

In [20]:
%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt


# Show floats in fixed-point notation with thousands separators (no scientific notation)
pd.set_option('display.float_format', '{:20,.2f}'.format)

# Let DEBUG-and-above records through the root logger
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Simplify the tweet on load
def mention_transform(tweet):
    """Return one record per user mention found in an original tweet.

    Tweets for which tweet_type() is anything other than 'original' yield
    an empty list. Mentions are read from tweet['entities']['user_mentions'];
    a tweet lacking either key also yields an empty list.

    Each record is a dict with keys tweet_id, user_id, screen_name (the
    tweet's author), mention_user_id, mention_screen_name (the mentioned
    account) and tweet_created_at (the parsed 'created_at' value).
    """
    if tweet_type(tweet) != 'original':
        return []
    return [
        {
            'tweet_id': tweet['id_str'],
            'user_id': tweet['user']['id_str'],
            'screen_name': tweet['user']['screen_name'],
            'mention_user_id': mentioned['id_str'],
            'mention_screen_name': mentioned['screen_name'],
            'tweet_created_at': date_parse(tweet['created_at'])
        }
        for mentioned in tweet.get('entities', {}).get('user_mentions', [])
    ]

# Build the mention DataFrame via the project loader, from the records mention_transform() returns.
# dedupe_columns presumably keeps one row per (tweet, mentioned user) -- confirm against utils.load_tweet_df.
mention_df = load_tweet_df(
    mention_transform,
    [
        'tweet_id',
        'user_id',
        'screen_name',
        'mention_user_id',
        'mention_screen_name',
        'tweet_created_at',
    ],
    dedupe_columns=['tweet_id', 'mention_user_id'],
)
mention_df.count()

INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000


tweet_id               118210
user_id                118210
screen_name            118210
mention_user_id        118210
mention_screen_name    118210
tweet_created_at       118210
dtype: int64

In [21]:
# Peek at the first five mentions
mention_df.head(n=5)

Unnamed: 0,tweet_id,user_id,screen_name,mention_user_id,mention_screen_name,tweet_created_at
0,874695069584027648,15731368,HowardKurtz,110445334,megynkelly,2017-06-13 18:28:50+00:00
1,874695069584027648,15731368,HowardKurtz,171154131,HappeningNow,2017-06-13 18:28:50+00:00
2,874652287809064960,15731368,HowardKurtz,110445334,megynkelly,2017-06-13 15:38:50+00:00
3,874281339150884864,15731368,HowardKurtz,486907980,CarleyShimkus,2017-06-12 15:04:49+00:00
4,874265555552882688,15731368,HowardKurtz,2263438704,GillianHTurner,2017-06-12 14:02:06+00:00


### Distinct tweets

In [22]:
# Number of distinct tweets that contain at least one mention
len(mention_df['tweet_id'].unique())

84942

## Mentioned analysis
That is, who is mentioned, not who is mentioning.

*Note that for each of these, the complete list is being written to CSV in the output directory.*

### All
This is based on screen name, which could have changed during the collection period. However, for the users that would be at the top of this list, this seems unlikely.

In [23]:
# Mentions received per screen name, most-mentioned first; full list goes to CSV, top 25 shown
all_mentioned_df = mention_df['mention_screen_name'].value_counts()
all_mentioned_df.to_csv('output/all_mentioned.csv')
all_mentioned_df.head(n=25)

realDonaldTrump    2876
POTUS              2265
wusa9              2111
AP                 1948
USATODAY           1235
nbcwashington      1230
WSJ                1227
dcexaminer         1034
SHSanders45         927
nytimes             829
BloombergBNA        759
politico            747
SpeakerRyan         700
Scaramucci          657
PressSec            654
CNN                 628
ABC7News            604
SenJohnMcCain       599
WTOP                529
BloombergLaw        517
VP                  506
SteveScalise        505
MSNBC               486
Reuters             483
bpolitics           432
Name: mention_screen_name, dtype: int64

### Beltway journalists

In [24]:
# Inner join on the mentioned user's id: keeps only mentions of users present in
# user_summary_df and attaches that user's gender
journalists_mention_df = mention_df.join(
    user_summary_df[['gender']],
    on='mention_user_id',
    how='inner',
)
journalists_mention_df.count()

tweet_id               14298
user_id                14298
screen_name            14298
mention_user_id        14298
mention_screen_name    14298
tweet_created_at       14298
gender                 14298
dtype: int64

In [25]:
# Peek at the first five journalist mentions
journalists_mention_df.head(n=5)

Unnamed: 0,tweet_id,user_id,screen_name,mention_user_id,mention_screen_name,tweet_created_at,gender
9,873955901484486656,15731368,HowardKurtz,16157855,edhenry,2017-06-11 17:31:38+00:00,M
14,873877145117962241,15731368,HowardKurtz,16157855,edhenry,2017-06-11 12:18:41+00:00,M
6414,874439120344535041,19455864,finnygo,16157855,edhenry,2017-06-13 01:31:47+00:00,M
23175,879021641141022720,15731368,HowardKurtz,16157855,edhenry,2017-06-25 17:01:05+00:00,M
23178,878956408657633280,15731368,HowardKurtz,16157855,edhenry,2017-06-25 12:41:52+00:00,M


#### All beltway journalists

In [26]:
# Mentions received per journalist screen name; full list goes to CSV, top 25 shown
all_journalists_mentioned_df = journalists_mention_df['mention_screen_name'].value_counts()
all_journalists_mentioned_df.to_csv('output/all_journalists_mentioned.csv')
all_journalists_mentioned_df.head(n=25)

AllysonRaeWx       330
TenaciousTopper    239
hbwx               235
burgessev          212
jenhab             200
seungminkim        143
jaketapper         127
WaPoSean           117
pkcapitol          116
DanaBashCNN        115
kelsey_snell       109
peterbakernyt      107
daveweigel         106
StevenTDennis      105
jonkarl            104
AshleyRParker      100
reporterjoe         98
mkraju              95
ZoeTillman          87
edatpost            84
HopeSeck            83
HardballChris       80
GlennThrush         78
jameshohmann        78
BresPolitico        78
Name: mention_screen_name, dtype: int64

In [27]:
# Journalist mentions by gender of the mentioned journalist: count next to percentage share
pd.DataFrame({
    'count': journalists_mention_df['gender'].value_counts(),
    'percentage': (journalists_mention_df['gender'].value_counts(normalize=True) * 100).round(1).astype(str) + '%',
})

Unnamed: 0,count,percentage
M,8298,58.0%
F,6000,42.0%


#### Female beltway journalists

In [28]:
# Mentions received per journalist coded 'F'; full list goes to CSV, top 25 shown
female_journalists_mentioned_df = journalists_mention_df.loc[
    journalists_mention_df['gender'] == 'F', 'mention_screen_name'
].value_counts()
female_journalists_mentioned_df.to_csv('output/female_journalists_mentioned.csv')
female_journalists_mentioned_df.head(n=25)

AllysonRaeWx     330
jenhab           200
seungminkim      143
DanaBashCNN      115
kelsey_snell     109
AshleyRParker    100
ZoeTillman        87
HopeSeck          83
jestei            76
morningmika       70
kasie             67
eilperin          67
mj_lee            67
pw_cunningham     67
FoxReports        65
LauraLitvan       58
margarettalev     58
sarahkliff        57
caitlinnowens     57
rachaelmbade      56
juliehdavis       55
rachanadixit      55
Oriana0214        55
jpaceDC           52
JudyWoodruff      49
Name: mention_screen_name, dtype: int64

#### Male beltway journalists

In [29]:
# Mentions received per journalist coded 'M'; full list goes to CSV, top 25 shown
male_journalists_mentioned_df = journalists_mention_df.loc[
    journalists_mention_df['gender'] == 'M', 'mention_screen_name'
].value_counts()
male_journalists_mentioned_df.to_csv('output/male_journalists_mentioned.csv')
male_journalists_mentioned_df.head(n=25)

TenaciousTopper    239
hbwx               235
burgessev          212
jaketapper         127
WaPoSean           117
pkcapitol          116
peterbakernyt      107
daveweigel         106
StevenTDennis      105
jonkarl            104
reporterjoe         98
mkraju              95
edatpost            84
HardballChris       80
jameshohmann        78
BresPolitico        78
GlennThrush         78
jmartNYT            75
chrisgeidner        73
kenvogel            67
BretBaier           66
Acosta              61
pauldemko           60
danbalz             57
mikedebonis         56
Name: mention_screen_name, dtype: int64