<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Gender-dynamics" data-toc-modified-id="Gender-dynamics-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Gender dynamics</a></span><ul class="toc-item"><li><span><a href="#Tweet-data-prep" data-toc-modified-id="Tweet-data-prep-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Tweet data prep</a></span><ul class="toc-item"><li><span><a href="#Load-the-tweets" data-toc-modified-id="Load-the-tweets-1.1.1"><span class="toc-item-num">1.1.1&nbsp;&nbsp;</span>Load the tweets</a></span></li></ul></li><li><span><a href="#Tweeter-data-prep" data-toc-modified-id="Tweeter-data-prep-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Tweeter data prep</a></span><ul class="toc-item"><li><span><a href="#Prepare-the-tweeter-data" data-toc-modified-id="Prepare-the-tweeter-data-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Prepare the tweeter data</a></span></li><li><span><a href="#Load-user-lookup" data-toc-modified-id="Load-user-lookup-1.2.2"><span class="toc-item-num">1.2.2&nbsp;&nbsp;</span>Load user lookup</a></span></li><li><span><a href="#Load-user-info" data-toc-modified-id="Load-user-info-1.2.3"><span class="toc-item-num">1.2.3&nbsp;&nbsp;</span>Load user info</a></span></li><li><span><a href="#Remove-users-with-no-tweets-in-dataset" data-toc-modified-id="Remove-users-with-no-tweets-in-dataset-1.2.4"><span class="toc-item-num">1.2.4&nbsp;&nbsp;</span>Remove users with no tweets in dataset</a></span></li><li><span><a href="#Gender" data-toc-modified-id="Gender-1.2.5"><span class="toc-item-num">1.2.5&nbsp;&nbsp;</span>Gender</a></span></li></ul></li><li><span><a href="#Hashtag-data-prep" data-toc-modified-id="Hashtag-data-prep-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Hashtag data prep</a></span><ul class="toc-item"><li><span><a href="#Load-hashtags-from-original-tweets" data-toc-modified-id="Load-hashtags-from-original-tweets-1.3.1"><span 
class="toc-item-num">1.3.1&nbsp;&nbsp;</span>Load hashtags from original tweets</a></span></li><li><span><a href="#Add-gender" data-toc-modified-id="Add-gender-1.3.2"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Add gender</a></span></li></ul></li><li><span><a href="#Top-hashtags" data-toc-modified-id="Top-hashtags-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Top hashtags</a></span><ul class="toc-item"><li><span><a href="#Top-hashtags-for-female-and-male-journalists" data-toc-modified-id="Top-hashtags-for-female-and-male-journalists-1.4.1"><span class="toc-item-num">1.4.1&nbsp;&nbsp;</span>Top hashtags for female and male journalists</a></span></li><li><span><a href="#Top-hashtags-for-female-journalists" data-toc-modified-id="Top-hashtags-for-female-journalists-1.4.2"><span class="toc-item-num">1.4.2&nbsp;&nbsp;</span>Top hashtags for female journalists</a></span></li><li><span><a href="#Top-hashtags-for-male-journalists" data-toc-modified-id="Top-hashtags-for-male-journalists-1.4.3"><span class="toc-item-num">1.4.3&nbsp;&nbsp;</span>Top hashtags for male journalists</a></span></li></ul></li><li><span><a href="#Top-bigrams" data-toc-modified-id="Top-bigrams-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Top bigrams</a></span><ul class="toc-item"><li><span><a href="#Top-bigrams-for-female-and-male-journalists" data-toc-modified-id="Top-bigrams-for-female-and-male-journalists-1.5.1"><span class="toc-item-num">1.5.1&nbsp;&nbsp;</span>Top bigrams for female and male journalists</a></span></li><li><span><a href="#Top-bigrams-for-female-journalists" data-toc-modified-id="Top-bigrams-for-female-journalists-1.5.2"><span class="toc-item-num">1.5.2&nbsp;&nbsp;</span>Top bigrams for female journalists</a></span></li><li><span><a href="#Top-bigrams-for-male-journalists" data-toc-modified-id="Top-bigrams-for-male-journalists-1.5.3"><span class="toc-item-num">1.5.3&nbsp;&nbsp;</span>Top bigrams for male 
journalists</a></span></li></ul></li></ul></li></ul></div>

# Gender dynamics

## Tweet data prep

### Load the tweets

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type, tweet_text
import matplotlib.pyplot as plt


# Root logger at DEBUG level so the INFO/DEBUG progress messages emitted during loading are shown.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Show floats with thousands separators and two decimals rather than scientific notation.
pd.set_option('display.float_format', '{:20,.2f}'.format)

def tweet_transform(tweet):
    """Flatten one tweet dict into the row fields kept for analysis.

    Reads `id_str`, `created_at` and the nested `user` object straight from
    the tweet; the tweet type and text come from the `tweet_type` and
    `tweet_text` helpers imported from `utils`.

    Returns a dict keyed by tweet_id, tweet_created_at, user_id, screen_name,
    tweet_type and tweet_text.
    """
    row = {}
    row['tweet_id'] = tweet['id_str']
    # created_at is parsed into a datetime by dateutil.
    row['tweet_created_at'] = date_parse(tweet['created_at'])
    author = tweet['user']
    row['user_id'] = author['id_str']
    row['screen_name'] = author['screen_name']
    row['tweet_type'] = tweet_type(tweet)
    row['tweet_text'] = tweet_text(tweet)
    return row

# Columns of the resulting frame, in display order.
tweet_columns = ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at', 'tweet_type', 'tweet_text']
# NOTE(review): dedupe_columns presumably drops rows repeating the same tweet_id — confirm in utils.load_tweet_df.
tweet_df = load_tweet_df(tweet_transform, tweet_columns, dedupe_columns=['tweet_id'])
tweet_df.count()

INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000


tweet_id            817136
user_id             817136
screen_name         817136
tweet_created_at    817136
tweet_type          817136
tweet_text          817136
dtype: int64

In [2]:
# Preview the first transformed tweets to sanity-check the parsed columns.
tweet_df.head()

Unnamed: 0,tweet_id,user_id,screen_name,tweet_created_at,tweet_type,tweet_text
0,872631046088601600,327862439,jonathanvswan,2017-06-08 01:47:08+00:00,retweet,RT @maggieNYT: Does the spokesman who smeared ...
1,872610483647516673,327862439,jonathanvswan,2017-06-08 00:25:26+00:00,retweet,RT @TomNamako: Christie floats the first defen...
2,872609618626826240,327862439,jonathanvswan,2017-06-08 00:22:00+00:00,retweet,RT @jmartNYT: Imagine this exchange in any oth...
3,872605974699311104,327862439,jonathanvswan,2017-06-08 00:07:31+00:00,retweet,RT @maggieNYT: Is Ann Coulter the only person ...
4,872603191518646276,327862439,jonathanvswan,2017-06-07 23:56:27+00:00,retweet,RT @JonathanTurley: USA Today posted my column...


## Tweeter data prep

### Prepare the tweeter data
This comes from the following sources:
1. User lookup: These are lists of users exported from SFM. These are the final set of beltway journalists. Accounts that were suspended or deleted have been removed from this list. Also, this list will include users that did not tweet (i.e., have no tweets in dataset).
2. Tweets in the dataset: Used to generate tweet counts per tweeter. However, since some beltway journalists may not have tweeted, this may be a subset of the user lookup. Also, it may include the tweets of some users that were later excluded because their accounts were suspended or deleted or determined to not be beltway journalists.
3. User info lookup: Information on users that was manually coded in the beltway journalist spreadsheet or looked up from Twitter's API. This includes some accounts that were excluded from data collection for various reasons, such as working for a foreign news organization or no longer working as a beltway journalist. Thus, this lookup is a superset of the user lookup.

Thus, the tweeter data should include tweet and user info data only from users in the user lookup.

### Load user lookup

In [3]:
user_lookup_filepaths = ('lookups/senate_press_lookup.csv',
                         'lookups/periodical_press_lookup.csv',
                         'lookups/radio_and_television_lookup.csv')

# Read only the id and screen-name columns from each list; ids stay strings.
lookup_frames = [
    pd.read_csv(path, usecols=['Uid', 'Token'], dtype={'Uid': str})
    for path in user_lookup_filepaths
]

user_lookup_df = (
    pd.concat(lookup_frames)
    .set_index('Uid')
    .rename(columns={'Token': 'screen_name'})
    .rename_axis('user_id')
    # Some users may be in multiple lists, so keep only the first row per user_id
    .loc[lambda frame: ~frame.index.duplicated()]
)

user_lookup_df.count()

screen_name    2487
dtype: int64

In [4]:
# Preview the de-duplicated lookup (indexed by user_id).
user_lookup_df.head()

Unnamed: 0_level_0,screen_name
user_id,Unnamed: 1_level_1
23455653,abettel
33919343,AshleyRParker
18580432,b_fung
399225358,b_muzz
18834692,becca_milfeld


### Load user info

In [5]:
# Column names are supplied explicitly via `names`.
user_info_columns = ['user_id', 'name', 'organization', 'position',
                     'gender', 'followers_count', 'following_count', 'tweet_count',
                     'user_created_at', 'verified', 'protected']
user_info_raw_df = pd.read_csv('source_data/user_info_lookup.csv',
                               names=user_info_columns,
                               dtype={'user_id': str})
# Index by user_id so the frame can be joined against the user lookup.
user_info_df = user_info_raw_df.set_index(['user_id'])
user_info_df.count()

name               2506
organization       2477
position           2503
gender             2505
followers_count    2506
following_count    2506
tweet_count        2506
user_created_at    2506
verified           2506
protected          2506
dtype: int64

In [6]:
# Preview the coded user attributes (indexed by user_id).
user_info_df.head()

Unnamed: 0_level_0,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20711445,"Glinski, Nina",,Freelance Reporter,F,963,507,909,Thu Feb 12 20:00:53 +0000 2009,False,False
258917371,"Enders, David",,Journalist,M,1444,484,6296,Mon Feb 28 19:52:03 +0000 2011,True,False
297046834,"Barakat, Matthew",Associated Press,Northern Virginia Correspondent,M,759,352,631,Wed May 11 20:55:24 +0000 2011,True,False
455585786,"Atkins, Kimberly",Boston Herald,Chief Washington Reporter/Columnist,F,2944,2691,6277,Thu Jan 05 08:26:46 +0000 2012,True,False
42584840,"Vlahou, Toula",CQ Roll Call,Editor & Podcast Producer,F,2703,201,6366,Tue May 26 07:41:38 +0000 2009,False,False


In [7]:
# One row per user, one column per tweet type; a missing (user, type) pair counts as 0.
tweet_type_columns = ['original', 'quote', 'reply', 'retweet']
user_tweet_count_df = (
    tweet_df
    .groupby(['user_id', 'tweet_type'])
    .size()
    .unstack()
    .fillna(0)
)
# Total tweets per user across the four tweet types.
user_tweet_count_df['tweets_in_dataset'] = user_tweet_count_df[tweet_type_columns].sum(axis=1)

In [8]:
# Start from the user lookup (the authoritative list of journalists) and
# left-join the coded user info and the per-user tweet counts on user_id.
user_summary_df = user_lookup_df.join([user_info_df, user_tweet_count_df], how='left')

# Fill NaNs with one frame-level fillna and reassign the result.
# Calling fillna(inplace=True) on a selected column is chained assignment:
# it warns in pandas 2.x and does not update the frame under Copy-on-Write.
user_summary_df = user_summary_df.fillna({
    'organization': '',
    # Users in the lookup with no tweets get zero counts.
    'original': 0,
    'quote': 0,
    'reply': 0,
    'retweet': 0,
    'tweets_in_dataset': 0,
})
user_summary_df.count()

screen_name          2487
name                 2487
organization         2487
position             2484
gender               2486
followers_count      2487
following_count      2487
tweet_count          2487
user_created_at      2487
verified             2487
protected            2487
original             2487
quote                2487
reply                2487
retweet              2487
tweets_in_dataset    2487
dtype: int64

In [9]:
# Preview the joined per-user summary (lookup + user info + tweet counts).
user_summary_df.head()

Unnamed: 0_level_0,screen_name,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected,original,quote,reply,retweet,tweets_in_dataset
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
23455653,abettel,"Bettelheim, Adriel",Politico,Health Care Editor,F,2664,1055,15990,Mon Mar 09 16:32:20 +0000 2009,True,False,289.0,12.0,6.0,52.0,359.0
33919343,AshleyRParker,"Parker, Ashley",Washington Post,White House Reporter,F,122382,2342,12433,Tue Apr 21 14:28:57 +0000 2009,True,False,172.0,67.0,11.0,120.0,370.0
18580432,b_fung,"Fung, Brian",Washington Post,Tech Reporter,M,16558,2062,44799,Sat Jan 03 15:15:57 +0000 2009,True,False,257.0,85.0,205.0,82.0,629.0
399225358,b_muzz,"Murray, Brendan",Bloomberg News,"Managing Editor, U.S. Economy",M,624,382,360,Thu Oct 27 05:34:05 +0000 2011,True,False,3.0,0.0,0.0,5.0,8.0
18834692,becca_milfeld,"Milfeld, Becca",Agence France-Presse,English Desk Editor and Journalist,F,483,993,1484,Sat Jan 10 13:58:43 +0000 2009,False,False,3.0,14.0,0.0,7.0,24.0


### Remove users with no tweets in dataset

In [10]:
# Count the users from the lookup that have no tweets in the dataset.
user_summary_df[user_summary_df.tweets_in_dataset == 0].count()

screen_name          195
name                 195
organization         195
position             195
gender               194
followers_count      195
following_count      195
tweet_count          195
user_created_at      195
verified             195
protected            195
original             195
quote                195
reply                195
retweet              195
tweets_in_dataset    195
dtype: int64

In [11]:
# Keep only users with at least one tweet in the dataset.
has_tweets = user_summary_df['tweets_in_dataset'] != 0
user_summary_df = user_summary_df.loc[has_tweets]
user_summary_df.count()

screen_name          2292
name                 2292
organization         2292
position             2289
gender               2292
followers_count      2292
following_count      2292
tweet_count          2292
user_created_at      2292
verified             2292
protected            2292
original             2292
quote                2292
reply                2292
retweet              2292
tweets_in_dataset    2292
dtype: int64

### Gender

In [12]:
# Absolute number of journalists per gender code.
gender_counts = user_summary_df['gender'].value_counts()
# Same breakdown as a percentage string with one decimal, e.g. '56.7%'.
gender_share = user_summary_df['gender'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

journalist_gender_summary_df = pd.DataFrame({'count': gender_counts, 'percentage': gender_share})
journalist_gender_summary_df

Unnamed: 0,count,percentage
M,1299,56.7%
F,993,43.3%


## Hashtag data prep

### Load hashtags from original tweets

In [13]:
# Simplify the tweet on load: keep one record per hashtag of an original tweet
def hashtag_transform(tweet):
    """Return one record per hashtag found in an original tweet.

    Tweets whose `tweet_type` is not 'original' yield an empty list, as do
    original tweets without any `entities.hashtags` entries. Each record
    carries the tweet id, the author's id and screen name, the hashtag text
    and the parsed creation time.
    """
    if tweet_type(tweet) != 'original':
        return []
    return [
        {
            'tweet_id': tweet['id_str'],
            'user_id': tweet['user']['id_str'],
            'screen_name': tweet['user']['screen_name'],
            'hashtag': tag['text'],
            'tweet_created_at': date_parse(tweet['created_at'])
        }
        for tag in tweet.get('entities', {}).get('hashtags', [])
    ]

# Columns of the hashtag frame, in display order.
hashtag_columns = ['tweet_id', 'user_id', 'screen_name', 'hashtag', 'tweet_created_at']
# NOTE(review): dedupe_columns presumably collapses repeated (tweet_id, hashtag) pairs — confirm in utils.load_tweet_df.
base_hashtags_df = load_tweet_df(hashtag_transform, hashtag_columns,
                                 dedupe_columns=['tweet_id', 'hashtag'])
base_hashtags_df.count()

INFO:root:Loading from tweets/642bf140607547cb9d4c6b1fc49772aa_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
INFO:root:Loading from tweets/9f7ed17c16a1494c8690b4053609539d_001.json.gz
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
INFO:root:Loading from tweets/41feff28312c433ab004cd822212f4c2_001.json.gz
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000


tweet_id            57083
user_id             57083
screen_name         57083
hashtag             57083
tweet_created_at    57083
dtype: int64

In [14]:
# Preview the hashtag records (one row per hashtag per original tweet).
base_hashtags_df.head()

Unnamed: 0,tweet_id,user_id,screen_name,hashtag,tweet_created_at
0,872613048531529735,18482836,kristina_wong,Hooah,2017-06-08 00:35:37+00:00
1,872074947463663616,18482836,kristina_wong,SecState,2017-06-06 12:57:24+00:00
2,871531374741684224,18482836,kristina_wong,Manchester,2017-06-05 00:57:26+00:00
3,871403986276012032,18482836,kristina_wong,London,2017-06-04 16:31:14+00:00
4,871403986276012032,18482836,kristina_wong,LondonStrong,2017-06-04 16:31:14+00:00


### Add gender

In [15]:
# Attach each tweeter's gender by matching the user_id column against user_summary_df's index.
# join() defaults to a left join, so hashtags from users missing in user_summary_df are kept with a NaN gender.
hashtags_df = base_hashtags_df.join(user_summary_df['gender'], on='user_id')
hashtags_df.count()

tweet_id            57083
user_id             57083
screen_name         57083
hashtag             57083
tweet_created_at    57083
gender              57083
dtype: int64

## Top hashtags
### Top hashtags for female and male journalists

In [16]:
# 25 most frequent hashtags across all journalists.
# Counts are on the exact hashtag text, so capitalisation variants are tallied separately.
hashtags_df['hashtag'].value_counts().head(25)

Comey                        808
GA06                         793
Trump                        785
Obamacare                    760
SCOTUS                       736
Russia                       617
FY18NDAA                     557
Trumpcare                    485
pharma                       474
BCRA                         440
ComeyHearing                 413
BREAKING                     384
ACA                          367
ParisAgreement               322
FDA                          307
muniland                     304
NBC4DC                       300
TheLead                      299
healthcare                   289
ComeyDay                     281
AHCA                         274
ComeyTestimony               273
CongressionalBaseballGame    271
wmata                        265
HealthcareBill               258
Name: hashtag, dtype: int64

### Top hashtags for female journalists

In [17]:
# 25 most frequent hashtags among journalists coded 'F' (exact-text counts).
is_female = hashtags_df['gender'] == 'F'
hashtags_df.loc[is_female, 'hashtag'].value_counts().head(25)

Obamacare                    637
Comey                        511
pharma                       474
Trumpcare                    468
Trump                        380
BCRA                         362
Russia                       358
GA06                         351
ACA                          325
SCOTUS                       304
muniland                     304
FDA                          302
biotech                      242
AHCA                         223
ComeyHearing                 204
healthcare                   199
ParisAgreement               190
drugprices                   184
BREAKING                     177
ComeyTestimony               174
ComeyDay                     164
AMR                          159
CongressionalBaseballGame    151
wmata                        146
NBC4DC                       142
Name: hashtag, dtype: int64

### Top hashtags for male journalists

In [18]:
# 25 most frequent hashtags among journalists coded 'M' (exact-text counts).
is_male = hashtags_df['gender'] == 'M'
hashtags_df.loc[is_male, 'hashtag'].value_counts().head(25)

FY18NDAA                     531
GA06                         442
SCOTUS                       432
Trump                        405
Comey                        297
TheLead                      297
Russia                       259
ComeyHearing                 209
BREAKING                     207
HealthcareBill               171
Nats                         163
NBC4DC                       158
AspenSecurity                155
ISIS                         133
ParisAgreement               132
G20                          127
Obamacare                    123
CongressionalBaseballGame    120
wmata                        119
ComeyDay                     117
CNNsotu                      115
NetNeutrality                110
NorthKorea                   109
AlexandriaShooting            99
ComeyTestimony                99
Name: hashtag, dtype: int64

## Top bigrams
Bigrams are pairs of consecutive words; here they are counted after stop words have been removed from each tweet.
### Top bigrams for female and male journalists

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 

# Add some additional stop words.
# Passed as a list because scikit-learn >= 1.2 validates `stop_words` and rejects a frozenset.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(('rt', 'https')))

# Join tweet_df with gender from user_summary_df, for original tweets only.
# user_summary_df holds only users from the user lookup, unlike user_info_df,
# which also lists excluded accounts. The hashtag section uses the same frame.
# The inner merge drops tweets whose author is not in user_summary_df.
tweets_gender_df = pd.merge(tweet_df[tweet_df.tweet_type == 'original'], user_summary_df[['gender']], right_index=True, left_on='user_id')

# ngram_range=(2,2) makes the vectorizer count bigrams only.
word_vectorizer = CountVectorizer(ngram_range=(2,2), analyzer='word', stop_words=stop_words)
sparse_matrix = word_vectorizer.fit_transform(tweets_gender_df['tweet_text'])
# Column sums give each bigram's total count over all tweets. The sparse
# .sum(axis=0) avoids adding rows one at a time with the built-in sum().
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out() when present.
get_bigram_names = getattr(word_vectorizer, 'get_feature_names_out', None) or word_vectorizer.get_feature_names
pd.DataFrame(frequencies, index=get_bigram_names(), columns=['frequency']).sort_values('frequency', ascending=False).head(50)


Unnamed: 0,frequency
health care,6708
white house,5107
president trump,2217
donald trump,1600
pres trump,1417
trump says,1217
north korea,1214
senate health,1210
trump jr,1203
obamacare repeal,1115


### Top bigrams for female journalists

In [20]:
# Re-fit the vectorizer on original tweets by journalists coded 'F' only.
female_tweet_text = tweets_gender_df.loc[tweets_gender_df.gender == 'F', 'tweet_text']
sparse_matrix = word_vectorizer.fit_transform(female_tweet_text)
# Column sums give each bigram's total count. The sparse .sum(axis=0)
# avoids adding rows one at a time with the built-in sum().
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out() when present.
get_bigram_names = getattr(word_vectorizer, 'get_feature_names_out', None) or word_vectorizer.get_feature_names
pd.DataFrame(frequencies, index=get_bigram_names(), columns=['frequency']).sort_values('frequency', ascending=False).head(50)

Unnamed: 0,frequency
health care,2345
white house,1661
president trump,649
obamacare repeal,544
senate health,483
north korea,410
trump says,410
senate gop,408
donald trump,406
trump jr,398


### Top bigrams for male journalists

In [21]:
# Re-fit the vectorizer on original tweets by journalists coded 'M' only.
male_tweet_text = tweets_gender_df.loc[tweets_gender_df.gender == 'M', 'tweet_text']
sparse_matrix = word_vectorizer.fit_transform(male_tweet_text)
# Column sums give each bigram's total count. The sparse .sum(axis=0)
# avoids adding rows one at a time with the built-in sum().
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out() when present.
get_bigram_names = getattr(word_vectorizer, 'get_feature_names_out', None) or word_vectorizer.get_feature_names
pd.DataFrame(frequencies, index=get_bigram_names(), columns=['frequency']).sort_values('frequency', ascending=False).head(50)

Unnamed: 0,frequency
health care,4363
white house,3446
president trump,1568
pres trump,1286
donald trump,1194
trump says,807
trump jr,805
north korea,804
senate health,727
travel ban,714
