In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Lift pandas' display truncation so every column and row of a displayed
# frame is rendered (None means "no limit").
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [2]:
# Load the merged tweets DataFrame from the cleaned CSV.
# NOTE(review): the preview below renders `id` in scientific notation, which
# suggests it is parsed as float64 -- confirm tweet IDs are not losing precision.
data = pd.read_csv(filepath_or_buffer='../data/cleaned_df.csv')

In [3]:
# Preview the first five rows of the loaded tweets.
data.head(n=5)

Unnamed: 0,id,screen_name,user_id,time,link,text,source,state,position,name,party,congress
0,1.003033e+18,RepBarragan,816833925456789504,2018-06-02T17:55:58-04:00,https://www.twitter.com/RepBarragan/statuses/1...,So great to welcome @SenSanders to #CA44 for a...,Twitter for iPhone,CA,Representative,Nanette Barragan,D,115
1,1.002996e+18,RepBarragan,816833925456789504,2018-06-02T15:30:27-04:00,https://www.twitter.com/RepBarragan/statuses/1...,"In Carson with @SenSanders, @Teamsters, and co...",Twitter for iPhone,CA,Representative,Nanette Barragan,D,115
2,1.00262e+18,RepBarragan,816833925456789504,2018-06-01T14:38:33-04:00,https://www.twitter.com/RepBarragan/statuses/1...,"Yesterday, immigrant rights advocates &amp; at...",Twitter for iPhone,CA,Representative,Nanette Barragan,D,115
3,1.002744e+18,RepBarragan,816833925456789504,2018-06-01T22:49:51-04:00,https://www.twitter.com/POLITICOPress/statuses...,RT @POLITICOPress Next week is our FIRST #Wome...,Twitter for iPhone,CA,Representative,Nanette Barragan,D,115
4,1.002744e+18,RepBarragan,816833925456789504,2018-06-01T22:50:17-04:00,https://www.twitter.com/RepJayapal/statuses/10...,RT @RepJayapal We (62 members of the House) ar...,Twitter for iPhone,CA,Representative,Nanette Barragan,D,115


In [4]:
# Count the tweets (non-null `id` values) for each rep / senator and expose
# the result as a two-column frame: screen_name, tweet_count.
data_grouped = (
    data.groupby('screen_name')['id']
    .count()
    .reset_index(name='tweet_count')
)

In [5]:
# Top 5 most prolific tweeters: order accounts by tweet_count, largest first.
data_grouped.sort_values('tweet_count', ascending=False).head(5)


Unnamed: 0,screen_name,tweet_count
203,RepDonBeyer,18325
57,JohnCornyn,17369
100,RepAndyBiggsAZ,14878
208,RepDwightEvans,12698
62,LacyClayMO1,10380


In [6]:
# Attach each account's tweet_count to every one of its tweet rows.
# data_grouped holds one row per screen_name, so this is a many-to-one lookup:
# a left join keeps the rows of `data` in their original order (an outer join is
# documented to sort the join keys lexicographically), and validate= raises if
# data_grouped ever contains duplicate screen_names that would multiply rows.
df = pd.merge(data, data_grouped, on='screen_name', how='left', validate='many_to_one')

In [7]:
# Sanity check on the merge: the row count should be unchanged and exactly one
# column (tweet_count) should have been added.
print(
    f'Pre-tweet count merge size: {data.shape}',
    f'Post-tweet count merge size: {df.shape}',
    sep='\n',
)

Pre-tweet count merge size: (1492855, 12)
Post-tweet count merge size: (1492855, 13)


In [8]:
# Cross-check the merged frame: count the rows per account in df and show the
# five largest, for comparison with the top of data_grouped.
df['screen_name'].value_counts().head(5)

# The tweet counts from 'data_grouped' above match the number
# of rows per account in the larger df.

RepDonBeyer       18325
JohnCornyn        17369
RepAndyBiggsAZ    14878
RepDwightEvans    12698
LacyClayMO1       10380
Name: screen_name, dtype: int64

In [9]:
# Build one long string per account by joining all of its tweets with spaces.
# The result is indexed by screen_name with a single column, text_concat.
# NOTE(review): str.join only accepts strings, so this assumes no tweet has a
# missing (NaN) `text` value -- confirm against the cleaning step.
df_text_concat = (
    df.groupby('screen_name')['text']
    .agg(' '.join)
    .to_frame(name='text_concat')
)

df_text_concat.head()

Unnamed: 0_level_0,text_concat
screen_name,Unnamed: 1_level_1
AnthonyBrownMD4,Congratulations @AmyMcGrathKY Let’s bring it h...
AustinScottGA08,RT @AustinScottGA08 The summer months are some...
BennieGThompson,"Once again, your hard earned dollars are being..."
BettyMcCollum04,"And if you’re looking to provide help, here’s ..."
BillCassidy,"Today's jobs report says ""the unemployment rat..."


In [10]:
# Record the length of each concatenated string as a quick check that the
# per-account text has actually been combined.
df_text_concat['text_length'] = df_text_concat['text_concat'].map(len)

df_text_concat.head()

Unnamed: 0_level_0,text_concat,text_length
screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AnthonyBrownMD4,Congratulations @AmyMcGrathKY Let’s bring it h...,19890
AustinScottGA08,RT @AustinScottGA08 The summer months are some...,309793
BennieGThompson,"Once again, your hard earned dollars are being...",516881
BettyMcCollum04,"And if you’re looking to provide help, here’s ...",1043220
BillCassidy,"Today's jobs report says ""the unemployment rat...",620101


In [11]:
# Attach each account's concatenated text (and its length) to every tweet row.
# df_text_concat is keyed by screen_name with one entry per account, so a left
# join keeps df's rows in their original order (an outer join is documented to
# sort the join keys lexicographically) -- the later keep='last'
# de-duplication depends on that order. validate= raises if the right side
# ever has duplicate keys that would multiply rows.
df = pd.merge(df, df_text_concat, on='screen_name', how='left', validate='many_to_one')

# Confirm no rows were lost or duplicated by examining the shape of the frame.
df.shape

(1492855, 15)

In [12]:
# Preview the merged frame, now carrying tweet_count, text_concat and text_length.
df.head(n=5)

Unnamed: 0,id,screen_name,user_id,time,link,text,source,state,position,name,party,congress,tweet_count,text_concat,text_length
0,1.003033e+18,RepBarragan,816833925456789504,2018-06-02T17:55:58-04:00,https://www.twitter.com/RepBarragan/statuses/1...,So great to welcome @SenSanders to #CA44 for a...,Twitter for iPhone,CA,Representative,Nanette Barragan,D,115,4553,So great to welcome @SenSanders to #CA44 for a...,1571519
1,1.002996e+18,RepBarragan,816833925456789504,2018-06-02T15:30:27-04:00,https://www.twitter.com/RepBarragan/statuses/1...,"In Carson with @SenSanders, @Teamsters, and co...",Twitter for iPhone,CA,Representative,Nanette Barragan,D,115,4553,So great to welcome @SenSanders to #CA44 for a...,1571519
2,1.00262e+18,RepBarragan,816833925456789504,2018-06-01T14:38:33-04:00,https://www.twitter.com/RepBarragan/statuses/1...,"Yesterday, immigrant rights advocates &amp; at...",Twitter for iPhone,CA,Representative,Nanette Barragan,D,115,4553,So great to welcome @SenSanders to #CA44 for a...,1571519
3,1.002744e+18,RepBarragan,816833925456789504,2018-06-01T22:49:51-04:00,https://www.twitter.com/POLITICOPress/statuses...,RT @POLITICOPress Next week is our FIRST #Wome...,Twitter for iPhone,CA,Representative,Nanette Barragan,D,115,4553,So great to welcome @SenSanders to #CA44 for a...,1571519
4,1.002744e+18,RepBarragan,816833925456789504,2018-06-01T22:50:17-04:00,https://www.twitter.com/RepJayapal/statuses/10...,RT @RepJayapal We (62 members of the House) ar...,Twitter for iPhone,CA,Representative,Nanette Barragan,D,115,4553,So great to welcome @SenSanders to #CA44 for a...,1571519


In [13]:
# Collapse to one row per account: only the concatenated text column is needed
# for future analyses. keep='last' means the remaining per-account fields
# (name, state, party, ...) come from whichever row appears last in df.
df = df.drop_duplicates(subset='screen_name', keep='last')

print(df.shape, '----------', sep='\n')
df.head()

(626, 15)
----------


Unnamed: 0,id,screen_name,user_id,time,link,text,source,state,position,name,party,congress,tweet_count,text_concat,text_length
4552,1.268317e+18,RepBarragan,816833925456789504,2020-06-03T19:00:18-04:00,https://www.twitter.com/RepBarragan/statuses/1...,The murder of #GeorgeFloyd by police was anoth...,Twitter Web App,CA,Representative,Nanette Diaz Barrag_n,D,116,4553,So great to welcome @SenSanders to #CA44 for a...,1571519
13345,1.268363e+18,ChrisMurphyCT,150078976,2020-06-03T22:03:29-04:00,https://www.twitter.com/ChrisMurphyCT/statuses...,The blame for the lack of federal leadership o...,Twitter Web App,CT,Senator,Christopher Murphy,D,116,8793,"New apartments, new businesses and new restaur...",2801192
19186,1.268304e+18,RepSwalwell,942156122,2020-06-03T18:10:04-04:00,https://www.twitter.com/yashar/statuses/126830...,RT @yashar James Mattis Denounces President Tr...,Twitter for iPhone,CA,Representative,Eric Swalwell,D,116,5841,"Literally, the easiest fix we could make to sa...",1843540
37511,1.268352e+18,RepDonBeyer,2962868158,2020-06-03T21:20:23-04:00,https://www.twitter.com/RepDonBeyer/statuses/1...,Calling for military violence against American...,Twitter for iPhone,VA,Representative,Donald Beyer Jr,D,116,18325,9 Baltimore students were killed with guns dur...,7087050
40677,1.268379e+18,WarrenDavidson,742735530287304704,2020-06-03T23:10:01-04:00,https://www.twitter.com/WarrenDavidson/statuse...,Heaven is diverse. \nRevelation 7:9-12 http://...,Twitter for iPhone,OH,Representative,Warren Davidson,R,116,3166,#NetNeutrality Reminder: Obama used 1934 FCC r...,958737


In [14]:
# Remove the tweet-level columns; they are not meaningful once the frame holds
# a single row per account.
tweet_level_columns = ['id', 'user_id', 'time', 'link', 'text', 'source', 'congress']
df = df.drop(columns=tweet_level_columns)

print(df.shape, '----------', sep='\n')
df.head()

(626, 8)
----------


Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length
4552,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome @SenSanders to #CA44 for a...,1571519
13345,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,"New apartments, new businesses and new restaur...",2801192
19186,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,"Literally, the easiest fix we could make to sa...",1843540
37511,RepDonBeyer,VA,Representative,Donald Beyer Jr,D,18325,9 Baltimore students were killed with guns dur...,7087050
40677,WarrenDavidson,OH,Representative,Warren Davidson,R,3166,#NetNeutrality Reminder: Obama used 1934 FCC r...,958737


In [15]:
# Reset to a clean 0..n-1 index. drop=True discards the old index labels
# instead of inserting them as an 'index' column, so no follow-up drop of that
# column is needed.
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length
0,RepBarragan,CA,Representative,Nanette Diaz Barrag_n,D,4553,So great to welcome @SenSanders to #CA44 for a...,1571519
1,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,8793,"New apartments, new businesses and new restaur...",2801192
2,RepSwalwell,CA,Representative,Eric Swalwell,D,5841,"Literally, the easiest fix we could make to sa...",1843540
3,RepDonBeyer,VA,Representative,Donald Beyer Jr,D,18325,9 Baltimore students were killed with guns dur...,7087050
4,WarrenDavidson,OH,Representative,Warren Davidson,R,3166,#NetNeutrality Reminder: Obama used 1934 FCC r...,958737


In [16]:
# Export the cleaned and concatenated DataFrame, without the index column.
# to_csv returns None when it is given a path, so the result is deliberately
# not bound to a name.
df.to_csv('../data/concat_df.csv', index=False)