# 03. Tweet Concatenation

This notebook combines all of the tweets from each user (Senator or Representative) into a single document per user, for further processing with spaCy, CVEC, and TF-IDF.

## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Turn off pandas' display truncation: a value of None means "no limit", so
# every column (and every row) of a displayed frame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Load the merged tweets table (one row per tweet) produced earlier in the
# project; the path is relative to this notebook's directory.
data = pd.read_csv(filepath_or_buffer='../data/cleaned_df.csv')

In [3]:
# Preview the first five tweets to confirm the columns loaded as expected
data.head(5)

Unnamed: 0,id,screen_name,user_id,time,link,text,source,state,position,name,party,congress
0,1267682738053808129,CongressmanRaja,814179031956488192,2020-06-02T01:01:31-04:00,https://www.twitter.com/CongressmanRaja/status...,Tonight: “The president had staged an elaborat...,Twitter for iPhone,IL,Representative,Raja Krishnamoorthi,D,116
1,1267681644456140800,CongressmanRaja,814179031956488192,2020-06-02T00:57:11-04:00,https://www.twitter.com/CongressmanRaja/status...,"If that doesn’t make you mad, read it again. h...",Twitter for iPhone,IL,Representative,Raja Krishnamoorthi,D,116
2,1267670971730755594,CongressmanRaja,814179031956488192,2020-06-02T00:14:46-04:00,https://www.twitter.com/JStein_WaPo/statuses/1...,RT @JStein_WaPo This reporter should come out ...,Twitter for iPhone,IL,Representative,Raja Krishnamoorthi,D,116
3,1267669652659978240,CongressmanRaja,814179031956488192,2020-06-02T00:09:32-04:00,https://www.twitter.com/CongressmanRaja/status...,"@KevinMKruse Thank you, and I also loved your ...",Twitter for iPhone,IL,Representative,Raja Krishnamoorthi,D,116
4,1267859609995395081,CongressmanRaja,814179031956488192,2020-06-02T12:44:21-04:00,https://www.twitter.com/CongressmanRaja/status...,I'm proud to call @GerryConnolly a colleague a...,Twitter Web App,IL,Representative,Raja Krishnamoorthi,D,116


## Concatenation of Tweets for Each Senator / Rep

In [4]:
# Tweets per account: count the (non-null) tweet ids inside each screen_name
# group, then turn the result into a two-column frame whose count column is
# named tweet_count for clarity.
data_grouped = (
    data.groupby('screen_name')['id']
    .count()
    .reset_index(name='tweet_count')
)

In [5]:
# The five most prolific accounts: order by tweet_count, largest first,
# and keep the leading five rows
data_grouped.sort_values('tweet_count', ascending=False).head(5)

Unnamed: 0,screen_name,tweet_count
45,JohnCornyn,13257
78,RepAndyBiggsAZ,12647
164,RepDonBeyer,12182
48,LacyClayMO1,10256
169,RepDwightEvans,9702


In [6]:
# Attach each account's tweet_count to every one of its tweet rows.
# A left join keeps the rows of `data` in their original order and can never
# add rows. An outer join gains nothing here (every screen_name in
# data_grouped comes from data) and, per the pandas docs, sorts the join keys
# lexicographically, so the row order could differ between pandas versions.
# validate= makes the merge fail loudly if data_grouped ever held the same
# screen_name twice, which would silently duplicate tweets.
df = pd.merge(data, data_grouped, on='screen_name', how='left',
              validate='many_to_one')

In [7]:
# Merge sanity check: the row count must be unchanged and the column count
# should have grown by exactly one (tweet_count).
print(
    f'Pre-tweet count merge size: {data.shape}',
    f'Post-tweet count merge size: {df.shape}',
    sep='\n',
)

Pre-tweet count merge size: (1115000, 12)
Post-tweet count merge size: (1115000, 13)


In [8]:
# Cross-check the merged frame: rows per screen_name, most frequent first
# (value_counts sorts in descending order by default), limited to the top five.
# These numbers should equal the tweet_count values from 'data_grouped' above,
# i.e. the per-user row count in the larger df matches the computed counts.
df['screen_name'].value_counts().head(5)

JohnCornyn        13257
RepAndyBiggsAZ    12647
RepDonBeyer       12182
LacyClayMO1       10256
RepDwightEvans     9702
Name: screen_name, dtype: int64

In [9]:
# Build one document per account by joining all of its tweets with a single
# space. The result is indexed by screen_name and has one column, text_concat.
#
# Missing tweet text is dropped and every remaining value is coerced to str
# before joining: a bare ' '.join raises TypeError as soon as a group contains
# a NaN (which is how pandas represents an empty text field read from CSV).
# For groups without missing text the output is identical to a plain join.
df_text_concat = (
    df.groupby('screen_name')
    .agg({'text': lambda tweets: ' '.join(tweets.dropna().astype(str))})
    .rename(columns={'text': 'text_concat'})
)

df_text_concat.head()

Unnamed: 0_level_0,text_concat
screen_name,Unnamed: 1_level_1
AustinScottGA08,I am deeply saddened to hear of former Rep. Sa...
BennieGThompson,It is unfortunate that the President used a ch...
BettyMcCollum04,"And if you’re looking to provide help, here’s ..."
BillCassidy,Central Private School broke ground last week ...
BillPascrell,Overnight trump barricaded the White House per...


In [10]:
# Record the character length of each concatenated document; lengths far
# larger than a single tweet confirm that the text really was combined.
df_text_concat = df_text_concat.assign(
    text_length=lambda frame: frame['text_concat'].map(len)
)

df_text_concat.head()

Unnamed: 0_level_0,text_concat,text_length
screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AustinScottGA08,I am deeply saddened to hear of former Rep. Sa...,154594
BennieGThompson,It is unfortunate that the President used a ch...,405870
BettyMcCollum04,"And if you’re looking to provide help, here’s ...",1034566
BillCassidy,Central Private School broke ground last week ...,425080
BillPascrell,Overnight trump barricaded the White House per...,2362375


In [11]:
# Merge the concatenated text (and its length) back onto the larger df.
# 'screen_name' is a column in df and the index level of df_text_concat;
# pandas resolves `on` against either.
# A left join keeps df's rows in their existing order and cannot add rows,
# whereas an outer join is documented to sort keys lexicographically and adds
# nothing here because df_text_concat was derived from df itself.
# validate= guards against a duplicated screen_name on the right-hand side.
df = pd.merge(df, df_text_concat, on='screen_name', how='left',
              validate='many_to_one')

# confirming no lost data by examining shape of data frame
# (same number of rows as before, two additional columns)
df.shape

(1115000, 15)

In [12]:
# Preview the merged frame: every tweet row now carries its account's
# tweet_count, text_concat and text_length
df.head(5)

Unnamed: 0,id,screen_name,user_id,time,link,text,source,state,position,name,party,congress,tweet_count,text_concat,text_length
0,1267682738053808129,CongressmanRaja,814179031956488192,2020-06-02T01:01:31-04:00,https://www.twitter.com/CongressmanRaja/status...,Tonight: “The president had staged an elaborat...,Twitter for iPhone,IL,Representative,Raja Krishnamoorthi,D,116,4845,Tonight: “The president had staged an elaborat...,1623390
1,1267681644456140800,CongressmanRaja,814179031956488192,2020-06-02T00:57:11-04:00,https://www.twitter.com/CongressmanRaja/status...,"If that doesn’t make you mad, read it again. h...",Twitter for iPhone,IL,Representative,Raja Krishnamoorthi,D,116,4845,Tonight: “The president had staged an elaborat...,1623390
2,1267670971730755594,CongressmanRaja,814179031956488192,2020-06-02T00:14:46-04:00,https://www.twitter.com/JStein_WaPo/statuses/1...,RT @JStein_WaPo This reporter should come out ...,Twitter for iPhone,IL,Representative,Raja Krishnamoorthi,D,116,4845,Tonight: “The president had staged an elaborat...,1623390
3,1267669652659978240,CongressmanRaja,814179031956488192,2020-06-02T00:09:32-04:00,https://www.twitter.com/CongressmanRaja/status...,"@KevinMKruse Thank you, and I also loved your ...",Twitter for iPhone,IL,Representative,Raja Krishnamoorthi,D,116,4845,Tonight: “The president had staged an elaborat...,1623390
4,1267859609995395081,CongressmanRaja,814179031956488192,2020-06-02T12:44:21-04:00,https://www.twitter.com/CongressmanRaja/status...,I'm proud to call @GerryConnolly a colleague a...,Twitter Web App,IL,Representative,Raja Krishnamoorthi,D,116,4845,Tonight: “The president had staged an elaborat...,1623390


In [13]:
# Collapse to one row per account. Every row of an account carries the same
# concatenated text, so only a single row (the last one encountered) is kept
# for the future analyses.
df = df.drop_duplicates(subset='screen_name', keep='last')

print(df.shape, '----------', sep='\n')
df.head()

(524, 15)
----------


Unnamed: 0,id,screen_name,user_id,time,link,text,source,state,position,name,party,congress,tweet_count,text_concat,text_length
4844,1268372252548124673,CongressmanRaja,814179031956488192,2020-06-03T22:41:24-04:00,https://www.twitter.com/CongressmanRaja/status...,Somehow this is yet another example of this Wh...,Twitter for iPhone,IL,Representative,Raja Krishnamoorthi,D,116,4845,Tonight: “The president had staged an elaborat...,1623390
8431,1268277347117129731,RepMcGovern,242426145,2020-06-03T16:24:17-04:00,https://www.twitter.com/RepMcGovern/statuses/1...,Let's talk police militarization.\n\nRight now...,Twitter Web App,MA,Representative,James McGovern,D,116,3587,Who the hell does Donald Trump think he is tea...,1313476
11845,1268366811948634113,justinamash,233842454,2020-06-03T22:19:47-04:00,https://www.twitter.com/RepPressley/statuses/1...,RT @RepPressley Qualified immunity shields pol...,Twitter for iPhone,MI,Representative,Justin Amash,L,116,3414,"Rioting and looting—that’s selfish, destructiv...",811799
18194,1268362708795240451,ChrisMurphyCT,150078976,2020-06-03T22:03:29-04:00,https://www.twitter.com/ChrisMurphyCT/statuses...,The blame for the lack of federal leadership o...,Twitter Web App,CT,Senator,Christopher Murphy,D,116,6349,It’s important that every American have the ch...,2076035
21392,1268263947767078919,NormaJTorres,236279233,2020-06-03T15:31:03-04:00,https://www.twitter.com/NormaJTorres/statuses/...,Our community is grieving from the painful eve...,Twitter Web App,CA,Representative,Norma Torres,D,116,3198,"@jolingkent take care, thank you for continuin...",1154739


In [14]:
# Remove the per-tweet columns, which no longer mean anything now that each
# row stands for a whole account rather than an individual tweet.
per_tweet_columns = ['id', 'user_id', 'time', 'link', 'text', 'source', 'congress']
df = df.drop(columns=per_tweet_columns)

print(df.shape, '----------', sep='\n')
df.head()

(524, 8)
----------


Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length
4844,CongressmanRaja,IL,Representative,Raja Krishnamoorthi,D,4845,Tonight: “The president had staged an elaborat...,1623390
8431,RepMcGovern,MA,Representative,James McGovern,D,3587,Who the hell does Donald Trump think he is tea...,1313476
11845,justinamash,MI,Representative,Justin Amash,L,3414,"Rioting and looting—that’s selfish, destructiv...",811799
18194,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,6349,It’s important that every American have the ch...,2076035
21392,NormaJTorres,CA,Representative,Norma Torres,D,3198,"@jolingkent take care, thank you for continuin...",1154739


In [15]:
# Resetting the index so rows are numbered 0..n-1 again.
# drop=True discards the old index values instead of inserting them as a new
# 'index' column, so no follow-up column drop is required.
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,screen_name,state,position,name,party,tweet_count,text_concat,text_length
0,CongressmanRaja,IL,Representative,Raja Krishnamoorthi,D,4845,Tonight: “The president had staged an elaborat...,1623390
1,RepMcGovern,MA,Representative,James McGovern,D,3587,Who the hell does Donald Trump think he is tea...,1313476
2,justinamash,MI,Representative,Justin Amash,L,3414,"Rioting and looting—that’s selfish, destructiv...",811799
3,ChrisMurphyCT,CT,Senator,Christopher Murphy,D,6349,It’s important that every American have the ch...,2076035
4,NormaJTorres,CA,Representative,Norma Torres,D,3198,"@jolingkent take care, thank you for continuin...",1154739


In [16]:
# Exporting the cleaned and concatenated dataframe for further processing.
# index=False keeps the row index out of the file.
# to_csv returns None when it is given a path, so the result is not assigned.
df.to_csv('../data/concat_df.csv', index=False)