# Data Wrangling

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

## Create Tweet Nodes

In [2]:
tweets = pd.read_csv("tweet-user-edges.csv")

In [3]:
tweets.head()

Unnamed: 0,tweetId,UserId,twitterName,dateCreated,First Name,Last Name,Political Affiliation
0,1442922344037552128,83508245,dckurek,2021-09-28T18:41:09.000Z,Damien,Kurek,Conservative
1,1442922342644998150,83508245,dckurek,2021-09-28T18:41:09.000Z,Damien,Kurek,Conservative
2,1440394467337334798,83508245,dckurek,2021-09-21T19:16:16.000Z,Damien,Kurek,Conservative
3,1440126344067629056,83508245,dckurek,2021-09-21T01:30:51.000Z,Damien,Kurek,Conservative
4,1449082201023389703,4568748862,ziad_aboultaif,2021-10-15T18:38:14.000Z,Ziad,Aboultaif,Conservative


In [4]:
justTweets = tweets[['tweetId', 'dateCreated']]

In [5]:
justTweets.head()

Unnamed: 0,tweetId,dateCreated
0,1442922344037552128,2021-09-28T18:41:09.000Z
1,1442922342644998150,2021-09-28T18:41:09.000Z
2,1440394467337334798,2021-09-21T19:16:16.000Z
3,1440126344067629056,2021-09-21T01:30:51.000Z
4,1449082201023389703,2021-10-15T18:38:14.000Z


In [6]:
justTweets = justTweets.set_index('tweetId')

In [7]:
justTweets.to_csv('tweet_nodes.csv')

## Create MP Nodes 

In [8]:
mps_df = tweets.drop(['tweetId','dateCreated'], axis=1)

In [9]:
mps_df.head()

Unnamed: 0,UserId,twitterName,First Name,Last Name,Political Affiliation
0,83508245,dckurek,Damien,Kurek,Conservative
1,83508245,dckurek,Damien,Kurek,Conservative
2,83508245,dckurek,Damien,Kurek,Conservative
3,83508245,dckurek,Damien,Kurek,Conservative
4,4568748862,ziad_aboultaif,Ziad,Aboultaif,Conservative


In [10]:
mps_df = mps_df.drop_duplicates()

In [11]:
mps_df.head()

Unnamed: 0,UserId,twitterName,First Name,Last Name,Political Affiliation
0,83508245,dckurek,Damien,Kurek,Conservative
4,4568748862,ziad_aboultaif,Ziad,Aboultaif,Conservative
14,44070432,LailaGoodridge,Laila,Goodridge,Conservative
21,1425866189780160514,POhara2021,Patrick,O'Hara,Bloc Québécois
34,1420552213253038080,NathalieSincla3,Nathalie,Sinclair-Desgagné,Bloc Québécois


In [12]:
mps_df = mps_df.set_index('UserId')

In [13]:
mps_df.to_csv('mps_nodes.csv')

## Get Retweeting User Nodes

In [14]:
tweet_retweet_df = pd.read_csv('tweet-retweet.csv')

In [15]:
tweet_retweet_df.tail()

Unnamed: 0,tweetId,userId
17599,1443563877774602249,422122218
17600,1443563877774602249,2176915064
17601,1443563877774602249,37552089
17602,1443563877774602249,1380961578288316416
17603,1443563877774602249,153515039


In [16]:
retweeters_df = tweet_retweet_df['userId']

In [17]:
retweeters_df.tail()

17599              422122218
17600             2176915064
17601               37552089
17602    1380961578288316416
17603              153515039
Name: userId, dtype: int64

In [30]:
new_df = retweeters_df.drop_duplicates()
new_df = new_df.set_index('userId')

AttributeError: 'Series' object has no attribute 'set_index'

In [22]:
tweet_retweet_df['userId'].size

17604

In [23]:
new_df.size

6540

In [26]:

new_df.to_csv('retweetingUsers_nodes.csv')

AttributeError: 'Series' object has no attribute 'set_index'

In [32]:
new_df = pd.read_csv('retweetingUsers_nodes.csv')

In [34]:
new_df = new_df.set_index('userId')

In [35]:
new_df.head()

Unnamed: 0_level_0,Unnamed: 0
userId,Unnamed: 1_level_1
260265236,0
787774368566816768,1
857464194626727936,2
1307814874999742464,3
2550603348,4


In [36]:
new_df = new_df.drop('Unnamed: 0', axis = 1)

In [37]:
new_df.to_csv('retweetingUsers_nodes.csv')

## Edges for MPs and Tweets

In [38]:
mp_tweets = pd.read_csv('tweet-user-edges.csv')

In [42]:
mp_tweets.head()

Unnamed: 0,tweetId,UserId
0,1442922344037552128,83508245
1,1442922342644998150,83508245
2,1440394467337334798,83508245
3,1440126344067629056,83508245
4,1449082201023389703,4568748862


In [41]:
mp_tweets = mp_tweets[['tweetId', 'UserId']]

In [45]:
mp_tweets = mp_tweets.rename(columns={'tweetId': 'Source', 'UserId': 'Target'})

In [48]:
mp_tweets = mp_tweets.set_index('Source')

In [50]:
mp_tweets.head()

Unnamed: 0_level_0,Target
Source,Unnamed: 1_level_1
1442922344037552128,83508245
1442922342644998150,83508245
1440394467337334798,83508245
1440126344067629056,83508245
1449082201023389703,4568748862


In [49]:
mp_tweets.to_csv('tweet-mp-edges.csv')

# Subset Network Creation

Take a subset of MP's to show off the network

In [51]:
mps_nodes = pd.read_csv('mps_nodes.csv')

In [52]:
mps_nodes_subset = mps_nodes.sample(5)

In [53]:
mps_nodes_subset.head()

Unnamed: 0,UserId,twitterName,First Name,Last Name,Political Affiliation
94,564207331,MichaelKramSK,Michael,Kram,Conservative
180,2352629420,JennaSudds,Jenna,Sudds,Liberal
24,1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
60,4504267216,m_pauze,Monique,Pauzé,Bloc Québécois
51,1686740173,LarryMaguireMP,Larry,Maguire,Conservative


Get all tweets from these users

In [55]:
tweet_mp_edges = pd.read_csv('tweet-mp-edges.csv')

In [56]:
tweet_mp_edges.head()

Unnamed: 0,Source,Target
0,1442922344037552128,83508245
1,1442922342644998150,83508245
2,1440394467337334798,83508245
3,1440126344067629056,83508245
4,1449082201023389703,4568748862


Run a join to get the correct tweets

In [57]:
tweet_mp_edges_subset = pd.merge(tweet_mp_edges, mps_nodes_subset, left_on='Target', right_on='UserId')

In [59]:
tweet_mp_edges_subset.head(100)

Unnamed: 0,Source,Target,UserId,twitterName,First Name,Last Name,Political Affiliation
0,1445512064978542592,1044738781171961856,1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
1,1444012251720011777,1044738781171961856,1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
2,1443606785194033157,1044738781171961856,1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
3,1440809249307697153,1044738781171961856,1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
4,1439760917349109763,1044738781171961856,1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
...,...,...,...,...,...,...,...
86,1440154571511193602,2352629420,2352629420,JennaSudds,Jenna,Sudds,Liberal
87,1440114449336213504,2352629420,2352629420,JennaSudds,Jenna,Sudds,Liberal
88,1440010814644264961,2352629420,2352629420,JennaSudds,Jenna,Sudds,Liberal
89,1439945181780979713,2352629420,2352629420,JennaSudds,Jenna,Sudds,Liberal


Get the Retweeters of these tweets 

In [66]:
tweet_retweet_edges = pd.read_csv('tweet-retweet-edges.csv')
tweet_retweet_edges = tweet_retweet_edges.rename(columns={'Target': 'Tweet', 'Source': 'Retweeter'})

In [67]:
tweet_retweet_edges.head()

Unnamed: 0,Tweet,Retweeter
0,1442922344037552128,260265236
1,1442922344037552128,787774368566816768
2,1442922344037552128,857464194626727936
3,1442922344037552128,1307814874999742464
4,1442922344037552128,2550603348


Run a join on the tweet ids to get that subset 

In [68]:
tweet_retweet_edges_subset = pd.merge(tweet_retweet_edges, tweet_mp_edges_subset, left_on='Tweet', right_on='Source')

In [71]:
tweet_retweet_edges_subset.head(1000)

Unnamed: 0,Tweet,Retweeter,Source,Target,UserId,twitterName,First Name,Last Name,Political Affiliation
0,1445512064978542592,1016018605907906560,1445512064978542592,1044738781171961856,1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
1,1445512064978542592,78627903,1445512064978542592,1044738781171961856,1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
2,1445512064978542592,1103082890030604288,1445512064978542592,1044738781171961856,1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
3,1445512064978542592,98449349,1445512064978542592,1044738781171961856,1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
4,1444012251720011777,1293510025680478208,1444012251720011777,1044738781171961856,1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
...,...,...,...,...,...,...,...,...,...
412,1439945181780979713,2324359094,1439945181780979713,2352629420,2352629420,JennaSudds,Jenna,Sudds,Liberal
413,1439945181780979713,302005426,1439945181780979713,2352629420,2352629420,JennaSudds,Jenna,Sudds,Liberal
414,1439945181780979713,177651264,1439945181780979713,2352629420,2352629420,JennaSudds,Jenna,Sudds,Liberal
415,1439945181780979713,1079374636608114689,1439945181780979713,2352629420,2352629420,JennaSudds,Jenna,Sudds,Liberal


# Clean DF's and persist

In [75]:
clean_mps_nodes_subset = mps_nodes_subset.set_index('UserId')

In [76]:
clean_mps_nodes_subset.head()

Unnamed: 0_level_0,twitterName,First Name,Last Name,Political Affiliation
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
564207331,MichaelKramSK,Michael,Kram,Conservative
2352629420,JennaSudds,Jenna,Sudds,Liberal
1044738781171961856,GregMcLeanYYC,Greg,McLean,Conservative
4504267216,m_pauze,Monique,Pauzé,Bloc Québécois
1686740173,LarryMaguireMP,Larry,Maguire,Conservative


In [77]:
clean_mps_nodes_subset.to_csv('SubsetNetwork/mp_nodes.csv')

In [78]:
clean_tweet_mp_edges_subset = tweet_mp_edges_subset[['Source','Target']]
clean_tweet_mp_edges_subset =clean_tweet_mp_edges_subset.set_index('Source')
clean_tweet_mp_edges_subset.head()

Unnamed: 0_level_0,Target
Source,Unnamed: 1_level_1
1445512064978542592,1044738781171961856
1444012251720011777,1044738781171961856
1443606785194033157,1044738781171961856
1440809249307697153,1044738781171961856
1439760917349109763,1044738781171961856


In [80]:
clean_tweet_mp_edges_subset.to_csv('SubsetNetwork/tweet_mp_edge.csv')

In [82]:
clean_tweet_retweet_edges_subset = tweet_retweet_edges_subset[['Tweet', 'Retweeter']]
clean_tweet_retweet_edges_subset = clean_tweet_retweet_edges_subset.rename(columns={'Tweet': 'Target', 'Retweeter': 'Source'})
clean_tweet_retweet_edges_subset = clean_tweet_retweet_edges_subset.set_index('Target')
clean_tweet_retweet_edges_subset.head()

Unnamed: 0_level_0,Source
Target,Unnamed: 1_level_1
1445512064978542592,1016018605907906560
1445512064978542592,78627903
1445512064978542592,1103082890030604288
1445512064978542592,98449349
1444012251720011777,1293510025680478208
