# Data Wrangling

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

## Create Tweet Nodes

In [2]:
# Load the raw tweet/author table; the tweet-node and MP-node tables below are both cut from it.
tweets = pd.read_csv(filepath_or_buffer="tweet-user-edges.csv")

In [3]:
# Preview the first five rows of the loaded table.
tweets.head(n=5)

Unnamed: 0,tweetId,UserId,twitterName,dateCreated,First Name,Last Name,Political Affiliation
0,1442922344037552128,83508245,dckurek,2021-09-28T18:41:09.000Z,Damien,Kurek,Conservative
1,1442922342644998150,83508245,dckurek,2021-09-28T18:41:09.000Z,Damien,Kurek,Conservative
2,1440394467337334798,83508245,dckurek,2021-09-21T19:16:16.000Z,Damien,Kurek,Conservative
3,1440126344067629056,83508245,dckurek,2021-09-21T01:30:51.000Z,Damien,Kurek,Conservative
4,1449082201023389703,4568748862,ziad_aboultaif,2021-10-15T18:38:14.000Z,Ziad,Aboultaif,Conservative


In [4]:
# A tweet node only needs its id and creation timestamp; the author columns are left out.
justTweets = tweets.loc[:, ['tweetId', 'dateCreated']]

In [5]:
# Preview the two-column tweet table.
justTweets.head(n=5)

Unnamed: 0,tweetId,dateCreated
0,1442922344037552128,2021-09-28T18:41:09.000Z
1,1442922342644998150,2021-09-28T18:41:09.000Z
2,1440394467337334798,2021-09-21T19:16:16.000Z
3,1440126344067629056,2021-09-21T01:30:51.000Z
4,1449082201023389703,2021-10-15T18:38:14.000Z


In [6]:
# Promote tweetId to the index so the export below writes it as the leading column.
# Not re-runnable: once tweetId is the index, a second run raises KeyError.
justTweets = justTweets.set_index(keys='tweetId')

In [7]:
# Persist the tweet nodes (index tweetId plus dateCreated).
justTweets.to_csv(path_or_buf='tweet_nodes.csv')

## Create MP Nodes 

In [8]:
# Strip the per-tweet columns so that only the author (MP) attributes remain.
mps_df = tweets.drop(columns=['tweetId', 'dateCreated'])

In [9]:
# Preview: each MP still appears once per tweet at this point.
mps_df.head(n=5)

Unnamed: 0,UserId,twitterName,First Name,Last Name,Political Affiliation
0,83508245,dckurek,Damien,Kurek,Conservative
1,83508245,dckurek,Damien,Kurek,Conservative
2,83508245,dckurek,Damien,Kurek,Conservative
3,83508245,dckurek,Damien,Kurek,Conservative
4,4568748862,ziad_aboultaif,Ziad,Aboultaif,Conservative


In [10]:
# Collapse the repeated rows, keeping the first occurrence of each distinct row.
mps_df = mps_df[~mps_df.duplicated(keep='first')]

In [11]:
# Preview after de-duplication; the surviving rows keep their original row labels.
mps_df.head(n=5)

Unnamed: 0,UserId,twitterName,First Name,Last Name,Political Affiliation
0,83508245,dckurek,Damien,Kurek,Conservative
4,4568748862,ziad_aboultaif,Ziad,Aboultaif,Conservative
14,44070432,LailaGoodridge,Laila,Goodridge,Conservative
21,1425866189780160514,POhara2021,Patrick,O'Hara,Bloc Québécois
34,1420552213253038080,NathalieSincla3,Nathalie,Sinclair-Desgagné,Bloc Québécois


In [12]:
# Index the MP table by UserId so the id leads each row of the exported CSV.
mps_df = mps_df.set_index(keys='UserId')

In [13]:
# Persist the MP nodes; this file is read back in the subset-network section.
mps_df.to_csv(path_or_buf='mps_nodes.csv')

## Get Retweeting User Nodes

In [14]:
# Load the retweet table (columns tweetId and userId, per the preview below).
tweet_retweet_df = pd.read_csv(filepath_or_buffer='tweet-retweet.csv')

In [15]:
# Look at the last five rows of the retweet table.
tweet_retweet_df.tail(n=5)

Unnamed: 0,tweetId,userId
17599,1443563877774602249,422122218
17600,1443563877774602249,2176915064
17601,1443563877774602249,37552089
17602,1443563877774602249,1380961578288316416
17603,1443563877774602249,153515039


In [18]:
# Keep just the retweeting user's id, as a one-column DataFrame.
retweeters_df = tweet_retweet_df['userId'].to_frame()

In [19]:
# Look at the last five retweeter ids.
retweeters_df.tail(n=5)

Unnamed: 0,userId
17599,422122218
17600,2176915064
17601,37552089
17602,1380961578288316416
17603,153515039


In [20]:
# One row per distinct retweeter, with userId moved into the index.
# The resulting frame has an index but no remaining columns.
new_df = retweeters_df.drop_duplicates().set_index('userId')

In [None]:
# Total number of retweet rows (before de-duplicating users).
len(tweet_retweet_df['userId'])

In [None]:
# Number of distinct retweeters, for comparison with the total retweet count above.
# new_df holds userId in its index and has no columns, so DataFrame.size
# (rows x columns) would always report 0 here; count the rows instead.
len(new_df)

In [None]:

# Persist the distinct retweeter ids; only the userId index is written.
new_df.to_csv(path_or_buf='retweetingUsers_nodes.csv')

In [None]:
# Read the retweeter node file back in; userId comes back as an ordinary column.
new_df = pd.read_csv(filepath_or_buffer='retweetingUsers_nodes.csv')

In [None]:
# Restore userId as the index of the reloaded frame.
new_df = new_df.set_index(keys='userId')

In [None]:
# Preview the reloaded retweeter nodes.
new_df.head(n=5)

In [None]:
# Remove a stray 'Unnamed: 0' column if an older export left one behind.
# The file written earlier in this notebook contains only userId, so an
# unconditional drop raises KeyError on a fresh run; errors='ignore' makes
# this cell a no-op when the column is absent.
new_df = new_df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Overwrite the retweeter node file with the cleaned frame.
new_df.to_csv(path_or_buf='retweetingUsers_nodes.csv')

## Edges for MPs and Tweets

In [21]:
# Re-read the tweet/author table to build the tweet -> MP edge list from it.
mp_tweets = pd.read_csv(filepath_or_buffer='tweet-user-edges.csv')

In [22]:
# Preview the freshly loaded table.
mp_tweets.head(n=5)

Unnamed: 0,tweetId,UserId,twitterName,dateCreated,First Name,Last Name,Political Affiliation
0,1442922344037552128,83508245,dckurek,2021-09-28T18:41:09.000Z,Damien,Kurek,Conservative
1,1442922342644998150,83508245,dckurek,2021-09-28T18:41:09.000Z,Damien,Kurek,Conservative
2,1440394467337334798,83508245,dckurek,2021-09-21T19:16:16.000Z,Damien,Kurek,Conservative
3,1440126344067629056,83508245,dckurek,2021-09-21T01:30:51.000Z,Damien,Kurek,Conservative
4,1449082201023389703,4568748862,ziad_aboultaif,2021-10-15T18:38:14.000Z,Ziad,Aboultaif,Conservative


In [23]:
# An edge only needs its two endpoints: the tweet id and the author's user id.
mp_tweets = mp_tweets.loc[:, ['tweetId', 'UserId']]

In [None]:
# Relabel the endpoints as Source (tweet) and Target (MP); the subset-network
# section later joins on these names.
mp_tweets = mp_tweets.rename(columns=dict(tweetId='Source', UserId='Target'))

In [None]:
# Index by Source so the CSV export starts each row with the tweet id.
mp_tweets = mp_tweets.set_index(keys='Source')

In [24]:
# Preview the edge list before it is written out.
mp_tweets.head(n=5)

Unnamed: 0,tweetId,UserId
0,1442922344037552128,83508245
1,1442922342644998150,83508245
2,1440394467337334798,83508245
3,1440126344067629056,83508245
4,1449082201023389703,4568748862


In [None]:
# Persist the tweet -> MP edges; read back in the subset-network section.
mp_tweets.to_csv(path_or_buf='tweet-mp-edges.csv')

# Subset Network Creation

Take a random subset of MPs to demonstrate the network on a smaller scale.

In [None]:
# Reload the MP nodes written earlier; UserId comes back as a regular column.
mps_nodes = pd.read_csv(filepath_or_buffer='mps_nodes.csv')

In [None]:
# Pick five MPs at random for the demo network.
# The seed is fixed so that Restart & Run All selects the same MPs every time;
# unseeded, every run would write a different set of SubsetNetwork/ files.
mps_nodes_subset = mps_nodes.sample(n=5, random_state=42)

In [None]:
# Show the sampled MPs.
mps_nodes_subset.head(n=5)

Get all tweets from these users

In [None]:
# Reload the tweet -> MP edge list (Source = tweet id, Target = MP user id).
tweet_mp_edges = pd.read_csv(filepath_or_buffer='tweet-mp-edges.csv')

In [None]:
# Preview the reloaded edge list.
tweet_mp_edges.head(n=5)

Run a join to get the correct tweets

In [None]:
# Inner join keeps only the edges whose Target is one of the sampled MPs,
# and carries the MP's attribute columns along with each edge.
tweet_mp_edges_subset = tweet_mp_edges.merge(
    mps_nodes_subset,
    how='inner',
    left_on='Target',
    right_on='UserId',
)

In [None]:
# Inspect up to the first 100 joined edges.
tweet_mp_edges_subset.head(n=100)

Get the Retweeters of these tweets 

In [None]:
# Load the retweet edge list and give its endpoints descriptive names
# (Target -> Tweet, Source -> Retweeter) so they do not clash with the
# Source/Target columns of the tweet -> MP edges in the join below.
# NOTE(review): no visible cell writes 'tweet-retweet-edges.csv' (the earlier
# section reads 'tweet-retweet.csv'); presumably it is produced elsewhere with
# 'Source' and 'Target' columns -- confirm.
tweet_retweet_edges = (
    pd.read_csv('tweet-retweet-edges.csv')
    .rename(columns={'Target': 'Tweet', 'Source': 'Retweeter'})
)

In [None]:
# Preview the renamed retweet edges.
tweet_retweet_edges.head(n=5)

Run a join on the tweet ids to get that subset 

In [None]:
# Inner join keeps only the retweets of tweets written by the sampled MPs.
tweet_retweet_edges_subset = tweet_retweet_edges.merge(
    tweet_mp_edges_subset,
    how='inner',
    left_on='Tweet',
    right_on='Source',
)

In [None]:
# Inspect up to the first 1000 joined retweet edges.
tweet_retweet_edges_subset.head(n=1000)

# Clean DataFrames and persist

In [None]:
# Index the sampled MPs by UserId so the id leads each row of the export.
clean_mps_nodes_subset = mps_nodes_subset.set_index(keys='UserId')

In [None]:
# Preview the MP nodes that will be exported.
clean_mps_nodes_subset.head(n=5)

In [None]:
# Write the sampled MP nodes; the SubsetNetwork/ directory must already exist.
clean_mps_nodes_subset.to_csv(path_or_buf='SubsetNetwork/mp_nodes.csv')

In [None]:
# Reduce the joined frame back to a bare edge list (the MP attribute columns
# picked up by the merge are dropped) and index it by the tweet id.
clean_tweet_mp_edges_subset = (
    tweet_mp_edges_subset
    .loc[:, ['Source', 'Target']]
    .set_index('Source')
)
clean_tweet_mp_edges_subset.head(n=300)

In [None]:
# Write the tweet -> MP edges of the subset network.
clean_tweet_mp_edges_subset.to_csv(path_or_buf='SubsetNetwork/tweet_mp_edge.csv')

In [None]:
# Keep only the two endpoints of each retweet edge, switch back to the
# Source/Target naming (Retweeter -> Source, Tweet -> Target) and index by
# the tweet id. 'Source' stays a regular column, which the retweeter-node
# export below relies on.
clean_tweet_retweet_edges_subset = (
    tweet_retweet_edges_subset
    .loc[:, ['Tweet', 'Retweeter']]
    .rename(columns={'Tweet': 'Target', 'Retweeter': 'Source'})
    .set_index('Target')
)
clean_tweet_retweet_edges_subset.head()

In [None]:
# Write the retweeter -> tweet edges of the subset network.
clean_tweet_retweet_edges_subset.to_csv(path_or_buf='SubsetNetwork/tweet_retweet_edge.csv')

Nodes

In [None]:
import csv

In [None]:
# Write one distinct retweeter id per line to the subset's retweeter node file.
# The line format ("<id>," with a trailing comma, no header row) is unchanged.
# The `with` block guarantees the file is flushed and closed when the cell
# finishes; previously the handle was left open, so the data could still be
# sitting in the buffer when the file was read elsewhere. The unused csv.writer
# was removed, and the loop variable no longer shadows the builtin `id`.
with open('SubsetNetwork/retweeterNodes.csv', 'w') as retweeter_nodes_file:
    for retweeter_id in clean_tweet_retweet_edges_subset['Source'].unique():
        retweeter_nodes_file.write("{},\n".format(retweeter_id))
    

In [None]:
# Export the tweet ids of the subset as a node list: a single 'Source' column,
# moved into the index so it is the only thing written.
(
    tweet_mp_edges_subset
    .loc[:, ['Source']]
    .set_index('Source')
    .to_csv('SubsetNetwork/tweetNodes.csv')
)