In [2]:
import pandas as pd
import numpy as np
import networkx as nx 
import pickle
import re
import itertools
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from svgpathtools import svg2paths
from svgpath2mpl import parse_path
import matplotlib as mpl
from operator import itemgetter
import seaborn as sns
sns.set()
from fa2 import ForceAtlas2
from community import community_louvain
import tqdm
import nltk

In [3]:
Handle_data = pd.read_csv('../Data/Processed/Twitter_Handles_updated.csv', index_col=0)

with open('../Data/Processed/congress_cleaned_processed.pkl', 'rb') as handle:
    Tweets = pickle.load(handle)

with open('../Data/Processed/Usr_ID_dict.pickle', 'rb') as handle:
    Usr_ID_dict = pickle.load(handle)

ID_Usr_dict = {v: k for k, v in Usr_ID_dict.items()}

Extract new data frame based on handle and tweet

In [4]:
Handle_Tweet_df = pd.DataFrame({
    'Handle': Tweets['user_id'].map(ID_Usr_dict),
    'Tweet': Tweets['text']
})

Add list of all tags as new columns

In [5]:
Handle_Tweet_df['tags'] = [re.findall('(?<=@)\S+',tw) for tw in Handle_Tweet_df['Tweet']]

Drop columns where handle is na - this could be bacuse congress memebers have multiple accounts and the mapping misses it

In [6]:
Handle_Tweet_df = Handle_Tweet_df.dropna(axis = 0, subset=['Handle'])

Lower all handles to allow for comparison

In [7]:
Handle_Tweet_df['Handle'] = [handle.lower() for handle in Handle_Tweet_df['Handle']]

Filter out all tags that are not congress memebers

In [8]:
congress_members_lower = [usr.lower() for usr in Handle_data.index]
Handle_Tweet_df['tags'] = [[tag for tag in tags if tag in congress_members_lower] for tags in Handle_Tweet_df['tags']]

Remove all tweets that has no tags as they are not relevant in this cae

In [9]:
Handle_Tweet_df = Handle_Tweet_df[[len(tags)>0 for tags in Handle_Tweet_df['tags']]]

Create a new data frame that contains tweets with politicians as indecies and tagged politicians as columns.

In [10]:
Text_df = pd.DataFrame(
    data = '',
    index = set(Handle_Tweet_df['Handle']),
    columns = set([tags for tag in Handle_Tweet_df['tags'] for tags in tag])
)

In [11]:
for row in tqdm.tqdm(Handle_Tweet_df.iterrows()):
    Text_df.loc[row[1]['Handle'], row[1]['tags']] += row[1]['Tweet']+' '

212274it [04:14, 832.52it/s]


In [12]:
sentiment_table = pd.read_table('../Data/Processed/Sentiment.txt', delimiter="\t")
sentiment_dict = dict(zip(sentiment_table['word'],sentiment_table['happiness_average']))

In [13]:
def sentimentcalc(tokens):
    
    # Get all words with a sentiment score from the token list
    sents = [sentiment_dict[word] for word in tokens if word in sentiment_dict.keys()]
    
    # Return nan if no words have a score
    if len(sents) == 0:
        return 0
    
    # Else return the mean of scores
    else:
        return np.mean(sents)

In [14]:
tokenizer = nltk.RegexpTokenizer(r'\w+')
def tokenize_tweet_calc(tweets):
    tokens = tokenizer.tokenize(tweets)
    return sentimentcalc(tokens)

In [15]:
Text_df_sent = Text_df.applymap(tokenize_tweet_calc)

In [16]:
Text_df_sent

Unnamed: 0,senrobportman,repterrisewell,jeffflake,replowenthal,staceyplaskett,repjoemorelle,sanfordbishop,repannaeshoo,repriggleman,congressmangt,...,repdean,usrepkeating,replipinski,repstevechabot,sentomcotton,repcasten,repkendrahorn,warrendavidson,senshelby,repkaygranger
senrobportman,5.572385,0.000000,5.027778,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,5.594028,5.655294,0.000000,0.000000,5.541143,5.582857,0.000000
repterrisewell,0.000000,5.497415,0.000000,0.000000,5.263119,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,5.995000,0.000000,5.290769,0.000000
jeffflake,5.413846,0.000000,5.462290,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
replowenthal,0.000000,0.000000,0.000000,5.477669,0.000000,0.0,0.0,5.796296,0.0,0.0,...,0.0,5.818966,0.000000,5.525806,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
staceyplaskett,0.000000,5.614857,0.000000,0.000000,5.455182,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.721538,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
repkendrahorn,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,5.618359,0.000000,0.000000,0.000000
repcasten,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,5.443038,0.000000,0.000000,5.502582,5.326061,0.000000,0.000000,0.000000
warrendavidson,5.398366,0.000000,0.000000,0.000000,5.876667,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,5.695714,5.185556,5.492222,0.000000,0.000000,5.517688,0.000000,0.000000
senshelby,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,5.502571,0.000000,0.000000,0.000000,5.544500,5.661212


In [17]:
Text_df_sent['realdonaldtrump'].sort_values()

francisrooney      0.000000
repslotkin         0.000000
senbrianschatz     0.000000
repdonyoung        0.000000
repfinkenauer      0.000000
                     ...   
buddforcongress    5.735055
repmartharoby      5.750750
gregformontana     5.769820
sen_joemanchin     5.791478
reptorressmall     5.905161
Name: realdonaldtrump, Length: 580, dtype: float64