In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import holoviews as hv
# Activate the 'bokeh' backend for HoloViews before any plot is built.
hv.extension('bokeh')

In [None]:
# Load the per-account latent vectors and the per-account sentences.
# Context managers make sure both file handles are closed once parsed
# (a bare open() inside json.load leaves the handle dangling in the kernel).
with open("PoliticianLatentsE2.json", 'r') as latents_file:
    PoliticianLatents = json.load(latents_file)
with open('PoliticianSentences.json', 'r') as sentences_file:
    PoliticianSentences = json.load(sentences_file)

In [None]:
# Convert every account's latents to a NumPy array (stored back into the
# dict under the same key) and report the shape of each one.
for key, value in PoliticianLatents.items():
    print(key, np.shape(value))
    PoliticianLatents[key] = np.array(value)

# Stack all accounts along axis 0 in dict order with a single concatenate.
# Growing the array with np.append inside the loop re-copies everything
# accumulated so far on each iteration.
# AllLatents stays None when there are no accounts at all.
AllLatents = (
    np.concatenate(list(PoliticianLatents.values()), axis=0)
    if PoliticianLatents
    else None
)

# Account keys (as they appear in the loaded JSON dicts) assigned to each side.
# Membership in `left` is what later decides a point's colour.
left = [
    'test.SenSanders',
    'test.ewarren',
    'test.AOC',
    'test.HillaryClinton',
    'test.SenatorDurbin',
    'test.Sen_JoeManchin',
    'test.SenatorTester',
    'test.SenSchumer',
    'test.cbellantoni',
    'test.donnabrazile',
    'test.ggreenwald',
    'test.nicopitney',
]
right = [
    'test.senatemajldr',
    'test.LindseyGrahamSC',
    'test.realDonaldTrump',
    'test.MittRomney',
    'test.GOPLeader',
    'test.RepDougCollins',
    'test.ewerickson',
    'test.mindyfinn',
    'test.TPCarney',
    'test.anamariecox',
    'test.TuckerCarlson',
]

# Total number of labelled accounts across both lists.
print(len(right) + len(left))

In [None]:
# Flatten {account: [sentence, ...]} into one row per sentence, keeping the
# dict's account order and each account's sentence order.
Sentences = pd.DataFrame([
    {'Name': account, 'Sentence': sentence}
    for account, account_sentences in PoliticianSentences.items()
    for sentence in account_sentences
])

In [None]:
# Show the first stored sentence for one account to eyeball the loaded data.
PoliticianSentences['test.realDonaldTrump'][0]

In [None]:
# t-SNE is stochastic: without a fixed seed every run produces a different
# layout, which silently invalidates any hard-coded coordinate ranges used
# later in the notebook. Pin the seed so Restart & Run All is reproducible.
TSNE_SEED = 0
tsne = TSNE(n_components=2, random_state=TSNE_SEED).fit_transform(AllLatents)

# One colour per account: 'blue' for accounts listed in `left`, 'red' otherwise.
colors = pd.DataFrame([
    {'Name': key, 'Color': 'blue' if key in left else 'red'}
    for key in PoliticianLatents
])

# One record per latent vector. Rows of `tsne` follow the order in which the
# latents were stacked into AllLatents (dict order, then within-account
# order), so a running counter pairs each latent with its embedded point.
PreFrame = []
c = 0
for key, value in PoliticianLatents.items():
    for latent in value:
        PreFrame.append({'Name': key, 'Latent': latent, 'tsne': tsne[c]})
        c += 1
# Guard against the latents and the embedding getting out of step.
assert c == len(tsne), "number of latents does not match number of t-SNE points"

data = pd.DataFrame(PreFrame)
# Attach each account's colour to every one of its rows.
combined = data.set_index('Name').join(colors.set_index('Name')).reset_index()
data.count()

In [None]:
# One [tsne_x, tsne_y, colour] row per point, in `combined` row order.
# Iterating the two columns together avoids a per-row `.iloc` lookup and the
# positional `combined.count()[0]` lookup on a label-indexed Series, which
# pandas has deprecated.
plot_data = [list(point) + [color] for point, color in zip(combined.tsne, combined.Color)]

In [None]:
def compare_politicians(left, right, frame=None):
    """Split t-SNE points into those of two accounts and everyone else.

    Parameters
    ----------
    left, right : str
        Account names without the ``'test.'`` prefix; the prefix is added
        here before matching against the ``Name`` column.
    frame : pandas.DataFrame, optional
        Frame with ``Name`` and ``tsne`` columns. Defaults to the notebook's
        global ``data`` frame.

    Returns
    -------
    tuple of (list, list, list)
        ``tsne`` values for ``left``, for ``right``, and for all remaining
        rows, each in the frame's row order.
    """
    if frame is None:
        frame = data

    # Keep the names separate from the point lists: rebinding the arguments
    # to the point lists before building the "others" filter would make the
    # filter compare against list reprs and exclude nothing.
    left_name = f'test.{left}'
    right_name = f'test.{right}'
    names = frame['Name']

    left_points = list(frame[names == left_name].tsne)
    right_points = list(frame[names == right_name].tsne)
    other_points = list(frame[~names.isin([left_name, right_name])].tsne)
    return left_points, right_points, other_points
   

In [None]:
# Scatter of every embedded point; the third entry of each plot_data row
# (dimension 'z') supplies the marker colour.
(
    hv.Scatter(plot_data, vdims=['y', 'z'])
    .opts(color='z', width=900, height=600, size=10)
)

In [None]:
# Display the Sentences frame (one row per account/sentence pair).
Sentences

In [None]:
def udf(x):
    """Return True when ``x`` lies strictly inside the box
    ``2 < x[0] < 12`` and ``2 < x[1] < 25``, otherwise False.

    ``x[1]`` is only inspected when ``x[0]`` is already in range.
    """
    if not (2 < x[0] < 12):
        return False
    return bool(2 < x[1] < 25)
# Index labels of the `combined` rows whose t-SNE point falls inside udf's box.
# NOTE(review): the labels are then used as positions into `Sentences`, which
# assumes `combined` and `Sentences` list their rows in the same order - confirm.
DemocraticSentences = combined[[udf(point) for point in combined.tsne]].index
Sentences.iloc[DemocraticSentences]

In [None]:
# Element-wise mean of the Latent and tsne vectors for each account.
# `combined` only has the columns Name, Latent, tsne and Color, so grouping
# by a 'Names' column raises KeyError; group by 'Name' alone.
combined.drop('Color', axis=1).groupby(['Name']).agg(lambda x: np.mean(x, axis=0))

In [None]:
# Unweighted centroid: average the vectors within each account first, then
# average those per-account t-SNE means so every account counts once.
per_account_mean = data.groupby(['Name']).agg(lambda group: np.mean(group, axis=0))
np.array(per_account_mean.tsne).mean(axis=0)

In [None]:
# Pull out the t-SNE points of each colour group once and reuse them.
red_tsne = np.array(combined[combined['Color'] == 'red'].tsne)
blue_tsne = np.array(combined[combined['Color'] == 'blue'].tsne)

# Per-group centroid and group size.
averages_red = red_tsne.mean(axis=0)
num_red = len(red_tsne)

averages_blue = blue_tsne.mean(axis=0)
num_blue = len(blue_tsne)

# Share of points belonging to each group.
total_points = num_red + num_blue
p_red = num_red / total_points
p_blue = num_blue / total_points

# Size-weighted combination of the two group centroids.
p_red*averages_red + p_blue*averages_blue

In [None]:
# Preview the first rows of `combined` (Name, Latent, tsne, Color).
combined.head()

In [None]:
l, r, o = compare_politicians('senatemajldr', 'AOC')

# Marker size and canvas dimensions shared by the three layers.
scatter_opts = dict(size=8, width=900, height=600)

# NOTE(review): colours here follow argument position (first = blue,
# second = red). 'test.senatemajldr' is in the `right` list, which the
# overview plot colours red - confirm the swap is intended.
lscat = hv.Scatter(l).opts(color='blue', **scatter_opts)
rscat = hv.Scatter(r).opts(color='red', **scatter_opts)
oscat = hv.Scatter(o).opts(color='lightgrey', **scatter_opts)

# Overlay: remaining points at the back, the two compared accounts on top.
oscat * lscat * rscat

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Fixed seed so the split, the forest and therefore the reported numbers are
# the same on every Restart & Run All.
RF_SEED = 0

# Binary target: 1 for rows coloured 'blue', 0 otherwise.
labels = [1 if el=='blue' else 0 for el in list(combined.Color)]

# Build the 2-D feature matrix once, before splitting, instead of rebuilding
# each split from an object array of vectors afterwards.
features = np.array([list(latent) for latent in combined.Latent])

dtrain, dtest, ltrain, ltest = train_test_split(
    features, np.array(labels), test_size=.2, shuffle=True, random_state=RF_SEED
)

rfc = RandomForestClassifier(random_state=RF_SEED).fit(dtrain, ltrain)
preds_cl = rfc.predict(dtest)
# Number of correctly classified held-out rows.
print(int((preds_cl == ltest).sum()))
score = rfc.score(dtest, ltest)
cnf_m = confusion_matrix(ltest, preds_cl)
print("confusion matrix:")
print(cnf_m)
print("model accuracy:")
print(score)

