# Setup

In [1]:
%matplotlib notebook

import itertools
from functools import partial
import numpy as np
import gensim, logging
import pandas as pnd
from sklearn.cluster import *
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, RandomizedPCA
import matplotlib.pyplot as plt

## Topic Probs

In [2]:
# Read the per-word topic probability table, drop the "stddev", "mean" and
# "256" columns that come with the file, and relabel the remaining columns as
# "word" followed by the integer topic ids 0..255.
prob_columns = list(range(256))
df_probs = pnd.read_csv(
    "../models/topic-models/topic.256-1000.model.topic-probs"
).drop(["stddev", "mean", "256"], axis=1)
df_probs.columns = ["word", *prob_columns]

In [3]:
# Show just the first row to sanity-check the relabelled columns.
df_probs.iloc[:1]

Unnamed: 0,word,0,1,2,3,4,5,6,7,8,...,246,247,248,249,250,251,252,253,254,255
0,telenovela,5.078101e-10,5.014856e-10,1.822816e-10,3.043188e-10,1.651538e-10,4.153677e-10,2.926231e-10,2.955674e-10,5.652887e-10,...,2.693026e-10,2.569924e-10,4.039424e-10,6.2208e-10,2.303773e-10,1.613092e-10,3.159167e-10,2.640946e-10,2.761099e-10,4.271237e-10


## Topics

In [4]:
# Load the space-separated topic file (it has no header row) and gather every
# cell value into a set so membership can be checked quickly later on.
df_topics = pnd.read_csv(
    "../models/topic-models/topic.256-1000.model.ssv",
    sep=" ",
    header=None,
    encoding="utf-8",
)
topic_words = set(df_topics.values.ravel())

# Analysis

In [5]:
# Per-word summary statistics, computed over the topic-probability columns
# only (prob_columns), so the new "mean"/"stddev" columns never feed into
# each other.
df_probs["mean"] = df_probs[prob_columns].mean(axis=1)
df_probs["stddev"] = df_probs[prob_columns].std(axis=1)

# Keep only the rows whose word appears in topic_words. Series.isin performs
# the membership test in a single vectorized call instead of invoking a
# Python lambda once per row.
df_probs = df_probs[df_probs["word"].isin(topic_words)]

In [6]:
# Preview the filtered table. head() bounds the rendered output to ten rows
# instead of asking the notebook to display the entire frame.
df_probs.head(10)

Unnamed: 0,word,0,1,2,3,4,5,6,7,8,...,248,249,250,251,252,253,254,255,mean,stddev
1,series,5.078101e-10,3.318563e-04,6.821682e-04,7.119882e-05,4.995659e-05,2.142833e-04,7.525114e-04,2.488282e-04,8.737697e-04,...,9.810487e-04,6.220800e-10,2.521309e-04,5.092526e-04,5.566623e-05,2.640946e-10,8.762730e-04,4.526954e-03,0.001089,0.004551
2,american,5.078101e-10,5.014856e-10,1.010494e-03,4.489382e-04,1.651538e-10,4.269398e-04,1.479242e-03,8.075338e-04,7.166130e-04,...,4.729282e-04,3.173304e-03,4.908204e-03,1.613092e-10,3.159167e-10,1.222829e-04,1.725247e-03,4.271237e-10,0.001353,0.002979
5,television,5.078101e-10,5.014856e-10,1.822816e-10,1.310771e-04,1.810735e-05,4.153677e-10,7.507926e-04,5.948727e-04,8.023902e-05,...,4.039424e-10,6.220800e-10,2.303773e-10,1.613092e-10,3.159167e-10,2.640946e-10,2.761099e-10,7.944652e-06,0.000458,0.002389
6,starring,5.078101e-10,5.014856e-10,1.822816e-10,3.043188e-10,1.651538e-10,4.153677e-10,2.926231e-10,9.177703e-04,5.652887e-10,...,4.039424e-10,6.220800e-10,2.303773e-10,4.876192e-04,3.159167e-10,2.640946e-10,2.761099e-10,4.271237e-10,0.000097,0.000616
14,spanish,5.078101e-10,5.014856e-10,7.690690e-05,1.519329e-05,3.330445e-05,4.153677e-10,5.442629e-05,3.385247e-05,5.652887e-10,...,5.338272e-05,4.703626e-03,2.303773e-10,1.613092e-10,3.159167e-10,2.640946e-10,8.433004e-05,4.271237e-10,0.000284,0.002375
16,opera,5.078101e-10,5.014856e-10,1.822816e-10,8.460440e-05,5.707013e-05,4.153677e-10,2.926231e-10,4.777207e-03,5.652887e-10,...,4.039424e-10,6.220800e-10,2.303773e-10,1.266422e-04,3.159167e-10,2.640946e-10,2.761099e-10,4.271237e-10,0.000153,0.001759
17,shot,5.078101e-10,5.014856e-10,8.486507e-04,5.332472e-05,1.651538e-10,4.153677e-10,2.926231e-10,2.955674e-10,1.864858e-04,...,4.039424e-10,6.220800e-10,2.303773e-10,1.040613e-04,3.159167e-10,2.640946e-10,2.761099e-10,4.271237e-10,0.000159,0.000594
19,florida,5.078101e-10,5.014856e-10,1.822816e-10,4.557927e-05,1.651538e-10,4.153677e-10,2.926231e-10,2.955674e-10,2.788984e-04,...,4.039424e-10,1.662474e-04,2.303773e-10,1.613092e-10,2.999806e-05,3.102335e-05,2.761099e-10,4.271237e-10,0.000238,0.002542
21,daily,5.078101e-10,2.567469e-04,4.692934e-05,3.043188e-10,1.773536e-04,3.008910e-04,1.260793e-02,5.034455e-05,5.652887e-10,...,4.039424e-10,6.220800e-10,2.303773e-10,1.045351e-04,3.159167e-10,1.346921e-04,2.761099e-10,4.271237e-10,0.000143,0.000815
22,life,5.078101e-10,5.014856e-10,1.822816e-10,2.690056e-04,3.235077e-07,1.187299e-03,4.657723e-04,5.850353e-04,5.652887e-10,...,8.422583e-05,6.220800e-10,2.303773e-10,6.521747e-03,5.325376e-04,2.481849e-04,2.320150e-03,5.435526e-04,0.000736,0.001680


## Highest mean

In [7]:
# Ten words with the largest mean topic probability, largest first.
(
    df_probs[["word", "mean"]]
    .sort_values("mean", ascending=False)
    .iloc[:10]
)

Unnamed: 0,word,mean
188,also,0.004739
430,first,0.003866
169,one,0.003301
797,new,0.003234
302,two,0.002761
35,may,0.002097
2268,south,0.001783
1043,county,0.001717
786,state,0.00169
472,district,0.001675


## Lowest mean

In [None]:
# Ten words with the smallest mean topic probability, smallest first.
(
    df_probs[["word", "mean"]]
    .sort_values("mean", ascending=True)
    .iloc[:10]
)

## Highest std. dev.

In [8]:
# Ten words whose topic probabilities have the largest standard deviation,
# largest first.
(
    df_probs[["word", "stddev"]]
    .sort_values("stddev", ascending=False)
    .iloc[:10]
)

Unnamed: 0,word,stddev
797,new,0.013261
279,school,0.011067
699,film,0.010237
35,may,0.008308
1161,york,0.008129
4047,station,0.008006
472,district,0.007321
1300,church,0.007216
297,university,0.007131
1636,list,0.006812


## Lowest std. dev.

In [9]:
# Ten words whose topic probabilities have the smallest standard deviation,
# smallest first.
(
    df_probs[["word", "stddev"]]
    .sort_values("stddev", ascending=True)
    .iloc[:10]
)

Unnamed: 0,word,stddev
23721,osaka,0.000225
1199,jacques,0.000268
51626,nerve,0.000271
16076,robot,0.000273
18638,barangay,0.000275
15606,tissue,0.000296
1952,wolf,0.000299
15589,muscle,0.000303
8117,wave,0.000303
19288,bone,0.000306
