In [1]:
vocabulary_size = 380000
vector_size = 200
num_topics = 256
word_vectors = vocabulary_size * vector_size
topic_vectors = num_topics * vector_size
dot_product_values = num_topics * vocabulary_size
exp_dot_product_values = num_topics * vocabulary_size

all_together = (word_vectors + topic_vectors + dot_product_values + exp_dot_product_values) * 8.0 / (1024 * 1024)
all_together

2064.599609375

* Spearmint for analogy reasoning
* Gaussian LDA
* Evaluate word analogy reasoning
* evalutate topic models
* find background noise
* find word pairs

# Setup

In [2]:
%matplotlib notebook

import itertools
import logging
from functools import partial

import gensim
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pnd
from sklearn.cluster import *
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.manifold import TSNE

from knub.thesis.util import *
matplotlib.style.use('ggplot')

In [3]:
from IPython.core.display import HTML
HTML("""
<style>
div.text_cell_render p, div.text_cell_render ul, table.dataframe {
font-size:1.3em;
line-height:1.1em;
}
</style>
""")

# Preprocessing

In [4]:
MODEL = "../models/topic-models/topic.full.alpha-1-100.256-400.model"
MODEL = "../models/topic-models/topic.256-400.first-2000.alpha-001.beta-001.model"

In [5]:
print "Load vectors"
vectors = load_skip_gram()
model = TopicModelLoader(MODEL, vectors)
print "Load topic probs"
df_topic_probs_full = model.load_topic_probs()
print "Load topics"
df_topics = model.load_topics()
print "Load topic similars"
df_topic_similars = model.load_all_topic_similars()

Load vectors
Load topic probs
Load topics
Load topic similars


# Topic Probs Analysis

In [6]:
df_topic_probs = df_topic_probs_full[df_topic_probs_full["word"].apply(lambda w: w in model.topic_words)].copy()
df_topic_probs["stddev"] = df_topic_probs[model.prob_columns].std(axis=1)

word-prob does not sum to one, because we only write out frequent words

In [7]:
df_topic_probs_full["word-prob"].sum()

0.9596501221011898

In [8]:
df_topic_probs.head(3)[model.prob_columns].sum(axis=1)

1    1.0
2    1.0
5    1.0
dtype: float64

In [9]:
def topic_prob_difference_from_first_to(row, n):
    s = sorted(row, reverse=True)
    return s[0] - s[n - 1]
    

for diff in [2, 5, 50]:
    column_name = "diff-" + str(diff)
    df_topic_probs_full[column_name] = df_topic_probs_full[model.prob_columns].apply(
        partial(topic_prob_difference_from_first_to, n=diff), axis=1)

## Strength of topic prevalence

### Against second best topic

In [10]:
plt.figure()
df_topic_probs_full["diff-2"].hist(bins=20)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7feb086b8e90>

### Against fifth best topic

In [11]:
plt.figure()
df_topic_probs_full["diff-5"].hist(bins=20)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7feab0b1db90>

### Against fiftieth best topic

In [12]:
plt.figure()
df_topic_probs_full["diff-50"].hist(bins=20)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7feaafb31750>

## Most common words

In [13]:
df_topic_probs_full.sort_values(by="word-prob", ascending=False).head(10)[["word", "word-prob"]]

Unnamed: 0,word,word-prob
188,also,0.004701
430,first,0.004131
169,one,0.003596
797,new,0.003485
302,two,0.00292
483,time,0.002149
279,school,0.002056
951,years,0.001993
35,may,0.001904
412,later,0.001726


## Highest std. dev.

In [14]:
df_topic_probs.sort_values(by="stddev", ascending=False).head(10)[["word", "stddev"]]

Unnamed: 0,word,stddev
94710,gmina,0.062499
173965,autobots,0.062499
194457,decepticons,0.062498
192465,woreda,0.062498
194455,autobot,0.062498
167477,tambon,0.062498
18922,barangay,0.062495
190248,bulbophyllum,0.062495
24870,gastropod,0.062463
195697,megatron,0.062452


## Lowest std. dev.

In [15]:
df_topic_probs.sort_values(by="stddev", ascending=True).head(10)[["word", "stddev"]]

Unnamed: 0,word,stddev
169,one,0.004855
302,two,0.00515
430,first,0.005165
188,also,0.005223
354,known,0.00568
680,part,0.005735
199,following,0.006626
505,history,0.007499
412,later,0.007548
2167,early,0.007572


# Correlation TM similarity and WE similarity

### Ten most similar words for each top-10-topic word

 Topic model similarity evaluated using different probability distribution similarity measures (evaluated on the normalized word-topic distributions):
 
 * [Jensen-Shannon divergence](https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence)
 * Hellinger distance
 * Bhattacharyya coefficient

In [16]:
df_topic_similars.head()

Unnamed: 0,word,similar_word,tm_sim_max,tm_sim_sum,tm_sim_bhattacharyya,tm_sim_hellinger,tm_sim_jensen-shannon,we_sim
0,magic,magical,0.931362,0.751752,0.851728,0.614939,0.842833,0.708294
1,magic,fae,0.914796,0.655015,0.780721,0.531727,0.766116,0.413486
2,magic,summoning,0.916409,0.646953,0.783936,0.535173,0.767636,0.518642
3,magic,hideous,0.920577,0.668182,0.786882,0.538353,0.784488,0.437921
4,magic,enchant,0.920613,0.68483,0.809282,0.563287,0.795434,0.505162


### Correlation between TM and WE similarity

In [17]:
model.sim_functions = ["max", "sum", "bhattacharyya", "hellinger", "jensen-shannon"]

sim_corrs_spearman = []
sim_corrs_pearson = []
for sim_function in model.sim_functions:
    corr_spearman = df_topic_similars[["tm_sim_%s" % sim_function, "we_sim"]].corr("spearman").ix[0,1]
    corr_pearson = df_topic_similars[["tm_sim_%s" % sim_function, "we_sim"]].corr("pearson").ix[0,1]
    sim_corrs_spearman.append(corr_spearman)
    sim_corrs_pearson.append(corr_pearson)

df_tmp = pnd.DataFrame(model.sim_functions, columns=["sim_function"])
df_tmp["sim_corr_spearman"] = sim_corrs_spearman
df_tmp["sim_corr_pearson"] = sim_corrs_pearson
df_tmp

Unnamed: 0,sim_function,sim_corr_spearman,sim_corr_pearson
0,max,0.101827,0.108241
1,sum,0.372219,0.366995
2,bhattacharyya,0.37489,0.388173
3,hellinger,0.37489,0.359338
4,jensen-shannon,0.372435,0.386489


**Note: Similar results Google vectors**

### Distribution of TM similarity

In [18]:
plt.figure()
df_topic_similars["tm_sim_jensen-shannon"].hist(bins=100)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7feab1c02b90>

### Distribution of WE similarity

In [19]:
plt.figure()
df_topic_similars["we_sim"].hist(bins=50)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7feab197dbd0>

In [20]:
we_percentile = df_topic_similars["we_sim"].quantile(q=.30)
we_percentile

0.17520551857253092

In [21]:
df_tmp = df_topic_probs[["word", "stddev"]]
df_tmp.columns = ["w", "stddev"]
df_result = df_topic_similars.merge(df_tmp, left_on="similar_word", right_on="w")
del df_result["w"]
word_prob_quantile = df_result["stddev"].quantile(0.8)

### High TM similarity, low WE similarity

In [22]:
df_large_sim_diff = df_result[(df_result["we_sim"] < 0.4) & (df_result["stddev"] > 0.025)]
df_large_sim_diff.iloc[np.random.permutation(len(df_large_sim_diff))]

Unnamed: 0,word,similar_word,tm_sim_max,tm_sim_sum,tm_sim_bhattacharyya,tm_sim_hellinger,tm_sim_jensen-shannon,we_sim,stddev
2150,kerr,saga,0.973470,0.903994,0.947563,0.771009,0.945753,-0.025304,0.054336
1140,sterling,rita,0.972352,0.908872,0.948587,0.773255,0.945167,0.222052,0.057493
809,legal,justice,0.898065,0.716042,0.841182,0.601481,0.835725,0.369424,0.038477
1068,name,may,0.910959,0.533377,0.719336,0.470223,0.651198,0.071585,0.030829
2000,zoo,dogs,0.980870,0.934876,0.964786,0.812346,0.962417,0.379262,0.057648
1004,dan,hunt,0.939222,0.718575,0.838402,0.598007,0.827664,0.224454,0.043118
1704,barma,kokborok,0.962670,0.912321,0.937188,0.749376,0.933692,0.285683,0.061987
312,stafford,bugs,0.993655,0.978031,0.985362,0.879012,0.987373,-0.077880,0.060781
1547,roller,vineyards,0.973184,0.931448,0.964993,0.812898,0.963568,0.008926,0.058201
1551,autism,fonts,0.989661,0.978712,0.989309,0.896604,0.987821,0.019459,0.061425


### High TM similarity, high WE similarity

In [23]:
df_small_sim_diff = df_result[(df_result["we_sim"] > 0.8) & (df_result["stddev"] > 0.025)]
df_small_sim_diff.iloc[np.random.permutation(len(df_small_sim_diff))]

Unnamed: 0,word,similar_word,tm_sim_max,tm_sim_sum,tm_sim_bhattacharyya,tm_sim_hellinger,tm_sim_jensen-shannon,we_sim,stddev
1589,gothenburg,stockholm,0.987410,0.960926,0.979241,0.855920,0.977461,0.905866,0.057109
1590,stockholm,gothenburg,0.987410,0.960926,0.979241,0.855920,0.977461,0.905866,0.057900
1017,election,elections,0.875013,0.784894,0.911046,0.701749,0.909413,0.889273,0.047813
1422,ethiopia,eritrea,0.973053,0.948248,0.973458,0.837082,0.972210,0.876582,0.054944
1380,argentina,chile,0.940928,0.854074,0.935098,0.745242,0.924440,0.824045,0.045401
967,cardiff,swansea,0.974716,0.941826,0.977611,0.850372,0.975840,0.901173,0.058046
1783,faso,burkina,0.997894,0.997742,0.999908,0.990430,0.999900,0.871338,0.058742
1417,gun,guns,0.942154,0.844268,0.924142,0.724577,0.916678,0.844736,0.045179
374,czech,slovak,0.986991,0.968473,0.982599,0.868086,0.982170,0.801042,0.060220
569,genus,species,0.858041,0.710947,0.877675,0.650250,0.855629,0.809780,0.028558


# Findings

* syntatic variations play a bigger role in WE models, example:

  **(development, developed)**: TM-sim: 0.960519 WE-SIM: 0.360895
  
  **(composed, composers)** TM-SIM: 0.973376 WE-SIM: 0.329483
  
  **(works, working)** TM-SIM: 0.969470 WE-SIM: 0.274090
* topic models are better at capturing loose relationships, such as:

  **(war, commander)** TM-SIM: 0.922352 WE-SIM: 0.187498
  
  **(living, households)** TM-SIM: 0.983162 WE-SIM: 0.207906
  
  **(county, rural)** TM-SIM: 0.882099 WE-SIM: 0.257984
  

# Concept categorization in TM and WE

Roughly the same results after using the same algorithm for both systems

In [24]:
def get_embedding_from_word_embedding(word):
    try:
        return vectors[word]
    except:
        return vectors["this"]

columns = [str(i) for i in range(256)]
def get_embedding_from_topics(word):
    df_row = df_topic_probs_full[df_topic_probs_full["word"] == word]
    assert len(df_row) == 1, "not exactly one row found: " + word + " " + len(df_row)
    return df_row[columns].iloc[0,:].tolist()

def get_df_concept(embedding_function):
    df_concept = pnd.read_csv(
        "/home/knub/Repositories/master-thesis/data/concept-categorization/battig_concept-categorization.tsv",
        sep="\t",
        header=None)
    df_concept.columns = ["word", "concept"]
    df_concept["embeddings"] = df_concept["word"].apply(embedding_function)
    return df_concept

df_we_concept = get_df_concept(get_embedding_from_word_embedding)
df_tm_concept = get_df_concept(get_embedding_from_topics)
df_tm_concept.head(2)

Unnamed: 0,word,concept,embeddings
0,dog,land-mammals,"[0.00462947223056, 0.000812221796925, 3.766611..."
1,elephant,land-mammals,"[7.67627605217e-09, 3.94692328836e-08, 0.00024..."


In [25]:
len(df_tm_concept.ix[0,"embeddings"])

256

In [26]:
from sklearn import metrics

# http://stats.stackexchange.com/questions/95731/how-to-calculate-purity
def single_cluster_purity(df_param):
    return df_param["concept"].value_counts().max()

def calculate_purity(df_param):
    purity = float(sum([single_cluster_purity(df_cluster_group)
                        for _, df_cluster_group
                        in df_param.groupby("cluster_id")])) / len(df_param)
    return purity


def evaluate_clustering_algorithm(df_param, clustering):
    X = np.array(df_param["embeddings"].tolist())
    X_sim = metrics.pairwise.pairwise_distances(X, metric="cosine")
    # sim or not sim? PCA or not PCA?
    clusters = clustering.fit_predict(pca(X_sim, 10))
    df_param["cluster_id"] = clusters
    return calculate_purity(df_param)

In [27]:
for df_concept in [df_we_concept, df_tm_concept]:
    print "-" * 100
    for clustering in [KMeans(n_clusters=10, init="k-means++", n_jobs=1),
                       AgglomerativeClustering(n_clusters=10, linkage="ward"),
                       AgglomerativeClustering(n_clusters=10, linkage="complete"),
                       AgglomerativeClustering(n_clusters=10, linkage="average"),
                       AffinityPropagation(damping=0.5),
                       AffinityPropagation(damping=0.6),
                       AffinityPropagation(damping=0.7),
                       AffinityPropagation(damping=0.8),
                       AffinityPropagation(damping=0.9),
                   SpectralClustering(n_clusters=3)]:
        print clustering.__class__.__name__
        print evaluate_clustering_algorithm(df_concept, clustering)

----------------------------------------------------------------------------------------------------
KMeans
0.768292682927
AgglomerativeClustering
0.780487804878
AgglomerativeClustering
0.768292682927
AgglomerativeClustering
0.731707317073
AffinityPropagation
0.719512195122
AffinityPropagation
0.719512195122
AffinityPropagation
0.670731707317
AffinityPropagation
0.670731707317
AffinityPropagation
0.707317073171
SpectralClustering
0.353658536585
----------------------------------------------------------------------------------------------------
KMeans
0.59756097561
AgglomerativeClustering
0.621951219512
AgglomerativeClustering
0.548780487805
AgglomerativeClustering
0.609756097561
AffinityPropagation
0.524390243902
AffinityPropagation
0.524390243902
AffinityPropagation
0.524390243902
AffinityPropagation
0.524390243902
AffinityPropagation
0.524390243902
SpectralClustering
0.317073170732


# Word Similarity

## Similarity

In [28]:
def word_similarity(f):
    try:        
        df_sim = pnd.read_csv(MODEL + f, sep="\t")
        df_sim["embedding-sim"] = df_sim[["word1", "word2"]].apply(
            lambda x: model.get_similarity(x["word1"], x["word2"], vectors), axis=1)
        topic_sim_column = df_sim.columns[3]
        
        topic_corr     = df_sim[["human-sim", topic_sim_column]].corr("spearman").ix[0,1]
        embedding_corr = df_sim[["human-sim", "embedding-sim"]].corr("spearman").ix[0, 1]
        
        return pnd.DataFrame([[topic_corr, embedding_corr]],
                             columns=["topic_corr", "embedding_corr"],
                             index=[f])
    except Exception as e:
        return None

df_tmp = pnd.concat([word_similarity(".wordsim353-all-bhattacharyya"),
            word_similarity(".wordsim353-all-hellinger"),
            word_similarity(".wordsim353-all-jensen-shannon"),
            word_similarity(".wordsim353-all-sum"),
            word_similarity(".wordsim353-rel-bhattacharyya"),
            word_similarity(".wordsim353-rel-hellinger"),
            word_similarity(".wordsim353-rel-jensen-shannon"),
            word_similarity(".wordsim353-rel-sum"),
            word_similarity(".wordsim353-sim-bhattacharyya"),
            word_similarity(".wordsim353-sim-hellinger"),
            word_similarity(".wordsim353-sim-jensen-shannon"),
            word_similarity(".wordsim353-sim-sum")])
df_tmp.sort_values(by="topic_corr", ascending=False)

ValueError: All objects passed were None