# Agenda
1. concept categorization data
1. concept categorization (how to define on topic models?)
1. spearmint
1. wikipedia training data
1. plots

# Data

* Latest Wikipedia corpus
* Extracted plain text
* Only used first 1000 words per document
* 87,380,300 sentences
* 1,813,672,600 words, 9,996,018 unique words

## topic model training
* Using mallet
* 256 topics, 400 iterations

## word2vec training
* skip-gram model in gensim
* remove words occurring fewer than 50 times --> 386,046 unique words (98 % of original corpus)


# Setup

In [230]:
%matplotlib notebook

import itertools
from functools import partial
import numpy as np
import gensim, logging
import pandas as pnd
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, RandomizedPCA
import matplotlib.pyplot as plt

In [112]:
# Configure the root logger: timestamped "time : level : message" records at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [139]:
# Inject a CSS override that enlarges the font (1.3em) and tightens the line
# height (1.1em) of paragraphs and lists in rendered text cells and of
# DataFrame tables. The HTML object is the cell's last expression, so the
# notebook renders it.
from IPython.core.display import HTML
HTML("""
<style>
div.text_cell_render p, div.text_cell_render ul, table.dataframe {
font-size:1.3em;
line-height:1.1em;
}
</style>
""")

# Preprocessing

## Topic model

In [114]:
# Load the topic file and reshape it into long form: one row per
# (topic, word position) instead of one row per topic.
df_topics = pnd.read_csv("../code/resources/topics.txt", header=None)
# The row number serves as topic id; the word in column 0 doubles as topic name.
df_topics["topic"] = df_topics.index
df_topics["topic_name"] = df_topics[0]

df = (
    pnd.melt(df_topics, id_vars=["topic", "topic_name"], var_name="position", value_name="word")
    .loc[:, ["word", "topic", "topic_name", "position"]]
    .sort_values(by=["topic", "position"])
    .reset_index(drop=True)
)
# Sanity check: show the words of the first topic.
df[df.topic == 0]

Unnamed: 0,word,topic,topic_name,position
0,english,0,english,0
1,england,0,english,1
2,king,0,english,2
3,one,0,english,3
4,scotland,0,english,4
5,britain,0,english,5
6,british,0,english,6
7,london,0,english,7
8,john,0,english,8
9,name,0,english,9


## Word embeddings

In [4]:
# Directory holding the embedding vector files.
# NOTE(review): this is an absolute path on the author's machine; it is kept
# in a single constant so that it only has to be adjusted in one place.
DATA_DIR = "/home/knub/Repositories/master-thesis/data"
WORD2VEC_VECTOR_FILE = DATA_DIR + "/GoogleNews-vectors-negative300.bin"
GLOVE_VECTOR_FILE = DATA_DIR + "/glove.6B.50d.txt"
OUR_VECTOR_FILE = DATA_DIR + "/embedding.model.skipgram"

# Alternative: load the text-format GloVe vectors instead of the binary word2vec file.
#word2vec = gensim.models.Word2Vec.load_word2vec_format(GLOVE_VECTOR_FILE, binary=False)
word2vec = gensim.models.Word2Vec.load_word2vec_format(WORD2VEC_VECTOR_FILE, binary=True)

In [115]:
# Keep only the topic words that have a vector in the embedding model.
# The explicit .copy() makes the filtered result an independent frame, so
# adding the "embeddings" column below does not write into a slice of the
# previous `df` (which would trigger pandas' SettingWithCopyWarning).
df = df[df["word"].apply(lambda word: word in word2vec)].copy()
# Attach each word's embedding vector as an additional column.
df["embeddings"] = df["word"].apply(lambda word: word2vec[word])
# Sanity check: show the words of the first topic together with their vectors.
df[df.topic == 0]

Unnamed: 0,word,topic,topic_name,position,embeddings
0,english,0,english,0,"[-0.139648, -0.292969, 0.0800781, -0.00927734,..."
1,england,0,english,1,"[-0.367188, -0.0349121, 0.11084, 0.400391, 0.1..."
2,king,0,english,2,"[0.125977, 0.0297852, 0.00860596, 0.139648, -0..."
3,one,0,english,3,"[0.0456543, -0.145508, 0.15625, 0.166016, 0.10..."
4,scotland,0,english,4,"[-0.0732422, -0.176758, 0.108398, 0.240234, 0...."
5,britain,0,english,5,"[-0.208984, 0.160156, 0.0380859, 0.566406, -0...."
6,british,0,english,6,"[-0.0888672, 0.024292, 0.074707, 0.359375, -0...."
7,london,0,english,7,"[-0.460938, 0.0279541, -0.267578, 0.410156, -0..."
8,john,0,english,8,"[-0.253906, 0.147461, -0.081543, 0.208984, -0...."
9,name,0,english,9,"[0.148438, 0.152344, 0.0693359, 0.0478516, -0...."


In [242]:
# financial, muslim, teams in sport, atom physics, math
nice_topics = [5, 117, 158, 164, 171]

# Independent copy restricted to the hand-picked topics.
df_part = df[df.topic.isin(nice_topics)].copy()
# Show topics of interest: one row per topic, one column per word.
words_per_topic = df_part.groupby("topic")["word"].apply(lambda l: l.tolist())
# Label the rows with the topic ids that groupby actually produced instead of
# re-using `nice_topics`: groupby sorts by topic and drops topics without any
# remaining word, so a positional assignment could mislabel rows or fail.
df_tmp = pnd.DataFrame(words_per_topic.tolist(), index=words_per_topic.index.tolist())
df_tmp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
5,economic,economy,financial,foreign,gdp,government,bank,oil,trade,countries
117,islamic,muslim,islam,population,arab,ibn,muslims,muhammad,afghanistan,arabic
158,team,league,season,first,game,new,football,nfl,record,teams
164,electrons,particles,electron,energy,charge,electric,units,current,particle,one
171,distribution,mean,probability,sample,data,random,population,used,two,function


## Dimensionality reduction

In [125]:
def pca(embeddings):
    """Reduce `embeddings` to two components with RandomizedPCA.

    Returns the output of `fit_transform` on a freshly created model.
    """
    return RandomizedPCA(n_components=2).fit_transform(embeddings)

def tsne(embeddings):
    """Reduce `embeddings` to two components with t-SNE (default initialization).

    Returns the output of `fit_transform` on a freshly created model.
    """
    return TSNE(n_components=2).fit_transform(embeddings)

def tsne_with_init_pca(embeddings):
    """Reduce `embeddings` to two components with t-SNE, using init="pca".

    Returns the output of `fit_transform` on a freshly created model.
    """
    return TSNE(n_components=2, init="pca").fit_transform(embeddings)

# Topic model in word embedding space

## Plot preparation

In [126]:
def plot_topics_in_embedding_space(reduction_method, df):
    """Scatter-plot the words of the `nice_topics` in a reduced embedding space.

    Parameters
    ----------
    reduction_method : callable
        Called with the stacked embedding array of *all* rows in `df`; the
        first two columns of its result are used as x and y coordinates.
    df : DataFrame
        Must provide the columns "embeddings", "topic" and "word".

    Only rows whose topic is in the global `nice_topics` are drawn. Each
    topic has a fixed colour and every point is labelled with its word.
    """
    topic_colors = {5: "red", 117: "blue", 158: "green", 164: "yellow", 171: "black"}

    vectors = np.array(df["embeddings"].tolist())
    projected = reduction_method(vectors)

    # Work on a copy so the caller's frame does not gain x/y columns.
    df_plot = df.copy()
    df_plot["x"] = projected[:, 0]
    df_plot["y"] = projected[:, 1]
    is_nice_topic = df_plot.topic.apply(lambda topic: topic in nice_topics)
    df_plot = df_plot[is_nice_topic]

    plt.figure(figsize=(12, 8))
    point_colors = df_plot.topic.apply(lambda topic: topic_colors[topic])
    plt.scatter(df_plot.x, df_plot.y, c=point_colors, s=80)

    # Shift each label down by 1 % of the y-axis range so it sits below its marker.
    y_low, y_high = plt.gca().get_ylim()
    label_offset = (y_high - y_low) / 100

    for _, point in df_plot.iterrows():
        plt.text(point.x, point.y - label_offset, point.word,
                 horizontalalignment='center', verticalalignment='top')

## PCA

In [127]:
#plot_topics_in_embedding_space(pca, df)

In [128]:
# PCA is fitted only on the words of the selected nice_topics (df_part).
plot_topics_in_embedding_space(pca, df_part)

<IPython.core.display.Javascript object>

## TSNE

In [129]:
# t-SNE is fitted on the words of all topics (df); the plot function then keeps only the nice_topics points.
plot_topics_in_embedding_space(tsne, df)

<IPython.core.display.Javascript object>

## TSNE with PCA initialization

In [130]:
#plot_topics_in_embedding_space(tsne_with_init_pca, df)

## Findings

Topics from the topic model do not seem to be in similar positions in the vector space, in general.

* **Specificity**: The more specific a word is, the closer it is to similar words in the word embedding space. See the "muslim", "islam", "muhammad" cluster.
* **Ambiguity**: Ambiguous words are a special problem, for example "current" is far away from the other physics terms because it has too many meanings. In fact, it is very close to the word "new".
* **Context-based similarity**: Topic models can assign different similarities between words based on the context. They are good at finding similar words in a context, which might not be immediately obvious. Example: "distribution" is not very similar to "function", however in the company of "mean", "probability", "data", "random" it is. See also "Exploring the Space of Topic Coherence Measures" by Röder et al.

# Word embedding similarity of topics 

## Avg. pairwise similarity

In [207]:
def average_pairwise_similarity(words, word2vec):
    """Mean similarity over all unordered pairs of distinct words.

    Parameters
    ----------
    words : sequence of str
        The words to compare.
    word2vec : object
        Anything exposing `similarity(word1, word2)`.

    Returns
    -------
    float
        The mean of the pairwise similarities, or NaN when there is no pair
        of distinct words (fewer than two words, or only repeats of one word).
    """
    # combinations() yields every unordered pair exactly once, so nothing has
    # to be generated twice and then filtered away. Pairs of identical words
    # are skipped, as they carry no information about the topic.
    similarities = [word2vec.similarity(word1, word2)
                    for word1, word2 in itertools.combinations(words, 2)
                    if word1 != word2]
    if not similarities:
        # Explicit NaN instead of np.mean([]), which emits a RuntimeWarning.
        return np.nan
    return np.mean(similarities)

def average_top_similarity(words, word2vec):
    """Mean, over the words, of each word's highest similarity to another word.

    `word2vec` must expose `similarity(word1, word2)`. Similarities are
    computed for every ordered pair and then grouped by the first word of
    the pair; consecutive pairs sharing the same first word form one group.
    """
    scored_pairs = []
    for first, second in itertools.permutations(words, 2):
        scored_pairs.append((first, word2vec.similarity(first, second)))

    best_per_word = []
    for _, run in itertools.groupby(scored_pairs, lambda pair: pair[0]):
        best_per_word.append(max(score for _, score in run))

    return np.mean(best_per_word)

#average_top_similarity(["king", "queen", "test"], word2vec)      

In [215]:
# Prefix lengths (number of top words per topic) for which similarities are computed.
topic_lengths = list(range(2, 11))
def calculate_similarities_for_topic(df_topic, sim_function):
    """Score one topic for every prefix length in `topic_lengths`.

    Parameters
    ----------
    df_topic : DataFrame
        Rows of a single topic; the "word" column is read in row order.
    sim_function : callable
        Called as `sim_function(words, word2vec)` with the global `word2vec`.

    Returns
    -------
    Series
        One score per entry of `topic_lengths`, in the same order. If the
        topic has fewer words than a prefix length, the slice is simply
        shorter and the score is computed on the words available.
    """
    topic_words = df_topic["word"].tolist()

    scores = []
    for n_words in topic_lengths:
        scores.append(sim_function(topic_words[:n_words], word2vec))

    return pnd.Series(scores)

def calculate_similarity_matrix(sim_function):
    """Build the topic x prefix-length similarity table for `sim_function`.

    Groups the global `df` by "topic", scores every group with
    `calculate_similarities_for_topic` and names the resulting columns
    "<n>-words" for each n in `topic_lengths`.
    """
    per_topic = df.groupby("topic").apply(
        lambda df_topic: calculate_similarities_for_topic(df_topic, sim_function)
    )
    per_topic.columns = ["%s-words" % n_words for n_words in topic_lengths]
    return per_topic

In [224]:
# Pairwise-similarity table: one row per topic, one column per prefix length in topic_lengths.
df_similarities = calculate_similarity_matrix(average_pairwise_similarity)
# Column-wise mean, i.e. the average over all topics for each prefix length.
df_similarities.mean()

2-words     0.286192
3-words     0.246161
4-words     0.232866
5-words     0.220750
6-words     0.209611
7-words     0.205302
8-words     0.197678
9-words     0.192038
10-words    0.188011
dtype: float64

In [216]:
# Mean similarity over all topics, one value per prefix length.
means = df_similarities.mean().tolist()

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(topic_lengths, means, s=80)
ax.set_title("Avg. similarity (cosine similarity in WE space) of topics up to the nth word")
ax.set_xlim(0, 11)
ax.set_xticks(list(range(1, 12)))
# Optional fixed y-range: ax.set_ylim((0, 0.35))
ax.set_xlabel("topic length")
ax.set_ylabel("average similarity")

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x7fd6a392b2d0>

## Highest-similar topics

For comparison, here are a few standard similarities:

**king-queen**: {{word2vec.similarity("king", "queen")}}
**topic-topics**: {{word2vec.similarity("topic", "topics")}}
**buy-purchase**: {{word2vec.similarity("buy", "purchase")}}

In [217]:
def show_highest_similar_topics(topic_length, nr_topics=3):
    """Return the `nr_topics` topics with the highest "<topic_length>-words" score.

    Reads the global `df_similarities`, sorts it in descending order by the
    requested column, and joins the leading rows with the global `df_topics`
    on the index. The result holds the score column followed by the word
    columns 0 .. topic_length - 1.
    """
    score_column = "%s-words" % topic_length
    ranked = df_similarities.sort_values(by=score_column, ascending=False)
    top_rows = ranked.iloc[:nr_topics]
    word_columns = list(range(topic_length))
    return top_rows.join(df_topics)[[score_column] + word_columns]

In [218]:
# Topics ranked by the score of their first 3 words.
show_highest_similar_topics(3)

Unnamed: 0_level_0,3-words,0,1,2
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
164,0.704689,electrons,particles,electron
117,0.694833,islamic,muslim,islam
219,0.664669,india,pakistan,indian


In [219]:
# Topics ranked by the score of their first 6 words.
show_highest_similar_topics(6)

Unnamed: 0_level_0,6-words,0,1,2,3,4,5
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
190,0.528635,music,band,album,rock,song,songs
239,0.501541,dna,proteins,protein,rna,cell,gene
188,0.472879,game,player,ball,players,play,football


In [220]:
# Topics ranked by the score of their first 10 words.
show_highest_similar_topics(10)

Unnamed: 0_level_0,10-words,0,1,2,3,4,5,6,7,8,9
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
239,0.433494,dna,proteins,protein,rna,cell,gene,genes,cells,sequence,amino
117,0.426365,islamic,muslim,islam,population,arab,ibn,muslims,muhammad,afghanistan,arabic
61,0.376226,day,year,time,calendar,days,month,years,months,date,second


## Findings

* In general, similarity is not very high after the first few words

# Concept categorization in WE

In [356]:
# Stack the embedding vectors of the selected topics and reduce them to 50
# components before clustering.
embeddings = np.array(df_part["embeddings"].tolist())
# The model gets its own name: binding it to `pca` would overwrite the
# pca() reduction helper defined earlier in this notebook, so the PCA plot
# cells would fail when re-run after this cell. The fixed random_state
# makes the randomized decomposition repeatable across runs.
pca_50 = RandomizedPCA(n_components=50, random_state=0)
embeddings = pca_50.fit_transform(embeddings)
embeddings.shape

(50, 50)

In [375]:
# Cluster the reduced embeddings into as many clusters as there are selected
# topics. The fixed random_state makes the k-means++ initialisation, and with
# it the inertia and the cluster assignment, repeatable across runs.
kmeans = KMeans(n_clusters=5, init="k-means++", n_jobs=1, random_state=0)
clusters = kmeans.fit_predict(embeddings)
# Function-call form of print: same output under Python 2, valid under Python 3.
print(kmeans.inertia_)
# Remember the assigned cluster of every word for the purity computation.
df_part["cluster_id"] = clusters

263.405825802


In [376]:
def topic_purity(df_topic):
    """Share of a topic's dominant cluster that is made up of this topic.

    Finds the cluster id occurring most often in `df_topic["cluster_id"]`
    and divides that count by the total number of rows in the global
    `df_part` assigned to the same cluster. Note that the denominator is the
    cluster size, not the topic size.
    """
    cluster_counts = df_topic["cluster_id"].value_counts()
    dominant_cluster = cluster_counts.idxmax()
    dominant_count = cluster_counts.max()

    members_of_dominant_cluster = df_part[df_part["cluster_id"] == dominant_cluster]
    return float(dominant_count) / len(members_of_dominant_cluster)

# Average the per-topic purity over all selected topics.
per_topic_purity = []
for _, df_topic in df_part.groupby("topic"):
    per_topic_purity.append(topic_purity(df_topic))
purity = np.mean(per_topic_purity)
purity

0.7811391223155929