# Topic Model Exploration

Bag-of-words (BOW), document-term matrix, and TF-IDF representations of a toy corpus. Based on the gensim tutorial: https://radimrehurek.com/gensim/tut1.html

In [None]:
import logging
# Emit INFO-and-above log records, each prefixed with a timestamp and its severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [1]:
# Toy corpora based on: http://topicmodels.west.uni-koblenz.de/ckling/tmt/part1.pdf
#
# Alternative corpus (disabled): ten three-word documents.
# documents = ["probabilistic topic model",
#     "probabilistic topic model",
#     "probabilistic topic model",
#     "famous fashion model",
#     "famous fashion model",
#     "famous fashion model",
#     "famous fashion model",
#     "famous fashion model",
#     "famous fashion model",
#     "famous fashion model"
# ]
#
# Active corpus: four short documents mixing computing terms (modem, linux)
# with car terms (steering, clutch, gear), plus plenty of punctuation.
documents = [
    "modem the steering linux. modem, linux the modem. steering the modem. linux!",
    "linux; the linux. the linux modem linux. the modem, clutch the modem. gear.",
    "gear! clutch the steering, steering, linux. the steering clutch gear. clutch the gear; the clutch.",
    "the the the. clutch clutch clutch! steering gear; steering gear gear; steering gear!!!!",
]
len(documents)

4

In [2]:
from nltk.tokenize import word_tokenize
import string

# Tokenise each document and drop every token that consists solely of punctuation.
# A set of characters is used on purpose: `string.punctuation` is a plain str, so
# `token in string.punctuation` is a substring test and lets multi-character
# punctuation tokens (e.g. "..." or "''") slip through.
punctuation_chars = set(string.punctuation)
texts = [
    [token for token in word_tokenize(document)
     if not all(char in punctuation_chars for char in token)]
    for document in documents
]
print(texts)

[['modem', 'the', 'steering', 'linux', 'modem', 'linux', 'the', 'modem', 'steering', 'the', 'modem', 'linux'], ['linux', 'the', 'linux', 'the', 'linux', 'modem', 'linux', 'the', 'modem', 'clutch', 'the', 'modem', 'gear'], ['gear', 'clutch', 'the', 'steering', 'steering', 'linux', 'the', 'steering', 'clutch', 'gear', 'clutch', 'the', 'gear', 'the', 'clutch'], ['the', 'the', 'the', 'clutch', 'clutch', 'clutch', 'steering', 'gear', 'steering', 'gear', 'gear', 'steering', 'gear']]


In [3]:
from gensim import corpora

# Build the vocabulary: every distinct token in `texts` is mapped to an integer id.
dictionary = corpora.Dictionary(documents=texts)
print(dictionary.token2id)

{'modem': 0, 'the': 1, 'steering': 2, 'linux': 3, 'clutch': 4, 'gear': 5}




In [4]:
# Encode each tokenised document as a sparse list of (token_id, count) pairs.
bow = list(map(dictionary.doc2bow, texts))
bow

[[(0, 4), (1, 3), (2, 2), (3, 3)],
 [(0, 3), (1, 4), (3, 4), (4, 1), (5, 1)],
 [(1, 4), (2, 3), (3, 1), (4, 4), (5, 3)],
 [(1, 3), (2, 3), (4, 3), (5, 4)]]

In [5]:
import pandas as pd
import numpy as np

# One column per document, labelled D1..Dn, derived from the corpus size so the
# labels never have to be edited by hand when the corpus changes.
columns = ['D{}'.format(doc_no) for doc_no in range(1, len(bow) + 1)]

# Row labels ordered by token id: the cell that fills this frame writes by
# position (row position == token id), so the label order must not depend on
# the iteration order of the `token2id` dict.
vocabulary = sorted(dictionary.token2id, key=dictionary.token2id.get)

# Empty term x document matrix, filled with the raw counts afterwards.
doc_term = pd.DataFrame(
    np.zeros((len(vocabulary), len(bow))),
    columns=columns,
    index=vocabulary,
)

In [6]:
# Copy the sparse counts into the dense frame: row = token id, column = document position.
for doc_idx, sparse_doc in enumerate(bow):
    for token_id, count in sparse_doc:
        doc_term.iat[token_id, doc_idx] = count
print(doc_term)

           D1   D2   D3   D4
modem     4.0  3.0  0.0  0.0
the       3.0  4.0  4.0  3.0
steering  2.0  0.0  3.0  3.0
linux     3.0  4.0  1.0  0.0
clutch    0.0  1.0  4.0  3.0
gear      0.0  1.0  3.0  4.0


In [7]:
from gensim import models

# Fit the TF-IDF model on the bag-of-words corpus, then re-weight every document.
# NB: despite its name, `tfidf_model` holds the transformed documents; `model` is the fitted model.
# Token id 1 ("the") occurs in all four documents and is absent from every resulting vector.
model = models.TfidfModel(corpus=bow)
tfidf_model = [model[bow_vector] for bow_vector in bow]
tfidf_model

[[(0, 0.936603022962913), (2, 0.19436268823376643), (3, 0.29154403235064963)],
 [(0, 0.8624176140851579),
  (3, 0.47724753317857443),
  (4, 0.11931188329464361),
  (5, 0.11931188329464361)],
 [(2, 0.50709255283711),
  (3, 0.1690308509457033),
  (4, 0.6761234037828132),
  (5, 0.50709255283711)],
 [(2, 0.5144957554275266), (4, 0.5144957554275266), (5, 0.6859943405700354)]]

In [8]:
# Empty term x document frame that will receive the TF-IDF weights.
# Rows are ordered by token id because the fill loop addresses them by position;
# relying on the iteration order of `dictionary.token2id` is only correct if that
# dict happens to be ordered by id.
tfidf = pd.DataFrame(
    np.zeros((len(dictionary.token2id), len(bow))),
    columns=columns,
    index=sorted(dictionary.token2id, key=dictionary.token2id.get),
)

In [9]:
# Copy the sparse TF-IDF weights into the dense frame: row = token id, column = document position.
for doc_idx, weighted_doc in enumerate(tfidf_model):
    for token_id, weight in weighted_doc:
        tfidf.iat[token_id, doc_idx] = weight
print(tfidf)

                D1        D2        D3        D4
modem     0.936603  0.862418  0.000000  0.000000
the       0.000000  0.000000  0.000000  0.000000
steering  0.194363  0.000000  0.507093  0.514496
linux     0.291544  0.477248  0.169031  0.000000
clutch    0.000000  0.119312  0.676123  0.514496
gear      0.000000  0.119312  0.507093  0.685994


# SVD, LSA (LSI)

## With gensim

In [10]:
# Fit a 2-topic LSI model directly on the raw bag-of-words counts (no TF-IDF step is involved).
lsi = models.LsiModel(corpus=bow, num_topics=2, id2word=dictionary)
# Wrap the corpus so that each document is projected bow -> LSI when it is iterated.
corpus_lsi = lsi[bow]

lsi.print_topics(-1)

[(0,
  '0.624*"the" + 0.384*"clutch" + 0.376*"gear" + 0.364*"steering" + 0.335*"linux" + 0.277*"modem"'),
 (1,
  '0.583*"modem" + 0.514*"linux" + -0.412*"gear" + -0.397*"clutch" + -0.242*"steering" + 0.099*"the"')]

In [11]:
# Rows follow the token ids (modem, the, steering, linux, clutch, gear); columns are the two topics.
# The entries are the per-term weights reported by print_topics above.
lsi.projection.u # left singular vectors (U)

array([[ 0.27744519,  0.5826725 ],
       [ 0.62416737,  0.09912868],
       [ 0.36351351, -0.24249522],
       [ 0.33508594,  0.51399463],
       [ 0.38393822, -0.39735845],
       [ 0.37630578, -0.4120414 ]])

In [12]:
# One singular value per topic; the larger one comes first here.
lsi.projection.s # singular values (Σ)

array([ 11.25526786,   6.53892079])

In [13]:
from gensim import matutils

# Recover V^T from the LSI-projected corpus, following the gensim FAQ:
# https://github.com/RaRe-Technologies/gensim/wiki/Recipes-&-FAQ#q3-how-do-you-calculate-the-matrix-v-in-lsi-space
singular_values = lsi.projection.s
projected_docs = matutils.corpus2dense(corpus_lsi, len(singular_values))  # topics x documents
# Divide each topic row by its singular value.
projected_docs / singular_values.reshape(-1, 1)  # right singular vectors (V^T)

array([[ 0.41887654,  0.48240457,  0.58523405,  0.4993289 ],
       [ 0.56355929,  0.51860406, -0.40412391, -0.50013462]])

In [14]:
# Dimensionality reduction: every document becomes a 2-dimensional vector in LSI space.
# The bow -> LSI projection is executed on the fly while iterating (no TF-IDF step is involved).
for lsi_vector in corpus_lsi:
    print(lsi_vector)

[(0, 4.7145677241169297), (1, 3.6850694969601743)]
[(0, 5.4295928192507432), (1, 3.3911108898747315)]
[(0, 6.5869661866505886), (1, -2.6425342939042058)]
[(0, 5.6200804337666819), (1, -3.2703405625661435)]


## With scikit-learn

In [15]:
from sklearn.utils.extmath import randomized_svd

# scikit-learn expects documents x terms, hence the transpose of the term x document frame.
# random_state is pinned so the randomized solver returns the same factors on every run,
# independent of the library's default for this argument.
U, Sigma, VT = randomized_svd(doc_term.values.T, n_components=2, random_state=0)

In [16]:
# Rows are documents, columns are components; this matches the V^T derived from gensim above, transposed.
print(U)

[[ 0.41887655  0.56355928]
 [ 0.48240458  0.51860406]
 [ 0.58523407 -0.40412392]
 [ 0.49932889 -0.5001346 ]]


In [17]:
# Same singular values as gensim's lsi.projection.s above.
print(Sigma)

[ 11.25526786   6.53892079]


In [18]:
# Rows are components, columns are terms; this is the transpose of gensim's lsi.projection.u above.
print(VT)

[[ 0.27744519  0.62416737  0.36351351  0.33508594  0.38393822  0.37630578]
 [ 0.5826725   0.09912868 -0.24249522  0.51399463 -0.39735845 -0.4120414 ]]


[TruncatedSVD](http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html) is used below to (1) perform dimensionality reduction on the document-term matrix and (2) obtain extra characteristics such as the explained variance.

In [19]:
from sklearn.decomposition import TruncatedSVD

# Documents x terms view of the count matrix (doc_term itself is terms x documents).
doc_term_by_doc = doc_term.values.T

# n_iter=1: a single power iteration already reproduces the gensim projection on this
# tiny 4x6 matrix (scikit-learn's default is 5; keep the default for real corpora).
# random_state pins the randomized solver so the components are reproducible across runs.
svd = TruncatedSVD(n_components=2, n_iter=1, random_state=0)
svd.fit(doc_term_by_doc)
print(svd.transform(doc_term_by_doc))

[[ 4.71456772  3.6850695 ]
 [ 5.42959282  3.39111089]
 [ 6.58696619 -2.64253429]
 [ 5.62008043 -3.27034056]]


In [20]:
'''SVD suffers from a problem called "sign indeterminancy", which means the
sign of the ``components_`` and the output from transform depend on the
algorithm and random state. To work around this, fit instances of this
class to data once, then keep the instance around to do transformations.'''
# The note above appears to be quoted from the scikit-learn TruncatedSVD docs - TODO confirm.
# Rows are components, columns are terms (ordered by token id); the values equal VT from randomized_svd.
print(svd.components_) # V^T

[[ 0.27744519  0.62416737  0.36351351  0.33508594  0.38393822  0.37630578]
 [ 0.5826725   0.09912868 -0.24249522  0.51399463 -0.39735845 -0.4120414 ]]


In [21]:
# Variance of each column of the projected documents printed above.
# TruncatedSVD does not centre the data, so the first component is dominated by the level
# shared by all documents (all four projections are between 4.7 and 6.6) and shows LESS
# variance than the second one; the values are therefore not in descending order.
print(svd.explained_variance_)

[  0.4467348  10.6047913]


In [22]:
# Share of the total per-term variance of the count matrix captured by each component.
# Same caveat as for explained_variance_: the data is not centred, so the ratios are not
# in descending order. Together the two components account for roughly 89% of the variance.
print(svd.explained_variance_ratio_)

[ 0.03591838  0.85264654]
