In [None]:
import pandas as pd
# Load the Wikipedia article table from the JSON export under ../data.
wiki_df = pd.read_json(path_or_buf="../data/wikidata_pandas.json")

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GaussianMixture
from sklearn.feature_selection import mutual_info_regression
import pandas as pd

def no_number_preprocessor(tokens):
    """Lowercase a raw document string and remove every run of digits.

    Used as the ``preprocessor`` callable of the ``TfidfVectorizer`` below, so
    despite its name ``tokens`` is a single document string, not a token list.

    Parameters
    ----------
    tokens : str
        Raw document text.

    Returns
    -------
    str
        The lowercased text with all digit characters deleted.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\d+', '', tokens.lower())
  
# TF-IDF bag of words over the article text (English stop words removed,
# digits stripped by the custom preprocessor).
vectorizer = TfidfVectorizer(stop_words='english', preprocessor=no_number_preprocessor)
bag_of_words = vectorizer.fit_transform(wiki_df.text)

# LSA: project the TF-IDF matrix onto 50 components. TruncatedSVD uses a
# randomized solver by default, so pin random_state to keep the topics that the
# prose below describes reproducible across kernel restarts.
svd = TruncatedSVD(n_components=50, random_state=0)
lsa = svd.fit_transform(bag_of_words)

# One row per document, one column per LSA component.
topic_encoded_df = pd.DataFrame(lsa)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old installations. Keep `dictionary` a plain list
# because later cells call dictionary.index(...).
if hasattr(vectorizer, "get_feature_names_out"):
    dictionary = list(vectorizer.get_feature_names_out())
else:
    dictionary = vectorizer.get_feature_names()

# Transposed so that rows are vocabulary terms and columns are components.
encoding_matrix = pd.DataFrame(svd.components_,
                               columns=dictionary).T


#### Interpret The Encoding Matrix

Note that topic 1 (column `0` of the encoding matrix) is not semantically coherent: its top terms are a mix of baseball and jazz words.

In [None]:
# Twenty terms with the largest loading on component 0, shown next to their
# loadings on components 1 and 2.
display(
    encoding_matrix.loc[:, [0, 1, 2]]
    .sort_values(by=0, ascending=False)
    .head(20)
)

#### Interpret The Encoding Matrix

Note that topic 2 (column `1` of the encoding matrix) is semantically coherent: its top terms are all baseball words.

In [None]:
# Twenty terms with the largest loading on component 1, shown next to their
# loadings on components 0 and 2.
display(
    encoding_matrix.loc[:, [0, 1, 2]]
    .sort_values(by=1, ascending=False)
    .head(20)
)

#### Interpret The Encoding Matrix

Note that topic 3 (column `2` of the encoding matrix) is semantically coherent: its top terms are all jazz words.

In [None]:
# Twenty terms with the largest loading on component 2, shown next to their
# loadings on components 0 and 1.
display(
    encoding_matrix.loc[:, [0, 1, 2]]
    .sort_values(by=2, ascending=False)
    .head(20)
)

### Cluster Model

A classical pipeline for building a cluster model looks like this:

![](https://www.evernote.com/l/AAGuoaYyLFNOLL7fUDxfug7PS3ugGJt-68MB/image.png)

1. Start from the raw data (here, the bag of words).
2. Use it to build a Low-Rank Model (LRM; here, LSA).
3. Fit a cluster model (here, a GMM) to the low-rank representation.

Typically, the LRM is constructed by performing an eigendecomposition (or a variant such as the singular value decomposition used by LSA) on the original data and keeping the vectors with the largest eigenvalues (or singular values), i.e. the first _n_ vectors.

## Document Clustering via Gaussian Mixture Model

In [None]:
# Fit a two-component Gaussian mixture on LSA components 0 and 1 only.
# random_state is pinned because the cluster ids (0/1) are otherwise free to
# swap between runs, and the plotting cell maps a specific id to a colour.
gmm = GaussianMixture(n_components=2, random_state=0)
labels = gmm.fit_predict(topic_encoded_df[[0, 1]])

In [None]:
# Sanity check: shape of the label array returned by gmm.fit_predict above.
labels.shape

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 2, figsize=(20, 5))
# The GMM plotted here was fitted on LSA components 0 and 1 only.
fig.suptitle('GMM fitted on the first two LSA components', fontsize=16)

# Left panel: documents coloured by their category in wiki_df.
for val in wiki_df.category.unique():
    # Use a plain boolean array so the mask is applied by position; wiki_df's
    # index is not guaranteed to match topic_encoded_df's default RangeIndex.
    in_category = (wiki_df.category == val).values
    color = "red" if val == "Baseball" else "green"
    ax[0].scatter(topic_encoded_df.loc[in_category, 0].values,
                  topic_encoded_df.loc[in_category, 1].values,
                  c=color, alpha=0.3, label=val)

# Right panel: the same documents coloured by GMM cluster id. Cluster ids are
# arbitrary, so red/green here need not line up with the left panel.
cluster_colors = ["red" if label == 0 else "green" for label in labels]
topic_encoded_df.plot(kind="scatter", x=0, y=1, alpha=0.3, c=cluster_colors, ax=ax[1])

ax[0].set_title('True category')
ax[1].set_title('GMM cluster')
for axis in ax:
    axis.set_xlabel('topic_1')
    axis.set_ylabel('topic_2')
ax[0].axvline(linewidth=0.5)
ax[0].axhline(linewidth=0.5)
# Only the left panel has labelled artists; legend() on the right panel would
# just emit a "no artists with labels" warning.
ax[0].legend();

In [None]:
from sklearn.mixture import GaussianMixture

# Refit the two-component Gaussian mixture, this time on LSA components 1 and 2.
# random_state is pinned because the cluster ids (0/1) are otherwise free to
# swap between runs, and the plotting cell maps a specific id to a colour.
gmm = GaussianMixture(n_components=2, random_state=0)
labels = gmm.fit_predict(topic_encoded_df[[1, 2]])

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 2, figsize=(20, 5))
# The GMM plotted here was fitted on LSA components 1 and 2 only.
fig.suptitle('GMM fitted on the second and third LSA components', fontsize=16)

# Left panel: documents coloured by their category in wiki_df.
for val in wiki_df.category.unique():
    # Use a plain boolean array so the mask is applied by position; wiki_df's
    # index is not guaranteed to match topic_encoded_df's default RangeIndex.
    in_category = (wiki_df.category == val).values
    color = "red" if val == "Baseball" else "green"
    ax[0].scatter(topic_encoded_df.loc[in_category, 1].values,
                  topic_encoded_df.loc[in_category, 2].values,
                  c=color, alpha=0.3, label=val)

# Right panel: the same documents coloured by GMM cluster id. Cluster ids are
# arbitrary, so red/green here need not line up with the left panel.
cluster_colors = ["red" if label == 1 else "green" for label in labels]
topic_encoded_df.plot(kind="scatter", x=1, y=2, alpha=0.3, c=cluster_colors, ax=ax[1])

ax[0].set_title('True category')
ax[1].set_title('GMM cluster')
# Columns 1 and 2 are the second and third topics in the 1-based naming used
# by the prose, so label the axes accordingly.
for axis in ax:
    axis.set_xlabel('topic_2')
    axis.set_ylabel('topic_3')
ax[0].axvline(linewidth=0.5)
ax[0].axhline(linewidth=0.5)
# Only the left panel has labelled artists; legend() on the right panel would
# just emit a "no artists with labels" warning.
ax[0].legend();

In [None]:
# For every LSA component: the vocabulary positions of its 20 highest-loading
# terms. A word -> position map replaces repeated dictionary.index(word) calls,
# each of which scans the whole vocabulary.
word_to_index = {word: position for position, word in enumerate(dictionary)}
topic_lists = [
    [word_to_index[word]
     for word in encoding_matrix[i].sort_values(ascending=False).head(20).index]
    # Iterate over the components actually present instead of a hard-coded 50.
    for i in encoding_matrix.columns
]

In [None]:
# Densify the sparse TF-IDF matrix into a DataFrame (rows: documents, columns:
# vocabulary positions) for the column-wise lookups in PMI below.
# The guard keeps the cell idempotent: the name is rebound from a sparse matrix
# to a DataFrame, and re-running an unguarded conversion would fail.
# toarray() is used so the frame is built from an ndarray, not an np.matrix.
if not isinstance(bag_of_words, pd.DataFrame):
    bag_of_words = pd.DataFrame(bag_of_words.toarray())

In [None]:
from itertools import combinations

def PMI(i1, i2, random_state=0):
    """Estimate the mutual information between two vocabulary columns.

    Despite the name, this is the ``mutual_info_regression`` estimate between
    the TF-IDF columns ``i1`` and ``i2`` of the global ``bag_of_words`` frame,
    not a classical pointwise mutual information computed from co-occurrence
    counts.

    Parameters
    ----------
    i1, i2 : int
        Column labels of ``bag_of_words`` (vocabulary positions).
    random_state : int or None, default 0
        Forwarded to ``mutual_info_regression``. The estimator is randomized,
        so a fixed seed makes the coherence scores repeatable; pass ``None``
        to restore the unseeded behaviour.

    Returns
    -------
    float
        The estimated mutual information of the two columns.
    """
    # X must be 2-D (hence the double brackets); y is the single target column.
    return mutual_info_regression(
        bag_of_words[[i1]], bag_of_words[i2], random_state=random_state
    )[0]

def coherence(topic_indices):
    """Score a topic by summing ``PMI`` over every unordered pair of its terms.

    Parameters
    ----------
    topic_indices : iterable of int
        Vocabulary positions of the topic's top terms.

    Returns
    -------
    number
        Sum of ``PMI(a, b)`` over all 2-combinations of ``topic_indices``;
        ``0`` when there are fewer than two indices.
    """
    # Uses the builtin sum() rather than a local accumulator named `sum`,
    # which would shadow the builtin inside this function.
    return sum(PMI(first, second) for first, second in combinations(topic_indices, 2))

In [None]:
# One coherence score per topic, in the same order as topic_lists.
coherences = list(map(coherence, topic_lists))