# Topic Modeling: probabilistic LSA / Non-negative Matrix Factorization

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings

# Scikit-Learn
from sklearn.decomposition import NMF, TruncatedSVD, PCA
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Path & Random
from pathlib import Path
from random import randint

In [2]:
# Silence every warning so library notices do not clutter the cell output
warnings.filterwarnings(action='ignore')

# Plain white seaborn background for all figures
sns.set_style(style='white')

# Seed NumPy's global random generator
np.random.seed(seed=42)

### Loading BBC data

In [3]:
# Data folder, resolved relative to the current working directory
DATA_DIR = Path('..') / 'data'

In [4]:
path = DATA_DIR / 'bbc'

# Sorted so that the document order is deterministic
files = sorted(path.glob('**/*.txt'))

# One [category, heading, body] row per article file
doc_list = []

for file in files:
    # The name of the file's parent directory is used as the category label
    topic = file.parts[-2]
    with file.open(encoding='latin1') as f:
        lines = f.readlines()
    if not lines:
        # An empty file has no heading line; skip it instead of failing
        # with an IndexError on lines[0]
        continue
    # First line is the heading; the remaining lines are joined into the body
    heading = lines[0].strip()
    body = ' '.join(line.strip() for line in lines[1:])
    doc_list.append([topic.capitalize(), heading, body])

#### DataFrame Conversion

In [5]:
# Assemble the collected [category, heading, body] rows into a DataFrame
docs = pd.DataFrame(data=doc_list,
                    columns=['Category', 'Heading', 'Article'])

# Print a summary of the frame (columns, dtypes, non-null counts)
docs.info()

### Creating Train & Test Sets

In [6]:
# Hold out 50 documents for testing, stratified on the category column
train_docs, test_docs = train_test_split(
    docs,
    test_size=50,
    stratify=docs['Category'],
    random_state=42,
)

In [7]:
# Display the (rows, columns) shape of the train and test splits
train_docs.shape, test_docs.shape

In [8]:
# Number of held-out documents per category
test_docs['Category'].value_counts()

### Vectorizing Train & Test Sets

In [9]:
# Keep terms that occur in at least 1% and at most 20% of the training
# documents, and drop English stop words
vectorizer = TfidfVectorizer(max_df=.2,
                             min_df=.01,
                             stop_words='english')

# Fit the vocabulary on the training articles and build their
# document-term matrix in one step
train_dtm = vectorizer.fit_transform(train_docs.Article)

# get_feature_names() was removed in scikit-learn 1.2; the *_out variant
# (available since 1.0) returns the vocabulary as a NumPy array
words = vectorizer.get_feature_names_out()

train_dtm

In [10]:
# Encode the held-out articles with the already-fitted vectorizer
# (transform only, no refitting)
test_dtm = vectorizer.transform(test_docs['Article'])

test_dtm

### Getting Token Counts

In [11]:
# Column sums of the training document-term matrix. The matrix comes from a
# TfidfVectorizer, so these are summed TF-IDF weights per token rather than
# raw occurrence counts. np.asarray(...).ravel() always yields a 1-D array
# and does not depend on the np.matrix-only `.A` attribute.
train_token_count = np.asarray(train_dtm.sum(axis=0)).ravel()

# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant
tokens = vectorizer.get_feature_names_out()

# Tokens ranked from the largest to the smallest summed weight
word_count = pd.Series(train_token_count, index=tokens).sort_values(ascending=False)
word_count.head(10)

### Probabilistic Latent Semantic Analysis

#### Implementation using Non-Negative Matrix Factorization

In [12]:
# Number of latent topics to extract
n_components = 5

# Display names 'Topic 1' ... 'Topic <n_components>'
topic_labels = list(map('Topic {}'.format, range(1, n_components + 1)))

In [13]:
# NMF with the multiplicative-update solver and a Kullback-Leibler loss,
# limited to 1000 iterations and seeded via random_state
nmf = NMF(n_components=n_components,
          solver='mu',
          beta_loss='kullback-leibler',
          max_iter=1000,
          random_state=42)

# Fit on the training document-term matrix
nmf.fit(train_dtm)

In [14]:
# Display the reconstruction error stored on the fitted model
nmf.reconstruction_err_

### Exploring Topics

In [15]:
# Transform the training document-term matrix with the fitted NMF model
train_doc_topics = nmf.transform(train_dtm)

# Display the dimensions of the transformed array
train_doc_topics.shape

In [16]:
# random.randint includes BOTH endpoints, so the upper bound has to be the
# last valid row position; len(train_docs) itself would be out of bounds
i = randint(0, len(train_docs) - 1)

# First two columns (category and heading) of the sampled article followed
# by its topic weights. Series.append was removed in pandas 2.0, so the two
# Series are joined with pd.concat instead.
pd.concat([train_docs.iloc[i, :2],
           pd.Series(train_doc_topics[i], index=topic_labels)])

In [17]:
# Topic weights of the training documents, indexed by article category
train_result = pd.DataFrame(train_doc_topics,
                            index=train_docs['Category'],
                            columns=topic_labels)

In [18]:
# Topic weights of the held-out documents, indexed by article category
test_eval = pd.DataFrame(nmf.transform(test_dtm),
                         index=test_docs['Category'],
                         columns=topic_labels)

In [19]:
# Stack the train and test topic weights, tagging each row with its split.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
combined = pd.concat([train_result.assign(Data='Train'),
                      test_eval.assign(Data='Test')])

# Long format: one row per (split, category, topic) weight, as expected by
# seaborn's categorical plots
result = pd.melt(combined.reset_index(),
                 id_vars=['Data', 'Category'],
                 var_name='Topic',
                 value_name='Weight')

# One row of bars per split: topic weights grouped by article category
g = sns.catplot(x='Category',
                y='Weight',
                hue='Topic',
                row='Data',
                kind='bar',
                data=result,
                height=3,
                aspect=4)

plt.show()

#### Most Important Words by Topic

In [20]:
# Token-by-topic weight table: rows are tokens, columns are topics
topics = pd.DataFrame(data=nmf.components_.T,
                      columns=topic_labels,
                      index=tokens)

# Topic weights of the first ten tokens in word_count
topics.loc[word_count.index[:10]]

In [21]:
fig, ax = plt.subplots(figsize=(12, 4))

# For every topic, its ten highest-weighted tokens in descending order
ranked = {label: weights.nlargest(10) for label, weights in topics.items()}

# Weights colour the cells; the matching tokens are written inside them
top_vals = pd.DataFrame({label: top.values for label, top in ranked.items()})
top_words = pd.DataFrame({label: top.index.tolist() for label, top in ranked.items()})

sns.heatmap(top_vals,
            annot=top_words,
            fmt='',
            center=0,
            cmap=sns.diverging_palette(0, 255, sep=1, n=256),
            ax=ax);

ax.set_title('Top Words per Topic')
fig.tight_layout();
plt.show()

In [22]:
# Rebuild the token-by-topic weight table, indexed by the vectorizer vocabulary
topics = pd.DataFrame(data=nmf.components_.T,
                      columns=topic_labels,
                      index=words)

topics.head()

In [23]:
# Ten highest-weighted tokens of each topic, keyed by topic label
top_words = {label: weights.nlargest(10).index.tolist()
             for label, weights in topics.items()}

# One column per topic, rows ordered from highest to lowest weight
pd.DataFrame(top_words)