# Topic Modeling: Latent Semantic Analysis/Indexing

### Loading Libraries

In [4]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualziation
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings

# Scikit-Learn
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Path & Random
from pathlib import Path
from random import randint

In [5]:
sns.set_style('white')

warnings.filterwarnings('ignore')

### Loading BBC data

In [6]:
DATA_DIR = Path('../data')

In [7]:
path = DATA_DIR / 'bbc'

files = sorted(list(path.glob('**/*.txt')))

doc_list = []

for i, file in enumerate(files):
    with open(str(file), encoding='latin1') as f:
        topic = file.parts[-2]
        lines = f.readlines()
        heading = lines[0].strip()
        body = ' '.join([l.strip() for l in lines[1:]])
        doc_list.append([topic.capitalize(), heading, body])

#### DataFrame Conversion

In [8]:
docs = pd.DataFrame(doc_list, columns=['Category', 'Heading', 'Article'])
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  0 non-null      object
 1   Heading   0 non-null      object
 2   Article   0 non-null      object
dtypes: object(3)
memory usage: 132.0+ bytes


### Creating Train & Test Sets

In [15]:
train_docs, test_docs = train_test_split(docs,
                                         stratify=docs.Category,
                                         test_size=50,
                                         random_state=42)

In [16]:
train_docs.shape, test_docs.shape

In [17]:
pd.Series(test_docs.Category).value_counts()

### Vectorizing Train & Test Sets

In [18]:
vectorizer = TfidfVectorizer(max_df=.25,
                             min_df=.01,
                             stop_words='english',
                             binary=False)

train_dtm = vectorizer.fit_transform(train_docs.Article)
train_dtm

In [19]:
test_dtm = vectorizer.transform(test_docs.Article)
test_dtm

#### Getting Token Count

In [20]:
train_token_count = train_dtm.sum(0).A.squeeze()

tokens = vectorizer.get_feature_names()

word_count = pd.Series(train_token_count,
                       index=tokens).sort_values(ascending=False)

word_count.head(10)

### Latent Semantic Analysis

In [21]:
n_components = 5

topic_labels = ['Topic {}'.format(i) for i in range(1, n_components + 1)]

In [22]:
svd = TruncatedSVD(n_components=n_components, n_iter=5, random_state=42)

svd.fit(train_dtm)

In [23]:
svd.singular_values_

In [24]:
svd.explained_variance_ratio_.sum()

### Exploring Topics

In [25]:
train_doc_topics = svd.transform(train_dtm)

In [26]:
train_doc_topics.shape

#### Topic Weights for Sample Article

In [27]:
i = randint(0, len(train_docs))

(train_docs.iloc[i, :2].append(
    pd.Series(train_doc_topics[i], index=topic_labels)))

#### Average topic weight per category

In [28]:
train_result = pd.DataFrame(data=train_doc_topics,
                            columns=topic_labels,
                            index=train_docs.Category)

train_result.groupby(level='Category').mean().plot.bar(figsize=(14, 5), rot=0);

#### Topics Weights of Most Frequent Words

In [29]:
topics = pd.DataFrame(svd.components_.T, index=tokens, columns=topic_labels)

topics.loc[word_count.head(10).index]

#### Most Important Words by Topic

In [30]:
fig, ax = plt.subplots(figsize=(12, 4))

top_words, top_vals = pd.DataFrame(), pd.DataFrame()

for topic, words_ in topics.items():
    top10 = words_.abs().nlargest(10).index
    vals = words_.loc[top10].values
    top_vals[topic] = vals
    top_words[topic] = top10.tolist()
sns.heatmap(pd.DataFrame(top_vals),
            annot=top_words,
            fmt='',
            center=0,
            cmap=sns.diverging_palette(0, 255, sep=1, n=256),
            ax=ax)
ax.set_title('Top Words per Topic')
sns.despine()
fig.tight_layout()
plt.show()

#### Topics Weights for Test Set

In [31]:
test_eval = pd.DataFrame(data=svd.transform(test_dtm),
                         columns=topic_labels,
                         index=test_docs.Category)

In [32]:
sns.set(font_scale=1.3)

result = pd.melt(train_result.assign(Data='Train').append(
    test_eval.assign(Data='Test')).reset_index(),
                 id_vars=['Data', 'Category'],
                 var_name='Topic',
                 value_name='Weight')

g = sns.catplot(x='Category',
                y='Weight',
                hue='Topic',
                row='Data',
                kind='bar',
                data=result,
                aspect=3.5);

plt.show()

### Categories in 2-D

In [33]:
pca = PCA(n_components=2)

svd2d = pd.DataFrame(pca.fit_transform(train_result),
                     columns=['PC1',
                              'PC2']).assign(Category=train_docs.Category)

categories_2d = svd2d.groupby('Category').mean()

In [34]:
plt.quiver(np.zeros(5),
           np.zeros(5),
           categories_2d.PC1.values,
           categories_2d.PC2.values,
           scale=.035)

plt.title('Topic Directions in 2D Space');