# Topic Modeling: Latent Dirichlet Allocation with sklearn

### Loading Libraries

In [9]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Warnings
import warnings


# Path & Collection
from pathlib import Path
from collections import OrderedDict

# PyLDAvis
import pyLDAvis
from pyLDAvis.sklearn import prepare
import pyLDAvis.sklearn_models as sklearn_lda

# Wordcloud
from termcolor import colored
from wordcloud import WordCloud

In [15]:
# IPython magic: draw matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [16]:
# Plain white seaborn theme for every figure in this notebook.
sns.set_style('white')

# Seed NumPy's global random number generator so reruns draw the same numbers.
np.random.seed(42)

# Hide all warnings to keep the cell output readable.
warnings.filterwarnings('ignore')

In [17]:
# Switch on pyLDAvis' notebook mode before any visualization is prepared.
# NOTE(review): presumably this makes prepared visualizations render inline — confirm against the pyLDAvis docs.
pyLDAvis.enable_notebook()

In [18]:
# Root folder of the raw datasets, one level above the working directory.
DATA_DIR = Path('..') / 'data'

# The BBC articles live in their own sub-folder.
data_path = DATA_DIR.joinpath('bbc')

In [19]:
# Root folder for everything this notebook writes to disk.
results_path = Path('results')

# Fitted BBC models get their own sub-folder; deriving it from results_path
# keeps the two locations in sync.
model_path = results_path / 'bbc'

# mkdir(parents=True, exist_ok=True) is already a no-op when the directory
# exists, so no separate exists() check is needed. It also fails fast if a
# regular file occupies the path instead of silently continuing.
model_path.mkdir(parents=True, exist_ok=True)

### Loading BBC data

In [20]:
# Every .txt file below data_path, in sorted order so the document order is
# reproducible between runs.
files = sorted(data_path.glob('**/*.txt'))

# One [topic, heading, body] entry per article.
doc_list = []

for file in files:
    # The name of the folder containing the file is used as the topic label.
    topic = file.parts[-2]
    with file.open(encoding='latin1') as f:
        lines = f.readlines()
    if not lines:
        # An empty file has no heading line; skip it rather than fail with
        # an IndexError on lines[0].
        continue
    # First line is the heading; the remaining lines are joined into the body.
    heading = lines[0].strip()
    body = ' '.join(line.strip() for line in lines[1:])
    doc_list.append([topic.capitalize(), heading, body])

#### DataFrame Conversion

In [21]:
# One row per article, with the three fields collected for each document.
docs = pd.DataFrame(data=doc_list,
                    columns=['topic', 'heading', 'article'])

# Print a summary of the frame as a quick sanity check.
docs.info()

### Creating Train & Test Sets

In [22]:
# Hold out 125 articles for testing, stratified on the topic label, with a
# fixed seed so the split is repeatable.
# NOTE(review): train_test_split is not imported in the visible import cell — confirm it is in scope.
train_docs, test_docs = train_test_split(
    docs,
    test_size=125,
    random_state=42,
    stratify=docs['topic'],
)

In [23]:
# Display the (rows, columns) shape of both splits.
train_docs.shape, test_docs.shape

In [24]:
# Number of test articles per topic label.
test_docs['topic'].value_counts()

### Vectorizing train & test sets

In [25]:
# TF-IDF weighted document-term matrix with English stop words removed;
# max_df / min_df bound the document frequency of the terms that are kept.
# NOTE(review): TfidfVectorizer is not imported in the visible import cell — confirm it is in scope.
vectorizer = TfidfVectorizer(max_df=.11,
                             min_df=.026,
                             stop_words='english')

# Learn the vocabulary on the training articles only.
train_dtm = vectorizer.fit_transform(train_docs.article)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

# Display the training matrix.
train_dtm

In [26]:
# Vectorize the test articles with the already-fitted vectorizer (transform only, no refit).
test_dtm = vectorizer.transform(test_docs.article)
# Display the test matrix.
test_dtm

### LDA with sklearn

In [27]:
# Number of latent topics to extract.
n_components = 5

# 1-based column labels: 'Topic 1' through 'Topic 5'.
topic_labels = ['Topic {}'.format(k + 1) for k in range(n_components)]

In [28]:
# Baseline model: batch learning, limited to 10 iterations.
# NOTE(review): LatentDirichletAllocation is not imported in the visible import cell — confirm it is in scope.
lda_base = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='batch',
    max_iter=10,
    n_jobs=-1,
)

# Fit on the training document-term matrix.
lda_base.fit(train_dtm)

#### Persisting Model

In [29]:
# Save the fitted baseline model into the model folder.
# NOTE(review): joblib is not imported in the visible import cell — confirm it is in scope.
joblib.dump(lda_base, model_path / 'lda_10_iter.pkl')

In [30]:
# Reload the baseline model from disk so later cells can run without refitting.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files this notebook wrote itself.
lda_base = joblib.load(model_path / 'lda_10_iter.pkl') 

# Display the loaded estimator.
lda_base

### Exploring Topics & Word Distributions

In [33]:
# components_ of the fitted model: one row per topic (normalized row-wise in a later cell).
topics_count = lda_base.components_
print(topics_count.shape)

# Display the first (up to) five rows.
topics_count[:5]

In [34]:
# Divide every topic row by its own total so that each topic sums to one.
topics_prob = topics_count / topics_count.sum(axis=1, keepdims=True)

# Transpose into a words x topics table.
topics = pd.DataFrame(data=topics_prob.T,
                      columns=topic_labels,
                      index=words)

topics.head()

In [35]:
# True when every word has a strictly positive value under every topic.
len(topics.loc[(topics > 0).all(axis=1)]) == len(topics)

In [36]:
# Tall figure: the heatmap has one row per vocabulary word.
fig, ax = plt.subplots(figsize=(10, 14))

# Rows are sorted in descending order by the topic columns, first label first.
sns.heatmap(data=topics.sort_values(by=topic_labels, ascending=False),
            ax=ax,
            cmap='Blues',
            cbar_kws={'shrink': .6})

fig.tight_layout()

In [37]:
# For every topic column, the ten words with the largest values, largest first.
top_words = {label: probs.nlargest(10).index.tolist()
             for label, probs in topics.items()}

# Show the lists side by side, one column per topic.
pd.DataFrame(top_words)

In [38]:
# One histogram panel per topic column; the panel count follows the table
# instead of a hard-coded 5.
fig, axes = plt.subplots(nrows=topics.shape[1], sharey=True, sharex=True, figsize=(10, 15))

for i, (topic, prob) in enumerate(topics.items()):
    # sns.distplot has been deprecated since seaborn 0.11 and is scheduled
    # for removal. With kde=False and norm_hist=False it only drew a
    # raw-count histogram, which matplotlib provides directly.
    # alpha and the x-label mirror distplot's defaults.
    axes[i].hist(prob, bins=100, alpha=.4)
    axes[i].set_xlabel(topic)
    # Log-scaled counts keep the sparse high-probability bins visible.
    axes[i].set_yscale('log')
    # Show x tick values as percentages with one decimal.
    axes[i].xaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:.1%}'.format(x)))
fig.suptitle('Topic Distributions')
sns.despine()
fig.tight_layout()
plt.show()

### Evaluating Fit on Train Set

In [40]:
# Transform the training matrix with the baseline model (one row of topic values per document).
train_preds = lda_base.transform(train_dtm)

# Display the shape of the result.
train_preds.shape

In [41]:
# Label the columns with the topic names and index each row by the article's true topic.
train_eval = pd.DataFrame(data=train_preds,
                          index=train_docs['topic'],
                          columns=topic_labels)

train_eval.head()

In [42]:
# Bar chart of the mean topic values per true topic (trailing ';' hides the return value).
train_eval.groupby(level='topic').mean().plot(kind='bar', title='Avg. Topic Probabilities');

In [43]:
# Per article, the topic label whose column holds the largest value.
# NOTE(review): the shape of groupby(...).idxmax(axis=1) and the index level dropped by
# reset_index(-1, drop=True) depend on the pandas version — confirm the result is still
# one label per article, indexed by true topic.
df = train_eval.groupby(level='topic').idxmax(
    axis=1).reset_index(-1, drop=True)
# Share of each true topic (rows) assigned to each topic label (columns).
sns.heatmap(df.groupby(level='topic').value_counts(normalize=True)
            .unstack(-1), annot=True, fmt='.1%', cmap='Blues', square=True)
plt.title('Train Data: Topic Assignments');
plt.show()

### Evaluating Fit on Test Set

In [44]:
# Transform the held-out matrix with the baseline model.
test_preds = lda_base.transform(test_dtm)

# Same layout as for the training data: topic-label columns, true topic as index.
test_eval = pd.DataFrame(data=test_preds,
                         index=test_docs['topic'],
                         columns=topic_labels)
test_eval.head()

In [45]:
# Mean topic values per true topic for the test set, as a wide bar chart
# with horizontal tick labels.
test_eval.groupby(level='topic').mean().plot(kind='bar',
                                             rot=0,
                                             figsize=(12, 4),
                                             title='Avg. Topic Probabilities')
# Drop the x-axis label.
plt.xlabel('')
sns.despine()
plt.tight_layout()
plt.show()

In [46]:
# Per test article, the topic label whose column holds the largest value.
# NOTE(review): groupby(...).idxmax(axis=1) followed by reset_index(-1, drop=True) is
# pandas-version dependent — confirm the result is one label per article, indexed by true topic.
df = test_eval.groupby(level='topic').idxmax(axis=1).reset_index(-1, drop=True)

# Share of each true topic (rows) assigned to each topic label (columns).
sns.heatmap(df.groupby(level='topic').value_counts(normalize=True).unstack(-1), 
            annot=True, fmt='.1%', cmap='Blues', square=True)
plt.title('Topic Assignments');
plt.show()

### Retraining until Perplexity No Longer Decreases

In [47]:
# Longer run: up to 500 batch iterations with a fixed seed.
# NOTE(review): presumably evaluate_every=5 evaluates perplexity every 5 iterations and
# verbose=1 prints progress — confirm against the scikit-learn docs.
lda_opt = LatentDirichletAllocation(
    n_components=5,
    learning_method='batch',
    max_iter=500,
    evaluate_every=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Fit on the training document-term matrix.
lda_opt.fit(train_dtm)

In [48]:
# Save the longer-trained model into the model folder.
joblib.dump(lda_opt, model_path / 'lda_opt.pkl')

In [49]:
# Reload the longer-trained model from disk so later cells can run without refitting.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files this notebook wrote itself.
lda_opt = joblib.load(model_path / 'lda_opt.pkl')

In [50]:
# Topic values of the training articles under lda_opt, indexed by true topic.
train_opt_eval = pd.DataFrame(
    data=lda_opt.transform(train_dtm),
    index=train_docs['topic'],
    columns=topic_labels,
)

In [51]:
# Topic values of the test articles under lda_opt, indexed by true topic.
test_opt_eval = pd.DataFrame(
    data=lda_opt.transform(test_dtm),
    index=test_docs['topic'],
    columns=topic_labels,
)

### Compare Train & Test Topic Assignments

In [52]:
# Two heatmaps side by side: training articles first, test articles second.
fig, axes = plt.subplots(ncols=2, figsize=(18, 8))

# Panel title prefixes, in the same order as the frames iterated below.
source = ['Train', 'Test']

for i, df in enumerate([train_opt_eval, test_opt_eval]):
    # Per article, the topic label whose column holds the largest value.
    # NOTE(review): groupby(...).idxmax(axis=1) followed by reset_index(-1, drop=True) is
    # pandas-version dependent — confirm the result is one label per article, indexed by true topic.
    df = df.groupby(level='topic').idxmax(axis=1).reset_index(-1, drop=True)
    # Share of each true topic (rows) assigned to each topic label (columns).
    sns.heatmap(df.groupby(level='topic').value_counts(normalize=True)
                .unstack(-1), annot=True, fmt='.1%', cmap='Blues', square=True, ax=axes[i])
    axes[i].set_title('{} Data: Topic Assignments'.format(source[i]))

### Exploring Misclassified Articles

In [53]:
# Add a 'predicted' column holding, per test article, the topic label with
# the largest value. .values drops the index so rows are matched by position.
test_assignments = test_docs.assign(
    predicted=test_opt_eval.idxmax(axis='columns').values,
)
test_assignments.head()

In [54]:
# Entertainment articles whose predicted label is 'Topic 4'.
misclassified = test_assignments.loc[
    test_assignments['topic'].eq('Entertainment')
    & test_assignments['predicted'].eq('Topic 4')
]

# Show their headings.
misclassified['heading']

In [55]:
# Display the full text of the selected articles as a plain list.
misclassified.article.tolist()

### PyLDAVis

#### Refitting using All Data

In [None]:
# Raw term counts over the full corpus, capped at 2000 features, with English
# stop words removed. This rebinds `vectorizer`, replacing the TF-IDF one.
# NOTE(review): CountVectorizer is not imported in the visible import cell — confirm it is in scope.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=2000,
    max_df=.5,
    min_df=5,
)

# Fit and transform all articles (train and test together).
dtm = vectorizer.fit_transform(docs['article'])

In [None]:
# Five-topic model on the full corpus: batch learning, up to 500 iterations,
# fixed seed.
lda_all = LatentDirichletAllocation(
    n_components=5,
    learning_method='batch',
    max_iter=500,
    evaluate_every=10,
    verbose=1,
    random_state=42,
)

# Fit on the count matrix of all articles.
lda_all.fit(dtm)

In [None]:
# Save the full-corpus model into the model folder.
joblib.dump(lda_all, model_path /'lda_all.pkl')

In [None]:
# Reload the full-corpus model from disk so later cells can run without refitting.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files this notebook wrote itself.
lda_all = joblib.load(model_path / 'lda_all.pkl')

#### Lambda

In [56]:
# Build the pyLDAvis visualization from the fitted model, its count matrix and the vectorizer.
# NOTE(review): `prepare` comes from pyLDAvis.sklearn, which presumably is not available in
# every pyLDAvis release — confirm against the installed version.
prepare(lda_all, dtm, vectorizer)

### Topics as WordClouds

In [57]:
# Divide every topic row of the full-corpus model by its own total so that
# each topic sums to one.
topics_prob = lda_all.components_ / lda_all.components_.sum(axis=1).reshape(-1, 1)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()

# Words x topics table for the full-corpus model (rebinds `topics`).
topics = pd.DataFrame(topics_prob.T,
                      index=vocabulary,
                      columns=topic_labels)

In [58]:
# A single WordCloud instance is regenerated for every topic.
w = WordCloud()

# Five stacked panels, one per topic.
fig, axes = plt.subplots(nrows=5, figsize=(15, 30))
axes = axes.flatten()

for idx, (label, weights) in enumerate(topics.items()):
    panel = axes[idx]
    # Word sizes follow the topic's word values.
    w.generate_from_frequencies(weights.to_dict())
    panel.imshow(w, interpolation='bilinear')
    panel.set_title(label, fontsize=18)
    panel.axis('off')

### Visualizing `topic-word` Associations per Document

In [60]:
# Topic values of every article under the full-corpus model, indexed by true topic.
dtm_ = pd.DataFrame(
    data=lda_all.transform(dtm),
    index=docs['topic'],
    columns=topic_labels,
)

In [61]:
# Preview the first rows of the per-article topic table.
dtm_.head()

In [62]:
# Foreground / background color keyword arguments per topic label, kept in
# topic order.
color_dict = OrderedDict([
    ('Topic 1', {'color': 'white', 'on_color': 'on_blue'}),
    ('Topic 2', {'color': 'white', 'on_color': 'on_green'}),
    ('Topic 3', {'color': 'white', 'on_color': 'on_red'}),
    ('Topic 4', {'color': 'white', 'on_color': 'on_magenta'}),
    ('Topic 5', {'color': 'blue', 'on_color': 'on_yellow'}),
])

In [63]:
# Attach text and heading by position (.values drops the index). The column
# order matters: 'article' is appended before 'heading'.
dtm_['article'] = docs['article'].values
dtm_['heading'] = docs['heading'].values

# Keep only articles in which every topic exceeds 0.05.
sample = dtm_.loc[(dtm_[topic_labels] > .05).all(axis='columns')]
sample

In [64]:
# Color every word of the first sampled article by the topic under which it
# has the largest value. The row's fields are read by column name rather than
# by position, so the lookup survives changes in the column layout.
first_doc = sample.iloc[0]
colored_text = []

for word in first_doc['article'].split():
    try:
        topic = topics.loc[word.strip().lower()].idxmax()
    except KeyError:
        # The word is not in the vocabulary: leave it uncolored. Only the
        # failed lookup is caught, so genuine errors are no longer hidden.
        colored_text.append(word)
    else:
        colored_text.append(colored(word, **color_dict[topic]))

# Legend: every topic label printed in its own color.
print(' '.join([colored(k, **v) for k, v in color_dict.items()]))
print('\n',first_doc['heading'], '\n')
text = ' '.join(colored_text)
print(text)