# Topic Modeling: Latent Dirichlet Allocation with `gensim`

### Loading Libraries

In [5]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Warnings
import warnings


# Path & Collection
from pathlib import Path
from collections import OrderedDict

# PyLDAvis
import pyLDAvis
from pyLDAvis.sklearn import prepare
import pyLDAvis.sklearn_models as sklearn_lda

# Wordcloud
from termcolor import colored
from wordcloud import WordCloud

In [6]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [7]:
# Plain white seaborn theme for every figure in this notebook.
sns.set_style('white')

# Seed NumPy's global random state so repeated runs are comparable.
np.random.seed(42)

# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

In [8]:
# Switch pyLDAvis to notebook mode so its visualisations display in cell output.
pyLDAvis.enable_notebook()

### Loading BBC Data

In [9]:
# Data directory, resolved relative to the notebook's working directory.
DATA_DIR = Path('../data')

In [10]:
# Root of the BBC corpus. The name of the directory that directly contains a
# file is used as that article's topic label.
path = DATA_DIR / 'bbc'

# Sort the matches: glob order depends on the file system, and the resulting
# row order feeds the seeded train/test split, so an unsorted listing can
# produce a different split on a different machine.
files = sorted(path.glob('**/*.txt'))

doc_list = []

for file in files:
    # latin1 maps every byte to a character, so decoding cannot fail.
    with file.open(encoding='latin1') as f:
        lines = f.readlines()

    if not lines:
        # An empty file has no heading line; skip it instead of raising
        # IndexError on lines[0].
        continue

    topic = file.parts[-2]
    heading = lines[0].strip()
    # Everything after the first line is the article body, joined on spaces.
    body = ' '.join(line.strip() for line in lines[1:])
    doc_list.append([topic.capitalize(), heading, body])

#### DataFrame Conversion

In [12]:
# Tabulate the parsed articles: one row per document, three text columns.
column_names = ['topic', 'heading', 'article']
docs = pd.DataFrame(data=doc_list, columns=column_names)

# Print the frame summary as a quick sanity check.
docs.info()

### Creating Train & Test Sets

In [13]:
# Hold out 50 articles for evaluation. Stratifying on the topic column keeps
# the category mix of the hold-out set close to that of the whole corpus, and
# the fixed random_state makes the split repeatable for a given row order.
train_docs, test_docs = train_test_split(
    docs,
    test_size=50,
    stratify=docs['topic'],
    random_state=42,
)

In [14]:
# Display (rows, columns) of both splits to confirm the hold-out size.
train_docs.shape, test_docs.shape

In [15]:
# Number of held-out articles per category.
test_docs['topic'].value_counts()

### Vectorizing Train & Test Sets

In [16]:
# Bag-of-words vectorizer: ignore terms found in more than 20% of the
# documents or in fewer than 3 of them, drop English stop words and cap the
# vocabulary at 2000 terms.
vectorizer = CountVectorizer(max_df=.2,
                             min_df=3,
                             stop_words='english',
                             max_features=2000)

# Learn the vocabulary from the training articles only.
train_dtm = vectorizer.fit_transform(train_docs.article)

# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old method on releases that predate it. Either way
# `words` ends up as a plain list of terms.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()
train_dtm

In [17]:
# Encode the held-out articles with the vocabulary learned on the training set
# (transform only, no re-fitting).
test_dtm = vectorizer.transform(test_docs['article'])

test_dtm

### LDA with `gensim`

#### Using `CountVectorizer` Input

In [18]:
# Vocabulary filters reused by the scikit-learn vectorizer and by the gensim
# dictionary in the cells that follow.
max_df = .2           # upper document-frequency bound (fraction of documents)

min_df = 3            # lower document-frequency bound (number of documents)

max_features = 2000   # maximum vocabulary size

# Download the stop-word list (presumably one word per line, no header row).
# The `squeeze=True` argument of `read_csv` was removed in pandas 2.0, so the
# single column is collapsed into a Series with `DataFrame.squeeze` instead,
# which behaves the same on old and new pandas releases.
stop_words = (pd.read_csv('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',
                          header=None)
              .squeeze('columns')
              .tolist())

In [19]:
# Rebuild the document-term matrices, this time driven by the shared
# max_df / min_df / max_features settings instead of inline literals.
vectorizer = CountVectorizer(stop_words='english',
                             max_features=max_features,
                             min_df=min_df,
                             max_df=max_df)

# Fit on the training articles, then reuse that vocabulary for the test set.
train_dtm = vectorizer.fit_transform(train_docs['article'])
test_dtm = vectorizer.transform(test_docs['article'])

### Sklearn DTM to gensim Data Structures Conversion

In [20]:
# Wrap the document-term matrices as gensim corpora. Rows are documents here,
# hence documents_columns=False.
train_corpus = Sparse2Corpus(train_dtm, documents_columns=False)
test_corpus = Sparse2Corpus(test_dtm, documents_columns=False)

# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old method on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Map column index -> term so gensim can report words instead of integer ids.
id2word = pd.Series(feature_names).to_dict()

### Training Model & Review Results

In [21]:
# Explicit LdaModel call that spells out every keyword argument.
# NOTE(review): the values listed appear to be gensim's documented defaults;
# confirm against the installed gensim version.
# The fitted model is not bound to a name, so it is discarded after the cell
# runs; the trailing semicolon suppresses the notebook's echo of the result.
# id2word is None here, so this model has no id -> term mapping.
LdaModel(corpus=train_corpus, 
         num_topics=100, 
         id2word=None, 
         distributed=False, 
         chunksize=2000,                   
         passes=1,                         
         update_every=1,                   
         alpha='symmetric', 
         eta=None,                         
         decay=0.5,                        
         offset=1.0,                       
         eval_every=10,                    
         iterations=50,                    
         gamma_threshold=0.001,            
         minimum_probability=0.01,         
         random_state=None, 
         ns_conf=None, 
         minimum_phi_value=0.01,           
         per_word_topics=False,            
         callbacks=None);

In [None]:
# Number of latent topics to fit.
num_topics = 5

# Human-readable, 1-based labels: 'Topic 1' ... 'Topic <num_topics>'.
topic_labels = ['Topic {}'.format(index + 1) for index in range(num_topics)]

In [None]:
# Fit LDA on the CountVectorizer-derived corpus. Only the corpus, the
# id -> term mapping and the topic count are set; every other option is left
# at the library default.
lda_gensim = LdaModel(
    corpus=train_corpus,
    id2word=id2word,
    num_topics=num_topics,
)

In [22]:
# Collect the model's textual topic descriptions.
topics = lda_gensim.print_topics()

# Display the first entry as a sample.
topics[0]

### Evaluating Topic Coherence

In [23]:
# Score the fitted topics on the training corpus with the 'u_mass' coherence
# measure. Each entry is indexed below as [0] (the topic's terms) and [1] (its
# coherence score).
coherence = lda_gensim.top_topics(corpus=train_corpus, coherence='u_mass')

In [24]:
# Coherence score per topic, in the order the topics appear in `coherence`.
topic_coherence = []

# One two-column frame (prob, term) per topic; joined side by side afterwards.
topic_frames = []

# Each `coherence` entry is a pair: (list of (probability, term), score).
# Labels are handed out in the order of `coherence`, which is not necessarily
# the model's internal topic numbering.
for label, (term_weights, score) in zip(topic_labels, coherence):
    topic_coherence.append(score)
    df = pd.DataFrame(term_weights, columns=[(label, 'prob'), (label, 'term')])
    # Render probabilities as percentages for display / export.
    df[(label, 'prob')] = df[(label, 'prob')].apply('{:.2%}'.format)
    topic_frames.append(df)

# Concatenate once instead of growing a frame inside the loop: this avoids
# re-copying the accumulated columns on every iteration and avoids
# concatenating with an empty placeholder frame.
topic_words = pd.concat(topic_frames, axis=1)

topic_words.columns = pd.MultiIndex.from_tuples(topic_words.columns)
# Keep wide frames on one line when printed.
pd.set_option('expand_frame_repr', False)
# Persist and show the top rows of the table.
topic_words.head().to_csv('topic_words.csv', index=False)
print(topic_words.head())

# Bar chart of the coherence score per topic label.
pd.Series(topic_coherence, index=topic_labels).plot.bar(figsize=(12,4));

### Using `gensim` Dictionary

In [26]:
# Membership tests against a list are linear in its length; build a set once
# so each per-token lookup is constant time.
stop_word_set = set(stop_words)

# Whitespace-tokenise every training article and drop stop words.
# NOTE(review): tokens are compared as-is (no lower-casing or punctuation
# stripping), so only exact matches against the stop-word list are removed.
# Confirm that this is intended.
# NOTE: this rebinds `docs`, which previously held the articles DataFrame.
# Re-running the earlier train/test split cell after this one needs the
# DataFrame cell to be re-run first.
docs = [[t for t in article.split() if t not in stop_word_set]
        for article in train_docs.article.tolist()]

In [27]:
# Build the gensim token <-> id mapping from the tokenised training documents.
dictionary = Dictionary(docs)

# Apply the same vocabulary filters as the scikit-learn vectorizer: min_df as
# the lower bound, max_df as the upper bound and max_features as the size cap.
dictionary.filter_extremes(no_below=min_df, no_above=max_df, keep_n=max_features)

In [28]:
# Convert every tokenised training document to the dictionary's bag-of-words
# representation.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [29]:
# Report the vocabulary size and the number of training documents.
for message, size in (('Number of unique tokens: %d', len(dictionary)),
                      ('Number of documents: %d', len(corpus))):
    print(message % size)

In [30]:
# Training settings for the dictionary-based LDA model fitted in the next cell.
num_topics = 5
chunksize = 500
passes = 20
iterations = 400
# NOTE(review): None presumably disables gensim's periodic evaluation during
# training; confirm against the LdaModel documentation.
eval_every = None 

# The lookup result itself is never used. NOTE(review): this access presumably
# forces gensim to populate `dictionary.id2token`, which appears to be built
# lazily; confirm before removing this line.
temp = dictionary[0]  
id2word = dictionary.id2token

In [31]:
# Fit LDA on the dictionary-based corpus using the settings defined above.
# Both priors (alpha and eta) are set to 'auto' rather than fixed values.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    alpha='auto',
    eta='auto',
    chunksize=chunksize,
    passes=passes,
    iterations=iterations,
    eval_every=eval_every,
)

In [32]:
# Display the fitted topics of the dictionary-based model.
model.show_topics()

### Evaluating Topic Assignments on the Test Set

In [33]:
# Membership tests against a list are linear in its length; build a set once
# so each per-token lookup is constant time.
stop_word_set = set(stop_words)

# Tokenise the held-out articles exactly like the training articles:
# whitespace split, then stop-word removal.
docs_test = [[t for t in article.split() if t not in stop_word_set]
             for article in test_docs.article.tolist()]

# NOTE(review): `test_dictionary` is built and filtered but is not used by any
# cell visible here. The test corpus below is encoded with the *training*
# dictionary, which is what keeps token ids aligned with the fitted model.
# Confirm nothing else needs `test_dictionary` before deleting it.
test_dictionary = Dictionary(docs_test)
test_dictionary.filter_extremes(no_below=min_df, no_above=max_df, keep_n=max_features)

# Encode the test documents with the training dictionary.
test_corpus = [dictionary.doc2bow(doc) for doc in docs_test]

In [34]:
# Run inference on the held-out corpus. Only the first element of the returned
# pair is kept; the second is discarded.
gamma, _ = model.inference(test_corpus)
# Presumably one row per test document and one column per topic: the later
# cells normalise across columns and take the per-row argmax.
topic_scores = pd.DataFrame(gamma)

# Preview the raw (unnormalised) scores.
topic_scores.head(10)

In [35]:
# Divide every row by its own total so that each row sums to 1.
row_totals = topic_scores.sum(axis=1)
topic_probabilities = topic_scores.div(row_totals, axis=0)
topic_probabilities.head()

In [37]:
# Column label of the largest value in each row, i.e. the most probable topic.
topic_probabilities.idxmax(axis=1).head()

In [38]:
# Pair each test article's true category with its most probable topic id.
# `.values` drops the index so the assignment is positional.
predicted_topic = topic_probabilities.idxmax(axis=1).values
predictions = test_docs.topic.to_frame('topic').assign(predicted=predicted_topic)

# Cross-tabulate: rows are true categories, columns are predicted topic ids,
# cells are article counts. Combinations that never occur show up as NaN.
heatmap_data = predictions.groupby('topic').predicted.value_counts().unstack()

sns.heatmap(heatmap_data, annot=True, cmap='Blues')
# The original `plt.show` lacked parentheses, so it only referenced the
# function and never called it.
plt.show()