# Topic Modelling
Automatic topic modelling using Latent Dirichlet Allocation (LDA) with gensim.

Some related reading:
* <a href="http://www.cse.chalmers.se/~richajo/dit862/L13/LDA%20with%20gensim%20(small%20example).html">LDA with gensim</a>
* <a href="https://radimrehurek.com/gensim/corpora/textcorpus.html">TextCorpus</a>
* <a href="https://radimrehurek.com/gensim/models/ldamodel.html">LDAmodel</a>


## Imports

In [1]:
import gensim
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

## Load data

In [2]:
# Load the data set as a plain array: the last column holds the class label,
# everything before it is the text data.
data_raw = pd.read_csv("../data/wikipedia_300.csv").values

# Convert the class label strings (last column) to integers
y_raw = data_raw[:, -1]
encoder = LabelEncoder()
y = encoder.fit_transform(y_raw)

# Print every integer -> label encoding. `classes_` is ordered by encoded
# value, so this works for any number of classes instead of assuming exactly
# two of them.
print("Encodings:")
for code, name in enumerate(encoder.classes_):
    print("\t{}:".format(code), name)

# Get the text data (X) and flatten the input matrix to a vector
X = data_raw[:, 0:-1].ravel()
print("Examples: {}".format(X.shape[0]))

Encodings:
	0: Games
	1: Programming
Examples: 300


## Load and create the corpus

In [3]:
# Build the bag-of-words corpus (and the dictionary used by the model below)
# directly from the CSV file.
# NOTE(review): TextCorpus is given the raw file rather than the text column,
# so the header row and the label column presumably end up in the corpus as
# well -- confirm this is intended.
corpus = gensim.corpora.textcorpus.TextCorpus(input="../data/wikipedia_300.csv")

## Generate LDA model

In [4]:
# Train an LDA model with two topics, matching the two classes in the data set.
model = gensim.models.LdaModel(
    corpus,
    id2word=corpus.dictionary,   # id -> word mapping built by the corpus
    # Model size
    num_topics=2,
    # Training effort
    passes=6,                    # passes over the corpus
    iterations=100,              # iteration cap for per-document inference
    # Priors and output filtering
    alpha='auto',                # learn the document-topic prior from the data
    minimum_probability=0.01,    # drop topics below this probability
    # Fixed seed so the trained topics are reproducible
    random_state=0,
)

## Show top 10 words in all the topics

In [5]:
# List the ten most probable words of every topic, one "probability: word"
# line per word under a heading with the topic id.
for topic_id in range(model.num_topics):
    print("{}:".format(topic_id))
    for word, prob in model.show_topic(topic_id, topn=10):
        print("\t{0:.4f}:  {1}".format(prob, word))

0:
	0.0113:  software
	0.0073:  design
	0.0057:  development
	0.0050:  game
	0.0041:  programming
	0.0040:  code
	0.0039:  engineering
	0.0038:  systems
	0.0036:  process
	0.0034:  model
1:
	0.0197:  game
	0.0170:  games
	0.0102:  video
	0.0030:  series
	0.0029:  list
	0.0029:  use
	0.0027:  console
	0.0025:  million
	0.0024:  time
	0.0024:  nintendo


## Evaluate prediction accuracy

In [6]:
# LDA is unsupervised, so its topic ids have no built-in relation to the
# encoded class labels. Going by the top words printed for each topic, topic 0
# is the programming topic (label 1) and topic 1 is the games topic (label 0).
# The mapping below states that inverse correspondence explicitly instead of
# counting "topic id != label" as a hit.
# NOTE(review): re-check this mapping whenever the model settings or the seed
# change, since the topic order is not guaranteed to stay the same.
topic_to_label = {0: 1, 1: 0}

no_corr = 0  # number of correctly classified documents
tot = len(X)
for i in range(tot):
    # Bag-of-words vector of the whitespace-tokenised document
    doc_vector = model.id2word.doc2bow(X[i].split())
    # (topic id, probability) pairs for this document
    doc_topics = model[doc_vector]

    # Predicted topic = most probable one (first one wins on ties).
    # -1 means no topic was returned; such a document is never counted as
    # correct because -1 is not in the mapping.
    label = max(doc_topics, key=lambda dt: dt[1])[0] if len(doc_topics) > 0 else -1

    if label in topic_to_label and topic_to_label[label] == y[i]:
        no_corr += 1

acc = no_corr / tot
print("Accuracy: {0:.1f}%".format(acc * 100))

Accuracy: 82.3%
