[View in Colaboratory](https://colab.research.google.com/github/librairy/notebooks/blob/master/Intro_TopicModels.ipynb)

# Introduction to Topic Models

This Google Colab Notebook serves as an introduction to Probabilistic Topic Models. 

Textual data is loaded from a Google Sheet, and topics are then derived from it with Latent Dirichlet Allocation (LDA).

First, authenticate with your Google account and indicate the Google Sheet that holds the training texts. The number of words shown per topic is set later, in the LDA cell (`no_top_words`).


In [0]:
#@title Google Colab Authentication
# Install (or upgrade) gspread, the client used below to read Google Sheets.
!pip install --upgrade -q gspread
#!pip install -q gensim

# Authenticate the current Colab user so the runtime can act on their behalf.
from google.colab import auth
auth.authenticate_user()

# gspread opens the sheet; GoogleCredentials supplies the application-default
# credentials that are passed to gspread.authorize() in the data-loading cell.
import gspread
from oauth2client.client import GoogleCredentials

# scikit-learn vectorizers and decomposition models, plus NumPy for ranking
# terms and documents by weight.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np


# Load Input Data


In [0]:
#@title Load and preview data from a Google Sheet

corpus = 'texts' #@param {type:"string"}
preview = 10 #@param {type:"integer"}

# 0-based index of the sheet column that holds the document text (3rd column).
TEXT_COLUMN = 2

# NOTE(review): recent gspread releases appear to expect google-auth
# credentials rather than oauth2client ones -- confirm this call still works
# with the gspread version installed by the setup cell.
gc = gspread.authorize(GoogleCredentials.get_application_default())

worksheet = gc.open(corpus).sheet1

# get_all_values gives a list of rows; the first row is the header.
rows = worksheet.get_all_values()
if not rows:
    raise ValueError(
        "Google Sheet '{}' is empty: expected a header row followed by "
        "one row per document.".format(corpus))

header, records = rows[0], rows[1:]
if len(header) <= TEXT_COLUMN:
    raise ValueError(
        "Google Sheet '{}' has {} column(s); the document text is read from "
        "column {}.".format(corpus, len(header), TEXT_COLUMN + 1))

# One document per data row. The list stays aligned with rows[1:], so
# document k always corresponds to rows[k + 1] in the cells below.
documents = [row[TEXT_COLUMN] for row in records]

# Convert to a DataFrame and render. The header row labels the columns instead
# of being displayed as if it were the first document.
import pandas as pd
dataset_df = pd.DataFrame.from_records(records, columns=header)
dataset_df.head(n=preview)


# PreProcess Texts

Create a bag-of-words representation from the tokens of each document:


In [0]:
#@title Tokenization

# Bag-of-words vectorizer over single words (unigrams). Case is preserved
# (lowercase=False), no stop-word list is applied, and terms that occur in more
# than 95% of the documents are dropped (max_df=0.95).
tf_vectorizer = CountVectorizer(
    stop_words=None,
    min_df=1,
    max_df=0.95,
    lowercase=False,
    max_features=None,
    ngram_range=(1,1),
    analyzer = 'word'
)

# Document-term count matrix: one row per document, one column per term.
tf = tf_vectorizer.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old name so the
# cell also runs on older installations. The result is kept as a plain list so
# downstream indexing and len() behave as before.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

# Mapping term -> column index in `tf`.
vocab = tf_vectorizer.vocabulary_

print("Vocabulary Size: ", len(tf_feature_names))

# Build a Topic Model

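Now it's time to build a topic model by setting values for:

- **number of topics**: how many topics LDA should learn
- **alpha**: the prior on the per-document topic distribution (`doc_topic_prior`)
- **beta**: the prior on the per-topic word distribution (`topic_word_prior`)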

In [0]:
#@title Run LDA

topics = 2 #@param {type:"integer"}

alpha = 1.0 #@param {type:"number"}

beta = 1.0 #@param {type:"number"}

no_top_words = 10

no_top_documents = 3


# Configure the model, then fit it on the document-term counts.
lda_model = LatentDirichletAllocation(
    n_components=topics,
    doc_topic_prior=alpha,
    topic_word_prior=beta,
    max_iter=100,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda_model.fit(tf)

# lda_W: one row per document, one column per topic.
# lda_H: one row per topic, one column per vocabulary term.
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_

separator = "-" * 30
print("LDA Topics")
for topic_no, term_weights in enumerate(lda_H):
    print(separator)
    print(" Topic ", topic_no, " :")

    # Highest-weighted terms first.
    ranked_terms = np.argsort(term_weights)[::-1][:no_top_words]
    top_terms = [tf_feature_names[term_id] for term_id in ranked_terms]
    print("[", " | ".join(top_terms), "]")

    # Documents in which this topic has the largest share.
    ranked_docs = np.argsort(lda_W[:, topic_no])[::-1]
    for doc_no in ranked_docs[:no_top_documents]:
        # Document k was read from rows[k + 1]; rows[0] was skipped on load.
        source_row = rows[doc_no + 1]
        print("[", doc_no, "] (", source_row[0], ") '", source_row[1], "'")
        print("\t", lda_W[doc_no])

# Sorted List of Terms by Frequency

In [0]:
# Number of most frequent terms to list.
top_n_terms = 20

# Corpus-wide count of every term. Summing the sparse matrix directly avoids
# building a dense documents-by-terms copy just to add up its columns.
term_counts = np.asarray(tf.sum(axis=0)).ravel()

# Look the vocabulary up once instead of on every loop iteration.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    term_names = tf_vectorizer.get_feature_names_out()
else:
    term_names = tf_vectorizer.get_feature_names()

# Stable sort on the negated counts: most frequent first, ties kept in
# vocabulary order.
most_frequent = np.argsort(-term_counts, kind="stable")[:top_n_terms]
for term_id in most_frequent:
    print(term_names[term_id], term_counts[term_id])

# Get Topic Distributions for each training document


In [0]:
# Print, for every training document, its position, the value in the first
# sheet column, and its topic distribution. Document k was read from
# rows[k + 1] because rows[0] was skipped when the documents were collected.
for doc_no, topic_mix in enumerate(lda_W):
    doc_id = rows[doc_no + 1][0]
    print("(", doc_no, ")", doc_id, ":", topic_mix)

# Infer Topic Distribution for a new Document

In [0]:
text = "this is an example" #@param {type:"string"}

# Vectorize the new text with the already-fitted vectorizer, then project the
# resulting counts onto the fitted topics.
text_tf = tf_vectorizer.transform([text])
text_topics = lda_model.transform(text_tf)
print("Topic Distribution: ", text_topics)
