# Topic Modelling with LDA

#### 


#### For illustration only

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Toy corpus: three tiny documents, just enough to walk through the API.
documents = ["cat eat rice", "secret message", "today go shopping"]

# Bag-of-words term counts are the input to LDA.
tf_vectorizer = CountVectorizer()
tf_vectorized_documents = tf_vectorizer.fit_transform(documents)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 10
# Fixed seed so a fresh "Restart & Run All" reproduces the same fit.
lda_model = LatentDirichletAllocation(n_components=no_topics, random_state=0)
lda_output = lda_model.fit_transform(tf_vectorized_documents)  # one row per document, one column per topic
lda_components = lda_model.components_  # one row per topic, one column per vocabulary term


###  Import the necessary libraries
- LDA works only with the bag-of-words approach
- Regular expressions re, gensim and spacy are used to process texts. 
- pyLDAvis and matplotlib for visualization; numpy for numerical operations
- Pandas for manipulating and viewing data in tabular format.



In [2]:
# Sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# 
import numpy as np
import pandas as pd
import re, nltk


# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

  from collections import Mapping


#### display_topics() is a commonly used function to display topics and related terms
- model - the lda model
- feature_names - the features names
- no_top_words - how many terms to display

** Do not change this function ** 

In [3]:

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; it is iterated row by row and
            each row must support ``argsort()``.
        feature_names: Sequence indexed by column position of ``components_``.
        no_top_words: Number of terms to print for each topic.

    Returns:
        None. Two lines are printed per topic: a ``Topic <n>:`` header and the
        space-separated terms, heaviest first.
    """
    for topic_number, term_weights in enumerate(model.components_):
        print("Topic %d:" % topic_number)
        # argsort() is ascending, so reverse it and keep the leading entries
        # to obtain the indices of the heaviest terms, heaviest first.
        top_indices = term_weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[index] for index in top_indices]
        print(" ".join(top_terms))

#### Function for Data Cleaning
- clean_documents() is a function to perform data cleansing on the raw documents
- This is a stub function for now. When you prepare your own set of data, you will write your own pre-processing logic

In [4]:
def clean_documents(document):
    """Placeholder for corpus pre-processing; currently a pass-through.

    Args:
        document: The raw input to clean. In this notebook it is called with
            the whole list of raw texts (``dataset.data``).

    Returns:
        The very same object that was passed in, untouched.
    """
    # placeholder: write data preparation code here
    cleaned = document
    return cleaned

###  Data Processing 

#### This is the section to modify if you have other sources
- Load in the documents from its source
- The LDA topic model algorithm requires a document word matrix as the main input.
- Vectorise the document using count vectorizing
- LDA can only use raw term counts because it is a probabilistic graphical model


In [5]:

# Load the 20 Newsgroups corpus with headers, footers and quoted replies
# stripped, so the topics are driven by the message bodies.
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = clean_documents(dataset.data)

# Vocabulary size cap for the document-term matrix.
no_features = 1000
# max_df / min_df drop terms that are too common or too rare to be informative;
# English stop words are removed as well.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf_vectorized_documents = tf_vectorizer.fit_transform(documents)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

#### Create the LDA model and apply LDA to the corpus of document
- Fit and transform the vectorized documents (tf)
- Display the model attributes
- Displays the topics terms
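- Since most cells of the document-term matrix are zeros, the vectorizer output (tf_vectorized_documents) is stored as a sparse matrix to save memory; the LDA result (lda_output) is a dense document-topic array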


In [6]:

# The number of topics is a free choice; 10 is only a first guess.
no_topics = 10

lda_model = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)

# Fit the model and obtain the per-document topic weights in one call.
lda_output = lda_model.fit_transform(tf_vectorized_documents)

# Show the model's attributes.
print(lda_model)


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=5,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


### Preview the terms of each topic
- Displays the topics terms

In [7]:
# Number of highest-weighted terms to list for each topic.
no_top_words = 20
display_topics(
    model=lda_model,
    feature_names=tf_feature_names,
    no_top_words=no_top_words,
)

Topic 0:
people gun armenian armenians war turkish states israel said children jews 000 state new guns israeli vs military years american
Topic 1:
government people law mr use president don think right public make state going privacy private security know new rights want
Topic 2:
space program output entry data nasa use science research build section center launch time high earth year rules long satellite
Topic 3:
key car chip used keys bike use bit clipper number phone like cars just engine ground des algorithm good secret
Topic 4:
edu file com available mail ftp files information image send list use version server email pub software cs code window
Topic 5:
god people does jesus say think believe don know just way like true question life time christian did point bible
Topic 6:
windows use drive thanks does problem know card like using db scsi dos disk bit need pc memory mac work
Topic 7:
ax max b8f g9v a86 pl 145 1d9 0t 34u 1t 3t giz bhj wm 2di 75u 2tm cx bxn
Topic 8:
just don like th

### Finding dominant topics for a document
- To classify a document as belonging to a particular topic, a logical approach is to see which topic has the highest contribution to that document and assign it.


In [8]:

# Column names: one per topic.
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# Index names: one per document.
docnames = ["Doc" + str(i) for i in range(len(documents))]

# Document-topic table; weights are rounded to 2 decimals for display only.
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Pick the dominant topic from the unrounded weights: rounding can turn
# distinct weights into ties, which argmax would resolve to the lowest
# topic index rather than the truly largest weight.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    """Return the CSS text colour for a cell: green above 0.1, black otherwise.

    Args:
        val: The cell value; it is compared with ``.1`` using ``>``.

    Returns:
        The string ``'color: green'`` or ``'color: black'``.
    """
    if val > .1:
        color = 'green'
    else:
        color = 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    """Return the CSS font weight for a cell: 700 above 0.1, 400 otherwise.

    Args:
        val: The cell value; it is compared with ``.1`` using ``>``.

    Returns:
        The string ``'font-weight: 700'`` or ``'font-weight: 400'``.
    """
    if val > .1:
        weight = 700
    else:
        weight = 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply the styles to the first 25 documents.
# Only the topic-weight columns are styled: `dominant_topic` holds a topic
# index, not a weight, so the "> .1" rule must not be applied to it.
preview_styler = df_document_topic.head(25).style
for cell_style in (color_green, make_bold):
    # Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever
    # this installation provides.
    if hasattr(preview_styler, "map"):
        preview_styler = preview_styler.map(cell_style, subset=topicnames)
    else:
        preview_styler = preview_styler.applymap(cell_style, subset=topicnames)

df_document_topic2 = preview_styler
df_document_topic2



Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.53,0.0,0.0,0.0,0.0,0.24,0.0,0.0,0.2,0.0,0
Doc1,0.0,0.0,0.0,0.0,0.0,0.55,0.0,0.0,0.42,0.0,5
Doc2,0.3,0.28,0.0,0.0,0.0,0.0,0.0,0.0,0.39,0.0,8
Doc3,0.0,0.24,0.0,0.43,0.0,0.0,0.08,0.0,0.23,0.0,3
Doc4,0.01,0.01,0.01,0.01,0.13,0.31,0.01,0.01,0.53,0.01,8
Doc5,0.01,0.01,0.01,0.01,0.01,0.51,0.01,0.01,0.42,0.01,5
Doc6,0.0,0.09,0.0,0.0,0.04,0.0,0.3,0.0,0.37,0.19,8
Doc7,0.0,0.0,0.0,0.0,0.0,0.52,0.0,0.0,0.4,0.05,5
Doc8,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,8
Doc9,0.0,0.0,0.0,0.45,0.0,0.0,0.15,0.0,0.38,0.0,3


### Question: Inspect the content of the documents

- Visually look for a document with a dominant topic > 0.6 ( doc 14/topic 6 , doc 18/topic 3)
- Use the document number to access the original corpus
- prints the related document / text

In [None]:
# TO DO:
# your codes


### Optional Question:
- If we want the table above to assign a different color (e.g. red) to documents with topics weight of 0.5, what code changes are needed?


### Optional Question:
* Having topics named as topic0, topic1 etc isn't helpful in a real application. Discuss several possible ways to assign meaningful names to the row header.
    

#### Reference:
- www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/
