In [None]:
"""
Run this cell first: it clones the repository that contains the Jupyter notebook and the data files required for this tutorial.
"""

# Shell command: clones the repository into ./itu108_topicmodel.
# If that folder already exists and is not empty (e.g. the cell is run twice),
# git reports an error and leaves the existing copy untouched.
!git clone https://github.com/limaih/itu108_topicmodel

In [None]:
''' step 1. import necessary libraries
'''
import nltk
# Corpora used later: the stop-word lists and the WordNet database
# that backs the lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
# Some NLTK releases also need the Open Multilingual WordNet data before
# WordNet lookups work; downloading it is harmless when it is not required.
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import gensim
from gensim import corpora

import string
from pathlib import Path
from pprint import pprint

In [None]:
''' step 2. read in files (from directory) for analysis
'''
# r'' is a raw string literal, so Windows path backslashes are not treated as escapes
data_folder = Path(r'news')

# corpus[i] holds the full text of the file whose name is filenames[i]
corpus = []
filenames = []

# iterdir() yields entries in arbitrary order; sort them so that document
# indices are the same on every run and on every machine.
for filename in sorted(data_folder.iterdir()):
    # skip sub-directories (e.g. .ipynb_checkpoints) - they cannot be opened as text
    if not filename.is_file():
        continue
    # ISO-8859-1 maps every byte to a character, so reading never fails on
    # unexpected bytes; the context manager closes the file even on error.
    with filename.open('r', encoding="ISO-8859-1") as fp:
        corpus.append(fp.read())
    # keep the filename for later use
    filenames.append(filename.name)

print(len(corpus))

In [None]:
''' step 3. conduct preprocessing steps
'''
# English stemmer instance (created here; not used elsewhere in this cell)
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# extra, corpus-specific stop words can be listed here
domain_stop = []

# English stop words plus the domain-specific additions
stop = set(stopwords.words('english')) | set(domain_stop)
# punctuation characters to strip from the text
exclude = set(string.punctuation)
# WordNet lemmatizer used to normalise each token
lemma = WordNetLemmatizer()

def clean(doc):
    """Return `doc` lower-cased, without punctuation or stop words, lemmatized.

    The result is a single string of space-separated tokens.
    """
    # delete every character listed in `exclude` in one pass
    strip_table = str.maketrans('', '', ''.join(exclude))
    text = doc.lower().translate(strip_table)
    # drop stop words, then lemmatize what is left
    kept = [lemma.lemmatize(token) for token in text.split() if token not in stop]
    return ' '.join(kept)

# one list of cleaned tokens per document, in the same order as `corpus`
doc_clean = [clean(text).split() for text in corpus]

In [None]:
''' step 4. prepare word representation - term frequency or doc term matrix
'''
# map every distinct token in the cleaned corpus to an integer id
dictionary = corpora.Dictionary(doc_clean)
# bag-of-words form of each document: a list of (token id, count) pairs
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

In [None]:
''' step 5. create lda model
'''
# Number of topics to extract and number of top words to show per topic.
# These are starting values only - change them to suit your corpus.
topic_num = 5
word_num = 10

Lda = gensim.models.ldamodel.LdaModel
# random_state fixes the seed so that re-running the cell yields the same topics
ldamodel = Lda(
    doc_term_matrix,
    num_topics=topic_num,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

pprint(ldamodel.print_topics(num_topics=topic_num, num_words=word_num))

In [None]:
''' step 6. Compute Perplexity
'''
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself; perplexity = 2 ** (-bound). Lower perplexity is better.
# Note: this is measured on the same documents the model was trained on.
per_word_bound = ldamodel.log_perplexity(doc_term_matrix)
print('Per-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))


In [None]:
''' step 7. Assigned Topic and Probability
'''
print('\nFile name and its corresponding topic id with probability:')
# topic id -> list of (filename, probability) for the documents whose
# highest-probability topic is that id
dic_topic_doc = {}
for doc_idx, tokens in enumerate(doc_clean):
    doc_name = filenames[doc_idx]

    # topic distribution of this document as (topic id, probability) pairs
    topic_probs = ldamodel.get_document_topics(dictionary.doc2bow(tokens))

    # most probable topic first
    ranked = sorted(topic_probs, key=lambda pair: pair[1], reverse=True)

    # show the filename together with its ranked topic distribution
    print(doc_name, ranked)

    # the first entry is the top contributing topic
    best_topic, best_prob = ranked[0]

    # file the document under its best topic
    if best_topic not in dic_topic_doc:
        dic_topic_doc[best_topic] = []
    dic_topic_doc[best_topic].append((doc_name, best_prob))

In [None]:
print('\nTopic id, number of documents, list of documents with probability and represented topic words:')
for topic_id, members in dic_topic_doc.items():
    # documents assigned to this topic, strongest association first
    ranked_members = sorted(members, key=lambda pair: pair[1], reverse=True)
    print(topic_id, len(members), ranked_members)
    # top words of this topic
    print(ldamodel.print_topic(topic_id, word_num))

In [None]:
# install pyLDAvis package
!pip install pyLDAvis

In [None]:
''' step 8. Visualize topics and keywords
'''
# plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

# visualize the topics and keywords
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
vis