# Topic Modeling With Gensim LDA - Unigram, Bigram, Mallet
By: Jiali Huang

## Load/Install packages

In [1]:
# load packages

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import re
import numpy as np
import pandas as pd

In [20]:
# download libraries

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('gutenberg')
nltk.download('wordnet')
!pip install gensim==3.8.3 #specific version needed for lda mallet with set seed feature

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [3]:
# import stopwords for preprocessing
# (stopwords is already imported in the first cell; the repeated import is harmless)
# stops holds NLTK's English stopword list and is read by the token filter further down

from nltk.corpus import stopwords
stops = stopwords.words('english')


In [4]:
# build the corpus from the 20 newsgroups training subset
# only the 'data' entry (the documents) is kept, the targets are dropped: unsupervised learning

from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset='train')['data']


Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


## Pre-processing

In [5]:
# pre-process text

# raw strings keep the regex escapes (\s, \S) from being read as invalid Python string escapes
newsgroups = [re.sub(r'\s+', ' ', sent) for sent in newsgroups] # replace new lines (any whitespace run) with a single space
newsgroups = [re.sub(r'(\S+@\S+\s*)', '', sent) for sent in newsgroups] # remove emails


In [6]:
# preview the first five documents after the regex clean-up
newsgroups[:5]

["From: (where's my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15 I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL ---- brought to you by your neighborhood Lerxst ---- ",
 "From: (Guy Kuo) Subject: SI Clock Poll - Final Call Summary: Final call for SI clock reports Keywords: SI,acceleration,clock,upgrade Article-I.D.: shelley.1qvfo9INNc3s Organization: University of Washington Lines: 11 NNTP-Posting-Host: carson.u.washington.edu A fair number of brave souls who upgraded 

In [17]:
# every document is still a single string; split each one into a list of tokens

tokens_newsgroups = list(map(word_tokenize, newsgroups))


In [18]:
# data is still noisy (punctuation, numbers and header fields are present as tokens),
# filtering will take care of the rest

tokens_newsgroups[1]

['From',
 ':',
 '(',
 'Guy',
 'Kuo',
 ')',
 'Subject',
 ':',
 'SI',
 'Clock',
 'Poll',
 '-',
 'Final',
 'Call',
 'Summary',
 ':',
 'Final',
 'call',
 'for',
 'SI',
 'clock',
 'reports',
 'Keywords',
 ':',
 'SI',
 ',',
 'acceleration',
 ',',
 'clock',
 ',',
 'upgrade',
 'Article-I.D',
 '.',
 ':',
 'shelley.1qvfo9INNc3s',
 'Organization',
 ':',
 'University',
 'of',
 'Washington',
 'Lines',
 ':',
 '11',
 'NNTP-Posting-Host',
 ':',
 'carson.u.washington.edu',
 'A',
 'fair',
 'number',
 'of',
 'brave',
 'souls',
 'who',
 'upgraded',
 'their',
 'SI',
 'clock',
 'oscillator',
 'have',
 'shared',
 'their',
 'experiences',
 'for',
 'this',
 'poll',
 '.',
 'Please',
 'send',
 'a',
 'brief',
 'message',
 'detailing',
 'your',
 'experiences',
 'with',
 'the',
 'procedure',
 '.',
 'Top',
 'speed',
 'attained',
 ',',
 'CPU',
 'rated',
 'speed',
 ',',
 'add',
 'on',
 'cards',
 'and',
 'adapters',
 ',',
 'heat',
 'sinks',
 ',',
 'hour',
 'of',
 'usage',
 'per',
 'day',
 ',',
 'floppy',
 'disk',
 'fun

In [30]:
# manual filter, can use built in if preferred

def filter(file):
  """Clean tokenized documents: drop unwanted tokens, lowercase and lemmatize the rest.

  A token is kept only if its lowercase form is not in ``stops``, it is longer
  than 3 characters, and it consists of alphabetic characters only.

  NOTE: this name shadows Python's built-in ``filter``; it is kept so that
  existing calls in the notebook keep working.

  Args:
    file: iterable of documents, each an iterable of string tokens.

  Returns:
    A list with one list per input document, holding the lowercased,
    lemmatized tokens that passed the conditions (may be empty).
  """
  lemmatizer = WordNetLemmatizer()  # built once instead of once per kept token
  stop_words = set(stops)  # set membership test instead of scanning the list for every token
  filtered_token = []
  for sent in file:
    temp_list = []
    for token in sent:
      lowered = token.lower()
      if lowered not in stop_words and len(token) > 3 and token.isalpha():  # conditions
        temp_list.append(lemmatizer.lemmatize(lowered)) # lemmatization
    filtered_token.append(temp_list)
  return filtered_token

# run the filter over every tokenized document
filtered_tokens_newsgroups = filter(tokens_newsgroups)


In [31]:
# after filtering, looks much cleaner:
# only lowercased, lemmatized alphabetic words longer than 3 characters remain

filtered_tokens_newsgroups[1]

['subject',
 'clock',
 'poll',
 'final',
 'call',
 'summary',
 'final',
 'call',
 'clock',
 'report',
 'keywords',
 'acceleration',
 'clock',
 'upgrade',
 'organization',
 'university',
 'washington',
 'line',
 'fair',
 'number',
 'brave',
 'soul',
 'upgraded',
 'clock',
 'oscillator',
 'shared',
 'experience',
 'poll',
 'please',
 'send',
 'brief',
 'message',
 'detailing',
 'experience',
 'procedure',
 'speed',
 'attained',
 'rated',
 'speed',
 'card',
 'adapter',
 'heat',
 'sink',
 'hour',
 'usage',
 'floppy',
 'disk',
 'functionality',
 'floppy',
 'especially',
 'requested',
 'summarizing',
 'next',
 'day',
 'please',
 'network',
 'knowledge',
 'base',
 'done',
 'clock',
 'upgrade',
 'answered',
 'poll',
 'thanks']

## Gensim LDA Unigram

In [32]:
# inputs for the gensim unigram model:
# a dictionary built from the filtered tokens, and every document converted to bag-of-words with it

dictionary = corpora.Dictionary(filtered_tokens_newsgroups)
corpus = list(map(dictionary.doc2bow, filtered_tokens_newsgroups))


In [46]:
# train the vanilla gensim LDA model on the unigram corpus
# (20 topics, 15 passes, fixed random_state so runs are repeatable)

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=20,
    id2word=dictionary,
    passes=15,
    random_state=12,
)

# print the topics, 10 words each

topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.028*"key" + 0.022*"number" + 0.021*"chip" + 0.019*"block" + 0.017*"bit" + 0.016*"serial" + 0.015*"encrypted" + 0.014*"message" + 0.013*"cipher" + 0.011*"random"')
(1, '0.028*"drive" + 0.014*"disk" + 0.013*"problem" + 0.012*"card" + 0.011*"system" + 0.009*"scsi" + 0.008*"driver" + 0.008*"memory" + 0.008*"window" + 0.008*"file"')
(2, '0.010*"state" + 0.008*"government" + 0.006*"american" + 0.006*"year" + 0.006*"crime" + 0.006*"people" + 0.005*"firearm" + 0.005*"weapon" + 0.005*"rate" + 0.005*"control"')
(3, '0.032*"line" + 0.030*"subject" + 0.028*"organization" + 0.016*"university" + 0.009*"would" + 0.009*"thanks" + 0.009*"know" + 0.008*"anyone" + 0.008*"writes" + 0.008*"article"')
(4, '0.012*"israel" + 0.011*"right" + 0.011*"said" + 0.010*"israeli" + 0.010*"people" + 0.007*"state" + 0.006*"arab" + 0.006*"would" + 0.005*"know" + 0.004*"time"')
(5, '0.013*"drug" + 0.012*"disease" + 0.011*"medical" + 0.010*"health" + 0.009*"study" + 0.009*"patient" + 0.009*"doctor" + 0.007*"child" +

In [35]:
# Perplexity figure, evaluated on the training corpus
# NOTE(review): gensim documents log_perplexity as a per-word likelihood bound (hence the negative value),
# not the perplexity itself — confirm before comparing against other tools

print('Perplexity: ', ldamodel.log_perplexity(corpus))

# Coherence score (c_v), computed against the filtered tokenized texts

coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=filtered_tokens_newsgroups,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Perplexity:  -9.568304192787805
Coherence Score:  0.5459241301147186


## Gensim LDA Bigram

In [40]:
# Gensim LDA, bigrams

# initiate bigrams: fit a phrase model on the filtered tokens
# (min_count and threshold control how readily adjacent tokens are joined)

bigram = gensim.models.Phrases(
    filtered_tokens_newsgroups,
    min_count=3,
    threshold=1,
)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# create bigrams: joined pairs become single tokens such as 'final_call'
# NOTE(review): the phraser is applied twice to each document — confirm the second pass is intended

texts_bigrams = [bigram_mod[bigram_mod[sent]] for sent in filtered_tokens_newsgroups]


In [41]:
# show the same document before and after bigram detection
print(filtered_tokens_newsgroups[1])
print(texts_bigrams[1])

['subject', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'clock', 'report', 'keywords', 'acceleration', 'clock', 'upgrade', 'organization', 'university', 'washington', 'line', 'fair', 'number', 'brave', 'soul', 'upgraded', 'clock', 'oscillator', 'shared', 'experience', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experience', 'procedure', 'speed', 'attained', 'rated', 'speed', 'card', 'adapter', 'heat', 'sink', 'hour', 'usage', 'floppy', 'disk', 'functionality', 'floppy', 'especially', 'requested', 'summarizing', 'next', 'day', 'please', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'answered', 'poll', 'thanks']
['subject', 'clock', 'poll', 'final_call', 'summary', 'final_call', 'clock', 'report', 'keywords', 'acceleration', 'clock_upgrade', 'organization_university', 'washington_line', 'fair', 'number', 'brave_soul', 'upgraded', 'clock_oscillator', 'shared', 'experience', 'poll', 'please_send', 'brief_message', 'detailing', 'experience', '

In [43]:
# create dictionary, corpus
# the dictionary is built from the bigram texts (texts_bigrams);
# the previous argument, corpus_bigrams, is not defined anywhere in this notebook

dictionary_bigram = corpora.Dictionary(texts_bigrams)
corpus_bigram = [dictionary_bigram.doc2bow(text) for text in texts_bigrams]

In [48]:
# train the gensim LDA model on the bigram corpus
# (same settings as the unigram model: 20 topics, 15 passes, random_state 12)

ldamodel_bigram = gensim.models.ldamodel.LdaModel(
    corpus_bigram,
    num_topics=20,
    id2word=dictionary_bigram,
    passes=15,
    random_state=12,
)
topics_bigram = ldamodel_bigram.print_topics(num_words=10)
for topic in topics_bigram:
    print(topic)

(0, '0.009*"organization_netcom" + 0.007*"online_communication" + 0.006*"guest_line" + 0.006*"communication_service" + 0.005*"expose_event" + 0.005*"water" + 0.003*"service_login" + 0.003*"subject_diamond" + 0.003*"uart" + 0.003*"humor"')
(1, '0.011*"firearm" + 0.010*"gun" + 0.009*"weapon" + 0.005*"handgun" + 0.004*"criminal" + 0.004*"control" + 0.004*"case_western" + 0.004*"reserve_university" + 0.003*"lost" + 0.003*"shall"')
(2, '0.007*"cursor" + 0.004*"idle" + 0.004*"doug" + 0.004*"sphere" + 0.004*"adcom" + 0.003*"organization_motorola" + 0.003*"mazda" + 0.003*"resistor" + 0.003*"equation" + 0.003*"point"')
(3, '0.008*"keith" + 0.007*"organization_california" + 0.006*"institute_technology" + 0.005*"pasadena_line" + 0.005*"keith_allan" + 0.004*"schneider_subject" + 0.004*"easter" + 0.003*"muslim" + 0.003*"political_atheist" + 0.003*"somalia"')
(4, '0.013*"subject" + 0.009*"line" + 0.006*"system" + 0.006*"window" + 0.006*"problem" + 0.005*"using" + 0.005*"file" + 0.005*"also" + 0.005*

In [45]:
# Perplexity figure for the bigram model, evaluated on the bigram corpus
# NOTE(review): gensim documents log_perplexity as a per-word likelihood bound (hence the negative value),
# not the perplexity itself — confirm before comparing against other tools

print('Perplexity: ', ldamodel_bigram.log_perplexity(corpus_bigram))

# Coherence score (c_v), computed against the bigram texts

coherence_model_lda_bigram = CoherenceModel(
    model=ldamodel_bigram,
    texts=texts_bigrams,
    dictionary=dictionary_bigram,
    coherence='c_v',
)
coherence_lda_bigram = coherence_model_lda_bigram.get_coherence()
print('Coherence Score: ', coherence_lda_bigram)

Perplexity:  -11.86768571645846
Coherence Score:  0.49331852149864647


## Gensim LDA Mallet Unigram

In [47]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip , upload zip file to colab
!unzip mallet-2.0.8.zip


Archive:  mallet-2.0.8.zip
   creating: mallet-2.0.8/
  inflating: mallet-2.0.8/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/mallet-2.0.8/
  inflating: __MACOSX/mallet-2.0.8/._.DS_Store  
   creating: mallet-2.0.8/bin/
  inflating: mallet-2.0.8/bin/classifier2info  
   creating: __MACOSX/mallet-2.0.8/bin/
  inflating: __MACOSX/mallet-2.0.8/bin/._classifier2info  
  inflating: mallet-2.0.8/bin/csv2classify  
  inflating: __MACOSX/mallet-2.0.8/bin/._csv2classify  
  inflating: mallet-2.0.8/bin/csv2vectors  
  inflating: __MACOSX/mallet-2.0.8/bin/._csv2vectors  
  inflating: mallet-2.0.8/bin/mallet  
  inflating: __MACOSX/mallet-2.0.8/bin/._mallet  
  inflating: mallet-2.0.8/bin/mallet.bat  
  inflating: __MACOSX/mallet-2.0.8/bin/._mallet.bat  
  inflating: mallet-2.0.8/bin/mallethon  
  inflating: __MACOSX/mallet-2.0.8/bin/._mallethon  
  inflating: mallet-2.0.8/bin/prepend-license.sh  
  inflating: __MACOSX/mallet-2.0.8/bin/._prepend-license.sh  
  inflating: mallet-2.0.8/bi

In [49]:
# path to the mallet executable inside the unzipped archive

mallet_path = '/content/mallet-2.0.8/bin/mallet'

# train LDA through gensim's mallet wrapper on the unigram corpus (20 topics, fixed seed)

ldamallet = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=corpus,
    num_topics=20,
    id2word=dictionary,
    random_seed=12,
)
topics_mallet = ldamallet.print_topics(num_words=10)
for topic in topics_mallet:
    print(topic)


(0, '0.017*"israel" + 0.014*"israeli" + 0.013*"jew" + 0.009*"state" + 0.009*"arab" + 0.008*"jewish" + 0.007*"writes" + 0.007*"world" + 0.007*"country" + 0.007*"subject"')
(1, '0.027*"space" + 0.009*"nasa" + 0.007*"system" + 0.007*"launch" + 0.007*"earth" + 0.007*"research" + 0.006*"satellite" + 0.006*"center" + 0.006*"mission" + 0.006*"data"')
(2, '0.034*"drive" + 0.025*"card" + 0.022*"problem" + 0.019*"system" + 0.017*"disk" + 0.016*"driver" + 0.014*"work" + 0.012*"hard" + 0.011*"window" + 0.011*"scsi"')
(3, '0.012*"year" + 0.012*"president" + 0.012*"money" + 0.009*"people" + 0.009*"state" + 0.007*"american" + 0.007*"work" + 0.007*"clinton" + 0.006*"program" + 0.006*"care"')
(4, '0.018*"chip" + 0.017*"system" + 0.014*"encryption" + 0.012*"clipper" + 0.011*"information" + 0.010*"message" + 0.009*"government" + 0.009*"public" + 0.009*"key" + 0.009*"access"')
(5, '0.019*"armenian" + 0.016*"people" + 0.011*"turkish" + 0.007*"time" + 0.007*"woman" + 0.006*"greek" + 0.006*"turk" + 0.006*"ar

In [50]:
# Coherence score (c_v) of the mallet unigram model, computed against the filtered tokenized texts

coherence_model_lda_mallet = CoherenceModel(
    model=ldamallet,
    texts=filtered_tokens_newsgroups,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda_mallet = coherence_model_lda_mallet.get_coherence()
print('Coherence Score: ', coherence_lda_mallet)

Coherence Score:  0.5988322391643066


## Gensim LDA Mallet Bigram

In [51]:
# train LDA through gensim's mallet wrapper on the bigram corpus (20 topics, fixed seed)

ldamallet_bigrams = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=corpus_bigram,
    num_topics=20,
    id2word=dictionary_bigram,
    random_seed=12,
)
topics_mallet_bigrams = ldamallet_bigrams.print_topics(num_words=10)
for topic in topics_mallet_bigrams:
    print(topic)

(0, '0.013*"year" + 0.009*"game" + 0.007*"writes" + 0.006*"baseball" + 0.006*"last_year" + 0.005*"line_article" + 0.005*"good" + 0.005*"team" + 0.005*"player" + 0.004*"organization_university"')
(1, '0.013*"window" + 0.009*"program" + 0.008*"file" + 0.008*"display" + 0.007*"application" + 0.007*"image" + 0.007*"subject" + 0.005*"version" + 0.005*"server" + 0.005*"code"')
(2, '0.007*"claim" + 0.007*"question" + 0.007*"argument" + 0.007*"religion" + 0.006*"evidence" + 0.005*"statement" + 0.005*"science" + 0.005*"true" + 0.005*"exist" + 0.005*"belief"')
(3, '0.007*"case" + 0.007*"state" + 0.007*"people" + 0.006*"government" + 0.006*"control" + 0.005*"weapon" + 0.005*"law" + 0.005*"gun" + 0.005*"court" + 0.005*"crime"')
(4, '0.007*"president" + 0.006*"state" + 0.006*"bill" + 0.006*"program" + 0.005*"united_state" + 0.005*"group" + 0.004*"plan" + 0.004*"support" + 0.004*"year" + 0.004*"work"')
(5, '0.012*"information" + 0.010*"file" + 0.009*"list" + 0.007*"system" + 0.006*"book" + 0.005*"si

In [52]:
# Coherence score (c_v) of the mallet bigram model, computed against the bigram texts

coherence_model_lda_mallet_bigrams = CoherenceModel(
    model=ldamallet_bigrams,
    texts=texts_bigrams,
    dictionary=dictionary_bigram,
    coherence='c_v',
)
coherence_lda_mallet_bigrams = coherence_model_lda_mallet_bigrams.get_coherence()
print('Coherence Score: ', coherence_lda_mallet_bigrams)

Coherence Score:  0.5484432361243267


# Summary

In [None]:
# Without the Mallet model, unigrams performed better than bigrams in terms of coherence score.
# The topics are well separated; you can see that some are about technology, politics, or the Bible.
# Since this is unsupervised learning, the model performed well at differentiating the topics, and the keywords were intuitive.
# Other ways to improve could be testing out BERT, incorporating transfer learning, or more fine-tuning/filtering.
# With the Mallet model, there is a noticeable increase in coherence score for both unigrams and bigrams.
