# <center>Implementing LDA in Python</center>

<center>Dr. W.J.B. Mattingly</center>

<center>Smithsonian Data Science Lab and United States Holocaust Memorial Museum</center>

<center>February 2021</center>

## Key Concepts in this Notebook

## Introduction

## Importing the Required Libraries

In [1]:
#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
%connect_info

{
  "shell_port": 9027,
  "iopub_port": 9029,
  "stdin_port": 9028,
  "control_port": 9026,
  "hb_port": 9025,
  "ip": "127.0.0.1",
  "key": "7a9e43c5-68c7-4dde-8856-83604f6eb8e3",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing C:\Users\KARTIKS\AppData\Local\Temp\tmp-129084popkxW5xJm9.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


## Preparing the Data

In [3]:
def load_data(file):
    with open (file, "r", encoding="utf-8") as f:
        data = json.load(f) 
    return (data)

def write_data(file, data):
    with open (file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)


In [4]:
stopwords = stopwords.words("english")

In [5]:
print (stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
data = load_data("data/ushmm_dn.json")["texts"]

print (data[0][0:90])

 My name David Kochalski. I was born in a small town called , and I was born May 5, 1928. 


In [10]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_text.append(token.lemma_)
        final = " ".join(new_text)
        texts_out.append(final)
    return (texts_out)


lemmatized_texts = lemmatization(data)
print (lemmatized_texts[0][0:90])

name bear small town call bear well very hard work child father mother small mill flour bu


In [11]:
print (lemmatized_texts[0][0:90])

name bear small town call bear well very hard work child father mother small mill flour bu


In [38]:
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(lemmatized_texts)

print (data_words[0][0:20])

['name', 'bear', 'small', 'town', 'call', 'bear', 'well', 'very', 'hard', 'work', 'child', 'father', 'mother', 'small', 'mill', 'flour', 'buckwheat', 'prosperous', 'comfortable', 'go']


In [53]:
data_words

[['name',
  'bear',
  'small',
  'town',
  'call',
  'bear',
  'well',
  'very',
  'hard',
  'work',
  'child',
  'father',
  'mother',
  'small',
  'mill',
  'flour',
  'buckwheat',
  'prosperous',
  'comfortable',
  'go',
  'school',
  'public',
  'school',
  'morning',
  'afternoon',
  'go',
  'religious',
  'school',
  'almost',
  'late',
  'night',
  'well',
  'raise',
  'spirit',
  'no',
  'school',
  'little',
  'city',
  'segregate',
  'mind',
  'small',
  'town',
  'say',
  'majority',
  'people',
  'small',
  'town',
  'jewish',
  'people',
  'town',
  'somehow',
  'know',
  'separate',
  'jewish',
  'child',
  'catholic',
  'child',
  'know',
  'most',
  'people',
  'use',
  'friend',
  'feel',
  'maybe',
  'personally',
  'know',
  'lot',
  'incident',
  'small',
  'little',
  'call',
  'separate',
  'other',
  'word',
  'hardly',
  'get',
  'together',
  'incident',
  'incident',
  'pleasant',
  'incident',
  'call',
  'house',
  'people',
  'regardless',
  'religious',
  

In [42]:
#BIGRAMS AND TRIGRAMS
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=50)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return([bigram[doc] for doc in texts])

def make_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_bigrams)

print (data_bigrams_trigrams[0][0:50])

['name', 'bear', 'small', 'town', 'call', 'bear', 'well', 'very', 'hard', 'work', 'child', 'father', 'mother', 'small', 'mill', 'flour', 'buckwheat', 'prosperous', 'comfortable', 'go', 'school', 'public_school', 'morning', 'afternoon', 'go', 'religious', 'school', 'almost', 'late', 'night', 'well', 'raise', 'spirit', 'no', 'school', 'little', 'city', 'segregate', 'mind', 'small', 'town', 'say', 'majority', 'people', 'small', 'town', 'jewish', 'people', 'town', 'somehow']


In [44]:
print(data_bigrams_trigrams[0][:50])

['name', 'bear', 'small', 'town', 'call', 'bear', 'well', 'very', 'hard', 'work', 'child', 'father', 'mother', 'small', 'mill', 'flour', 'buckwheat', 'prosperous', 'comfortable', 'go', 'school', 'public_school', 'morning', 'afternoon', 'go', 'religious', 'school', 'almost', 'late', 'night', 'well', 'raise', 'spirit', 'no', 'school', 'little', 'city', 'segregate', 'mind', 'small', 'town', 'say', 'majority', 'people', 'small', 'town', 'jewish', 'people', 'town', 'somehow']


In [47]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]
# print (corpus[0][0:20])

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words  = []
words_missing_in_tfidf = []
for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = [] #reinitialize to be safe. You can skip this.
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf socre 0 will be missing

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow


In [10]:
# id2word = corpora.Dictionary(all_texts)

# corpus = []
# for text in data_words:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

# print (corpus[0][0:20])

# word = id2word[[0][:1][0]]
# print (word)

In [14]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus[:-1],
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")


In [17]:
test_doc = corpus[-1]

vector = lda_model[test_doc]
print (vector)

def Sort(sub_li):
    sub_li.sort(key = lambda x: x[1])
    sub_li.reverse()
    return (sub_li)
new_vector = Sort(vector)
print (new_vector)

[(0, 0.045874324), (1, 0.035836946), (2, 0.4376528), (5, 0.15934138), (7, 0.057556253), (8, 0.07231899), (9, 0.19118835)]
[(2, 0.4376528), (9, 0.19118835), (5, 0.15934138), (8, 0.07231899), (7, 0.057556253), (0, 0.045874324), (1, 0.035836946)]


In [20]:
lda_model.save("models/test_model.model")

In [22]:
new_model = gensim.models.ldamodel.LdaModel.load("models/test_model.model")

In [23]:
test_doc = corpus[-1]

vector = new_model[test_doc]
print (vector)

def Sort(sub_li):
    sub_li.sort(key = lambda x: x[1])
    sub_li.reverse()
    return (sub_li)
new_vector = Sort(vector)
print (new_vector)

[(0, 0.045875695), (1, 0.035836957), (2, 0.43765357), (5, 0.15934144), (7, 0.057556026), (8, 0.072318755), (9, 0.1911866)]
[(2, 0.43765357), (9, 0.1911866), (5, 0.15934144), (8, 0.072318755), (7, 0.057556026), (0, 0.045875695), (1, 0.035836957)]


## Vizualizing the Data

In [21]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
# vis