In [5]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
# %matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

Read in the texts

In [8]:
df = pd.read_csv('../data/aft.csv')
txt = df.text.values.tolist()
txt = [re.sub('\s+', ' ', sent) for sent in txt]
txt = [re.sub("\'", "", sent) for sent in txt]

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

words = list(sent_to_words(txt))

# print(words[:1])

In [9]:
bigram = gensim.models.Phrases(
  words, min_count = 10, threshold = 50, 
  connector_words = gensim.models.phrases.ENGLISH_CONNECTOR_WORDS
) # higher threshold fewer phrases.

trigram = gensim.models.Phrases(
  bigram[words], threshold=50,
  connector_words = gensim.models.phrases.ENGLISH_CONNECTOR_WORDS
)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# print(trigram_mod[bigram_mod[words[2]]])

# Define functions for stopwords, bigrams, trigrams and lemmatization

from gensim.parsing.preprocessing import STOPWORDS
# added_stopwords = STOPWORDS.union(set(['likes', 'play']))

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in STOPWORDS] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

words = make_bigrams(words)
words = make_trigrams(words)
words = remove_stopwords(words)

In [10]:
# Create Dictionary
id2word = corpora.Dictionary(words)
texts = words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# print(corpus[:1])
# [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

lda_model = gensim.models.ldamodel.LdaModel(
  corpus=corpus, id2word=id2word, num_topics=20, 
  random_state=100, update_every=1, chunksize=100,
  passes=10, alpha='auto', per_word_topics=True
)

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.084*"said" + 0.027*"went" + 0.023*"little" + 0.017*"came" + 0.015*"got" + '
  '0.014*"come" + 0.012*"home" + 0.010*"house" + 0.010*"oh" + 0.009*"going"'),
 (1,
  '0.195*"wolf" + 0.076*"goat" + 0.029*"journeyman" + 0.021*"george" + '
  '0.018*"mighty" + 0.017*"skin" + 0.014*"honey" + 0.012*"der" + 0.010*"bees" '
  '+ 0.010*"goats"'),
 (2,
  '0.202*"hare" + 0.079*"lion" + 0.078*"cage" + 0.018*"lions" + '
  '0.015*"animals" + 0.007*"forest" + 0.005*"buffaloes" + 0.005*"earth" + '
  '0.005*"range" + 0.004*"palm"'),
 (3,
  '0.116*"peter" + 0.065*"bridge" + 0.023*"millers" + 0.019*"boar" + '
  '0.018*"bran" + 0.017*"fought" + 0.010*"aye" + 0.009*"annoyed" + 0.004*"og" '
  '+ 0.001*"elks"'),
 (4,
  '0.044*"heels" + 0.041*"raja" + 0.027*"tank" + 0.022*"traveler" + '
  '0.021*"rupees" + 0.020*"obtain" + 0.013*"pitcher" + 0.013*"swans" + '
  '0.012*"flat" + 0.012*"plowing"'),
 (5,
  '0.051*"thee" + 0.033*"thy" + 0.030*"thou" + 0.025*"wild" + 0.023*"juan" + '
  '0.021*"ye" + 0.017*"snak

In [11]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(
