In [2]:
# The code is inspired with https://github.com/susanli2016/NLP-with-Python/blob/master/LDA_news_headlines.ipynb
# and https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
import pandas as pd
import datatable as dt

In [3]:
df_train = dt.fread("./usnavy_data/source_data/tweet_sentiment_input_file.xlsx").to_pandas()
dfs = {'train': df_train}

In [4]:
df_train = df_train[['sentiment', 'text']]

In [5]:
pd.options.display.max_colwidth = 500

In [8]:
import gensim

In [12]:
df_train['pretext'] = df_train['text'].map(lambda x: x.lower())
data_train = df_train['pretext'].to_list()
docs = data_train.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(docs))

['usnavy', 'would', 'love', 'to', 'visit', 'space', 'eat', 'lunch', 'just', 'float', 'around', 'having', 'fun']


In [15]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [16]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to /home/fiok/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [21]:
import spacy

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1][0][:30])

['would', 'love', 'visit', 'space', 'eat', 'lunch', 'float', 'fun']


In [22]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]


In [23]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [24]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.050*"go" + 0.020*"get" + 0.013*"back" + 0.013*"come" + 0.011*"remember" + '
  '0.010*"turn" + 0.010*"need" + 0.009*"life" + 0.009*"think" + 0.009*"ready"'),
 (1,
  '0.017*"thank" + 0.014*"beautiful" + 0.012*"world" + 0.009*"video" + '
  '0.009*"forward" + 0.009*"may" + 0.009*"look" + 0.008*"ship" + '
  '0.008*"system" + 0.008*"issue"'),
 (2,
  '0.039*"good" + 0.024*"know" + 0.023*"great" + 0.023*"military" + '
  '0.012*"would" + 0.011*"make" + 0.010*"people" + 0.007*"force" + '
  '0.007*"give" + 0.007*"deploy"'),
 (3,
  '0.052*"family" + 0.022*"prayer" + 0.020*"peace" + 0.017*"may" + '
  '0.015*"rest" + 0.014*"year" + 0.014*"friend" + 0.013*"hero" + 0.012*"send" '
  '+ 0.012*"shipmate"'),
 (4,
  '0.023*"sailor" + 0.018*"ship" + 0.017*"watch" + 0.016*"great" + '
  '0.015*"always" + 0.009*"soldier" + 0.009*"name" + 0.008*"get" + '
  '0.008*"stand" + 0.008*"go"'),
 (5,
  '0.029*"ship" + 0.016*"love" + 0.015*"call" + 0.013*"live" + 0.009*"look" + '
  '0.009*"use" + 0.008*"usnavy"

In [25]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.4008081366158945


In [26]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [86]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 5
max_topics = 20
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
# alpha = list()
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
# beta = list()
# beta.append(0.01)
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.75), 
               corpus]

corpus_title = ['100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results480.csv', index=False)
    pbar.close()

100%|██████████| 450/450 [53:17<00:00,  7.11s/it]


In [33]:
pd.DataFrame(model_results).to_csv('./lda_tuning_results450.csv', index=False)

In [89]:
dfres = dt.fread('./lda_tuning_results450.csv').to_pandas()

In [90]:
dfres

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
0,100% Corpus,5,0.01,0.01,0.266828
1,100% Corpus,5,0.01,0.31,0.240547
2,100% Corpus,5,0.01,0.61,0.309654
3,100% Corpus,5,0.01,0.9099999999999999,0.277597
4,100% Corpus,5,0.01,symmetric,0.269938
...,...,...,...,...,...
445,100% Corpus,19,asymmetric,0.01,0.467869
446,100% Corpus,19,asymmetric,0.31,0.539758
447,100% Corpus,19,asymmetric,0.61,0.532616
448,100% Corpus,19,asymmetric,0.9099999999999999,0.530674


In [91]:
dfres.iloc[dfres['Coherence'].idxmax()]

Validation_Set           100% Corpus
Topics                             8
Alpha                     asymmetric
Beta              0.9099999999999999
Coherence                   0.578056
Name: 118, dtype: object

In [163]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha="asymmetric",
                                           eta=0.91)

In [164]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.5543341628085791


In [165]:
# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

LDAvis_prepared