# Import Libraries

In [3]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.models import CoherenceModel

# import spacy
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim

Notebook made using this video:  
https://www.youtube.com/watch?v=TKjjlp5_r7o

Another one to watch later:  
https://www.youtube.com/watch?v=UEn3xHNBXJU

# Read and process data

In [4]:
# Load the processed complaints, discard the leftover 'Unnamed: 0' column,
# and remove every row that contains a NaN.
df = (
    pd.read_csv('../project_data/complaints_processed.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna()
)

# Remove rows whose narrative is just the literal string 'name'.
# The original row labels are kept (the index is not reset).
df = df[df['narrative'] != 'name']

# Trying out on a portion of the dataframe

In [5]:
# Work on a small sample first: the narrative text of the first 1000 rows
# (selected by position).
partial_df = df['narrative'].iloc[:1000]

In [6]:
# Preview the first few narratives of the sample.
partial_df.head()

0    purchase order day shipping amount receive pro...
1    forwarded message date tue subject please inve...
2    forwarded message cc sent friday pdt subject f...
3    payment history missing credit report speciali...
4    payment history missing credit report made mis...
Name: narrative, dtype: object

Test splitting a narrative into words. Gensim expects tokenized documents (a list of lists of individual word strings) as the input for building its bag-of-words corpus.

In [7]:
# Peek at the first five words of the first narrative in the sample.
# .iloc[0] selects by position, so this keeps working even if the row
# labelled 0 was removed during cleaning (the index is never reset there);
# a plain [0] is a label lookup and would raise KeyError in that case.
partial_df.iloc[0].split()[:5]

['purchase', 'order', 'day', 'shipping', 'amount']

Turn the dataframe column into a list of lists of individual words (one inner list per complaint)

In [8]:
# Tokenize every narrative in the sample on whitespace, producing one list
# of words per complaint.
data_words = [narrative.split() for narrative in partial_df.tolist()]

In [9]:
# Build the gensim Dictionary from the tokenized narratives; it gives each
# unique token an integer id.
id2word = corpora.Dictionary(data_words)

In [10]:
# Print the dictionary summary (number of unique tokens and a few examples).
print(id2word) 

Dictionary(5396 unique tokens: ['accordance', 'address', 'adjustment', 'adjustmentmerchandiserobert', 'agreed']...)


`id2word` is a dictionary with each token given an id.

`id2word.token2id` -->

`{'accordance': 0,
 'address': 1,
 'adjustment': 2,
 'adjustmentmerchandiserobert': 3,
 'agreed': 4,
 'although': 5, ...}`

In [11]:
# Try doc2bow on a small hand-picked token list to see the
# (token_id, token_count) pairs it returns.
id2word.doc2bow(['address', 'adjustment', 'cancel'])

[(1, 1), (2, 1), (16, 1)]

`doc2bow` converts a list of tokens into a list of `(token_id, token_count)` tuples. Here I'm building a list of these bag-of-words lists, one per complaint, to use as the gensim corpus.

In [12]:
# Bag-of-words corpus: one list of (token_id, token_count) tuples per complaint.
corpus = [id2word.doc2bow(tokens) for tokens in data_words]

In [13]:
# Show the first five (token_id, token_count) tuples of the first complaint.
corpus[0][:5]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]

In [14]:
# Fit an LDA model with 5 topics on the 1000-narrative sample.
# random_state is fixed so repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha="auto",
    random_state=100,
)

In [15]:
# Switch pyLDAvis to notebook output, then prepare the visualization data
# for the current model, corpus and dictionary.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds', n_jobs=1)

In [16]:
# Render the prepared topic visualization in the cell output.
pyLDAvis.display(vis)

## Scoring the model

In [17]:
# Score the 5-topic sample model with the c_v coherence measure, using the
# tokenized narratives and the dictionary the model was trained with.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    coherence='c_v',
    dictionary=id2word,
    texts=data_words,
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', round(coherence_lda, 3))


Coherence Score:  0.511


## Try with different params

In [18]:
# Refit on the same sample with 10 topics; every other setting matches the
# 5-topic run. Note that this overwrites the previous `lda_model`.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha="auto",
    random_state=100,
)

In [19]:
# Prepare the visualization data for the 10-topic sample model.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds', n_jobs=1)

In [20]:
# Render the 10-topic visualization in the cell output.
pyLDAvis.display(vis)

In [21]:
# c_v coherence for the 10-topic sample model (overwrites the earlier score).
coherence_model_lda = CoherenceModel(
    model=lda_model,
    coherence='c_v',
    dictionary=id2word,
    texts=data_words,
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', round(coherence_lda, 3))


Coherence Score:  0.453


So, in this case, more topics yield a lower coherence score (0.453 with 10 topics vs. 0.511 with 5).

# Using whole dataframe

Processing dataframe into list of lists of words

In [22]:
# Tokenize every narrative in the full dataframe on whitespace
# (one list of words per complaint). Replaces the sample-based `data_words`.
data_words = [narrative.split() for narrative in df['narrative'].tolist()]

Prepping the data for `lda_model`

In [23]:
# Rebuild the token dictionary for the full dataset ...
id2word = corpora.Dictionary(data_words)

# ... and the matching bag-of-words corpus, one entry per complaint.
corpus = [id2word.doc2bow(tokens) for tokens in data_words]

## Creating the model with 5 topics

In [24]:
# Fit a 5-topic LDA model on the full corpus.
# Compared with the sample runs: passes is 5 instead of 10, and no
# chunksize is passed here.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    passes=5,
    update_every=1,
    alpha="auto",
    random_state=100,
)

Visualizing the model

In [25]:
# Prepare the visualization data for the 5-topic full-data model.
# Unlike the sample runs, n_jobs is not passed here.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))


In [26]:
# Render the 5-topic full-data visualization in the cell output.
pyLDAvis.display(vis)

In [27]:
# Export the 5-topic visualization to an HTML file.
# NOTE(review): presumably the 'exported_images' directory must already exist — confirm.
pyLDAvis.save_html(vis, 'exported_images/topics_five.html')

In [28]:
# c_v coherence for the 5-topic full-data model.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    coherence='c_v',
    dictionary=id2word,
    texts=data_words,
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', round(coherence_lda, 3))


Coherence Score:  0.54


## Creating the model with 6 topics

In [29]:
# Fit a 6-topic LDA model on the full corpus; apart from num_topics the
# settings match the 5-topic full-data run. Overwrites `lda_model`.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=6,
    passes=5,
    update_every=1,
    alpha="auto",
    random_state=100,
)

In [30]:
# Prepare and render the visualization for the 6-topic full-data model.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')
pyLDAvis.display(vis)

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))


In [31]:
# Export the 6-topic visualization to an HTML file.
# NOTE(review): presumably the 'exported_images' directory must already exist — confirm.
pyLDAvis.save_html(vis, 'exported_images/topics_six.html')

In [36]:
# Build the c_v coherence model for the 6-topic full-data model.
# Only the model object is created here; get_coherence() is called in the
# following cell.
coherence_model_lda_sixtopics = CoherenceModel(
    model=lda_model,
    coherence='c_v',
    dictionary=id2word,
    texts=data_words,
)

In [37]:
# Compute and print the c_v coherence of the 6-topic model.
# `coherence_lda` now holds the 6-topic score (the 5-topic value is overwritten).
coherence_lda = coherence_model_lda_sixtopics.get_coherence()
print('\nCoherence Score: ', round(coherence_lda, 3))


Coherence Score:  0.553


### Make a dataframe for scoring

In [39]:
# Start the score table with the 5-topic result. The 0.54 is transcribed
# from the coherence score printed for the 5-topic full-data model, because
# `coherence_lda` has since been overwritten by the 6-topic run.
scoring_df = pd.DataFrame([{'num_topics': 5, 'coherence_score': 0.54}])

In [40]:
# Add new row to scoring_df
scoring_df.loc[len(scoring_df.index)] = [6, round(coherence_lda, 3)] 
scoring_df

Unnamed: 0,num_topics,coherence_score
0,5.0,0.54
1,6.0,0.553


## Creating the model with 7 topics

In [None]:
# Fit a 7-topic LDA model on the full corpus; apart from num_topics the
# settings match the 5- and 6-topic full-data runs. Overwrites `lda_model`.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=7,
    passes=5,
    update_every=1,
    alpha="auto",
    random_state=100,
)

In [None]:
# Prepare and render the visualization for the 7-topic full-data model.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')
pyLDAvis.display(vis)

In [None]:
# Export the 7-topic visualization to an HTML file.
# NOTE(review): presumably the 'exported_images' directory must already exist — confirm.
pyLDAvis.save_html(vis, 'exported_images/topics_seven.html')

In [None]:
# c_v coherence for the 7-topic full-data model; `coherence_lda` is
# overwritten with the 7-topic score.
coherence_model_lda_seventopics = CoherenceModel(
    model=lda_model,
    coherence='c_v',
    dictionary=id2word,
    texts=data_words,
)
coherence_lda = coherence_model_lda_seventopics.get_coherence()
print('\nCoherence Score: ', round(coherence_lda, 3))

In [None]:
# Add new row to scoring_df
scoring_df.loc[len(scoring_df.index)] = [7, round(coherence_lda, 3)] 
scoring_df