# Import Libraries

In [32]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.models import CoherenceModel

# import spacy
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

Notebook made using this video:  
https://www.youtube.com/watch?v=TKjjlp5_r7o

Another one to watch later:  
https://www.youtube.com/watch?v=UEn3xHNBXJU

# Read and process data

In [2]:
# read datafile
df = pd.read_csv('../project_data/complaints_processed.csv')

# clean up by removing NaN's, etc.
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df = df.dropna()
df.drop(df.loc[df['narrative'] == 'name'].index, inplace=True)

# Trying out on portion of dataframe

In [3]:
# just getting the texts from the first 1000 lines
partial_df = df['narrative'][0:1000]

In [4]:
partial_df.head()

0    purchase order day shipping amount receive pro...
1    forwarded message date tue subject please inve...
2    forwarded message cc sent friday pdt subject f...
3    payment history missing credit report speciali...
4    payment history missing credit report made mis...
Name: narrative, dtype: object

Testing to split strings. Gensim requires bag of words data (list of lists of individual strings)

In [5]:
partial_df[0].split()[0:5]

['purchase', 'order', 'day', 'shipping', 'amount']

Make the datataframe into a list of individual words

In [6]:
data_words = partial_df.tolist()

for i in range(len(data_words)):
    data_words[i] = data_words[i].split()

In [7]:
id2word = corpora.Dictionary(data_words)

In [8]:
print(id2word) 

Dictionary(5396 unique tokens: ['accordance', 'address', 'adjustment', 'adjustmentmerchandiserobert', 'agreed']...)


`id2word` is a dictionary with each token given an id.

`id2word.token2id` -->

`{'accordance': 0,
 'address': 1,
 'adjustment': 2,
 'adjustmentmerchandiserobert': 3,
 'agreed': 4,
 'although': 5, ...}`

In [10]:
id2word.doc2bow(['address', 'adjustment', 'cancel'])

[(1, 1), (2, 1), (16, 1)]

`doc2bow` creates a tuple of (token_id, token_count). Here I'm making a list of these tuples for the gensim

In [11]:
corpus = []
for text in data_words:
    new = id2word.doc2bow(text)
    corpus.append(new)

In [12]:
# the first five tuples of the first complaint
corpus[0][0:5]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]

In [13]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [14]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds', n_jobs=1)

In [15]:
pyLDAvis.display(vis)

## Scoring the model

In [21]:
# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', round(coherence_lda, 3))


Coherence Score:  0.511


## Try with different params

In [30]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [23]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds', n_jobs=1)

In [24]:
pyLDAvis.display(vis)

In [25]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', round(coherence_lda, 3))


Coherence Score:  0.453


So more topics in this case yields a lower coherence score

## Trying with GridSearch

In [40]:
lda_pipeline = Pipeline([('lda_model', gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                                                        id2word=id2word,
#                                                                        random_state=100,
#                                                                        update_every=1,
#                                                                        chunksize=100,
#                                                                        passes=10,
#                                                                        alpha="auto"
                                                                      )
                         ), 
                         ('coherence_model_lda', CoherenceModel(model=lda_model, 
                                                                texts=data_words, 
                                                                dictionary=id2word, 
                                                                coherence='c_v'))])

lda_param_grid = {
    'lda_model__num_topics': [3, 5, 7, 10]
}

lda_grid_search = GridSearchCV(estimator=lda_pipeline,
                               param_grid=lsa_param_grid,
                               scoring='coherence_model_lda.get_coherence()',
                               cv=3)

lda_grid_search.estimator.fit(id2word)

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'LdaModel(num_terms=5396, num_topics=100, decay=0.5, chunksize=2000)' (type <class 'gensim.models.ldamodel.LdaModel'>) doesn't

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [None]:
dt3_pipeline = Pipeline([('tfidf', TfidfVectorizer()), 
                         ('dt', DecisionTreeClassifier(random_state=123))])

dt3_param_grid = {
    'tfidf__max_features': [7000, 10000, 12500, 16000], 
    'dt__max_depth': [6]
}

dt3_grid_search = GridSearchCV(estimator=dt3_pipeline,
                               param_grid=dt3_param_grid,
                               scoring='recall_macro',
                               cv=3)

# Using whole dataframe

Processing dataframe into list of lists of words

In [None]:
data_words = df['narrative'].tolist()

for i in range(len(data_words)):
    data_words[i] = data_words[i].split()

Prepping the data for `lda_model`

In [None]:
id2word = corpora.Dictionary(data_words)

corpus = []
for text in data_words:
    new = id2word.doc2bow(text)
    corpus.append(new)

In [None]:
# from gensim.models import TfidfModel

# tfidf = TfidfModel(corpus, id2word=id2word)

# low_value = 0.03
# low_value_words = []

# for bow in corpus:
#     low_value_words += [id for id, value in tfidf[bow] if value < low_value]
    
# id2word.filter_tokens(bad_ids=low_value_words)

# # not sure what this does exactly, but I had an index out of bounds error when I ran the model. 
# # Googling around, I found this might help:
# id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


Creating the model

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=5,
                                           alpha="auto")

Visualizing the model

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')

In [None]:
vis

Later on, you may want to try TFIDF

In [None]:
# from gensim.models import TfidfModel

# tfidf = TfidfModel(corpus, id2word=id2word)

# low_value = 0.03

# words = []
# words_missing_in_tfidf = []

In [None]:
# vectorizer = TfidfVectorizer(ngram_range=(1,1))
# X = vectorizer.fit_transform(partial_df['narrative'])
# feature_names = vectorizer.get_feature_names()