In [1]:
!pip install pyldavis



In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy.cli
spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg")

import nltk
from nltk.corpus import stopwords
from matplotlib import pyplot as plt

import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from nltk.tokenize import RegexpTokenizer

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
tokenizer = RegexpTokenizer(r'\w+')

def preprocess(text):

  text = text.replace('{html}', "") # Remove weblinks
  text = text.lower()
  text = REPLACE_BY_SPACE_RE.sub(' ', text)
  text = BAD_SYMBOLS_RE.sub('', text)
  #text = ' '.join(word for word in text.split() if word not in stop_words)
  tokens = tokenizer.tokenize(text)
  cleanedText = " ".join(tokens)

  return cleanedText

In [5]:
df = pd.read_csv('/content/complaints.csv', on_bad_lines='skip')
df.dropna(subset = ['Consumer complaint narrative'], inplace = True)
df['case_ds'] = df['Consumer complaint narrative'].map(lambda x: preprocess(x))

In [6]:
print(df.shape)
df.head()

(9829, 19)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,case_ds
1,12/28/23,Credit card,General-purpose credit card or charge card,Problem when making payments,Problem during payment process,I recently made a payment agreement to America...,,AMERICAN EXPRESS COMPANY,IL,601XX,,Consent provided,Web,12/28/23,Closed with monetary relief,Yes,,8074502,i recently made a payment agreement to america...
2,11/05/23,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Information belongs to someone else,"Dear XXXX XXXX XXXX XXXX DISCOVER, I have rece...",,DISCOVER BANK,GA,30238,,Consent provided,Web,11/05/23,Closed with non-monetary relief,Yes,,7805442,dear xxxx xxxx xxxx xxxx discover i have recen...
3,12/30/23,Prepaid card,Gift card,Trouble using the card,Trouble using the card to spend money in a sto...,"Cards are declined everywhere ( e.g. online, b...",,AMERICAN EXPRESS COMPANY,NY,10065,,Consent provided,Web,12/30/23,Closed with monetary relief,Yes,,8082999,cards are declined everywhere eg online brick ...
4,12/16/23,Mortgage,Conventional home mortgage,Applying for a mortgage or refinancing an exis...,Application denials,XXXX XXXX XXXX I received a letter from Disco...,,DISCOVER BANK,MO,641XX,,Consent provided,Web,12/20/23,Closed with explanation,Yes,,8007891,xxxx xxxx xxxx i received a letter from discov...
6,02/07/23,"Credit reporting, credit repair services, or o...",Credit reporting,Credit monitoring or identity theft protection...,Billing dispute for services,Good day! I am writing concerning my experienc...,,DISCOVER BANK,MI,482XX,,Consent provided,Web,02/07/23,Closed with explanation,Yes,,6541351,good day i am writing concerning my experience...


In [7]:
data = df['case_ds'].values.tolist()

def sent_to_words(sentences):
  for sentence in sentences:
    yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
  return [[word for word in simple_preprocess(str(texts)) if word not in stop_words]]

def make_bigrams(texts):
  return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
  return [trigram_mod[doc] for doc in texts]

def lemmatization(texts, allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']):

  texts_out = []
  for sent in texts:
    doc = nlp(" ".join(sent))
    texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

  return texts_out


data_words = list(sent_to_words(data))
data_words_nosw = remove_stopwords(data_words)

bigram = gensim.models.Phrases(data_words_nosw, min_count=100, threshold=100)
trigram = gensim.models.Phrases(data_words_nosw, min_count=100, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

data_words_trigrams = make_trigrams(data_words_nosw)

#data_lemmatized = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [8]:
id2word = corpora.Dictionary(data_words_trigrams)

texts = data_words_trigrams
corpus = [id2word.doc2bow(text) for text in texts]

In [9]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=5,
                                            random_state=42,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.012*"xxxx" + 0.006*"xx" + 0.005*"credit" + 0.004*"account" + 0.004*"card" '
  '+ 0.003*"discover" + 0.002*"express" + 0.002*"american" + '
  '0.002*"information" + 0.002*"would"'),
 (1,
  '0.056*"xxxx" + 0.021*"xx" + 0.020*"credit" + 0.017*"account" + 0.015*"card" '
  '+ 0.012*"discover" + 0.008*"american" + 0.008*"amex" + 0.007*"express" + '
  '0.006*"consumer"'),
 (2,
  '0.081*"xxxx" + 0.027*"xx" + 0.018*"credit" + 0.016*"account" + 0.016*"card" '
  '+ 0.012*"discover" + 0.010*"express" + 0.010*"american" + '
  '0.006*"information" + 0.006*"consumer"'),
 (3,
  '0.007*"xxxx" + 0.003*"credit" + 0.003*"xx" + 0.002*"card" + 0.002*"account" '
  '+ 0.001*"express" + 0.001*"discover" + 0.001*"consumer" + 0.001*"american" '
  '+ 0.001*"amex"'),
 (4,
  '0.046*"xxxx" + 0.014*"card" + 0.014*"credit" + 0.013*"xx" + 0.011*"account" '
  '+ 0.011*"discover" + 0.009*"express" + 0.008*"american" + 0.005*"would" + '
  '0.004*"information"')]


In [10]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_trigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.9403435429388995

Coherence Score:  0.28767965577565296


In [11]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [12]:
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):

  coherence_values=[]
  model_list=[]

  for num_topics in range(start, limit, step):
    model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=5,
                                            random_state=42,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

    model_list.append(model)
    coherencemodel = CoherenceModel(model=model, texts = texts, dictionary = dictionary, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

  return model_list, coherence_values

In [13]:
start = 2
limit = 20
step = 1
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, start=start, limit=limit, step=step)

KeyboardInterrupt: 

In [None]:
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel('Num Topics')
plt.ylabel('Coherence Score')
plt.legend('CV', loc='best')
plt.show()

In [None]:
num_topics = int(input("Num Topics:"))

optimal_model = model_list[num_topics-2]
model_topics=optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
def format_topics_sentences(ldamodel, corpus, texts):
  sent_topics_df = pd.DataFrame()

  for i, row_list in enumerate(ldamodel[corpus]):
    row = row_list[0] if ldamodel.per_word_topics else row_list
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    for j, (topic_num, prop_topic) in enumerate(row):
      if j == 0:
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]), ignore_index=True)
      else:
        break

  sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
  contents = pd.Series(texts)

  sent_topics_df = pd.concat([sent_topics_df, contents], axis =1)

  return sent_topics_df

In [None]:
df_topics_sents_keywords = format_topics_sentences(optimal_model, corpus, data)

df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document No.', 'Cluster', 'Cluster Score', 'Keywords', 'Orig Text']
df_dominant_topic.head(10)

In [None]:
topic_counts = df_dominant_topic['Cluster No.'].value_counts()

topic_contribution = round(topic_counts/topic_counts.sum(),4)
print('Topic No', '\t', 'Document Cnt', '\t', 'Document Perc')
for i in range(len(topic_counts)):
  print(topic_counts.index[i], '\t', '\t', topic_counts[i], '\t', '\t', topic_contribution[i]*100)

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func = lambda *arg, **kwargs:cols[i],
                  prefer_horizontal=1.0)

topics = optimal_model.show_topics(formatted=False)

fig, axes = plt.subplots(5,5, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
  fig.add_subplot(ax)
  topic_words = dict(topics[i][1])
  cloud.generate_from_frequencies(topic_words, max_font_size=300)
  plt.gca().imshow(cloud)
  plt.gca().set_title('Topic ' +str(i), fontdict=dict(size=16))
  plt.gca().axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis=('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook

topic_weights = []

for i, row_list in enumerate(optimal_model[corpus]):
  topic_weights.append([w for i, w in row_list[0]])

arr = pd.DataFrame(topic_weights).fillna(0).values
arr = arr[np.amax(arr, axis=1) > 0.35]
topic_num = np.argmax(arr, axis=1)

tsne_model = TSNE(n_components=2, verbose=1, random_state=42, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

output_notebook()
n_topics = num_topics
mycolors = np.array([color for name, coor in mcolors.TABLEAU_COLORS.items()])

plot = figure(title='t-SNE Clustering of {} LDA Topics'.format(n_topics), plot_width=900, plot_height=700)

plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], colors= mycolors[topic_num])
show(plot)