In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import time

from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import RegexpTokenizer # Tokenizing

import warnings

warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", message = "Precision")

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
tokenizer = RegexpTokenizer(r'\w+')

def preprocess(text):

  text = text.replace('{html}', "") # Remove weblinks
  text = text.lower()
  text = REPLACE_BY_SPACE_RE.sub(' ', text)
  text = BAD_SYMBOLS_RE.sub('', text)
  text = ' '.join(word for word in text.split() if word not in stop_words)
  tokens = tokenizer.tokenize(text)
  cleanedText = " ".join(tokens)

  return cleanedText

In [6]:
df = pd.read_csv('/content/complaints-2024-05-19_04_43.csv')
print(df.shape)
df.dropna(subset = ['Consumer complaint narrative'], inplace = True)
print(df.shape)
df.head()

(9266, 18)
(5508, 18)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
1,05/16/23,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,Caine & Weiner continuously reports to all cre...,,"CAINE & WEINER COMPANY, INC.",FL,33026,,Consent provided,Web,05/16/23,Closed with explanation,Yes,,6986706.0
2,10/15/22,Checking or savings account,Checking account,Problem with a lender or other company chargin...,Transaction was not authorized,"My Truist, then BB & T accounts were hacked in...",Company has responded to the consumer and the ...,TRUIST FINANCIAL CORPORATION,WV,24740,Servicemember,Consent provided,Web,10/15/22,Closed with monetary relief,Yes,,6088689.0
3,05/04/23,Debt collection,Other debt,Attempts to collect debt not owed,Debt was result of identity theft,I am a victim of identity theft. It was brough...,,CONTRACT CALLERS INC,IL,60640,,Consent provided,Web,05/04/23,Closed with explanation,Yes,,6925263.0
4,05/16/23,"Money transfer, virtual currency, or money ser...",Mobile or digital wallet,"Managing, opening, or closing your mobile wall...",,"Dear Sir/Madam, I hope this letter finds you w...",,"Paypal Holdings, Inc",CA,95670,,Consent provided,Web,05/16/23,Closed with explanation,Yes,,6986076.0
5,12/19/23,Student loan,Federal student loan servicing,Dealing with your lender or servicer,Problem with customer service,I have been attempting to contact NelNet for s...,,"Nelnet, Inc.",WI,XXXXX,,Consent provided,Web,12/19/23,Closed with explanation,Yes,,8023622.0


In [7]:
df['input'] = df['Consumer complaint narrative'].map(lambda x: preprocess(x))

##Understanding Doc2Vec:
###https://www.geeksforgeeks.org/doc2vec-in-nlp/

In [8]:
train, test = train_test_split(df.reset_index(drop = True), test_size = .15, random_state = 43)

#Doc2Vec model takes 'tagged_documents'

#Tag training data
tagged_tr = [TaggedDocument(words=tokenizer.tokenize(_d.lower()),\
                            tags = [str(i)]) for i, _d in enumerate(train.input)]


#Tag testing data
tagged_test = [TaggedDocument(words=tokenizer.tokenize(_d.lower()),\
                            tags = [str(i)]) for i, _d in enumerate(test.input)]

In [12]:
#Instantiate the model

model = Doc2Vec(vector_size=100,
                window=8,
                alpha=0.025, #Initial learning rate
                min_alpha =0.00025, #Learning rate drops linearly to this
                min_count=2, #Ignore all words with total frequency lower than this
                dm=1, #algorithm 1=distributed memory
                workers=16)

model.build_vocab(tagged_tr)

In [14]:
max_epochs = 50

t1 = time.time()
for epoch in range(max_epochs):
  print('iteration {0}'.format(epoch+1))
  model.train(tagged_tr,
              total_examples=model.corpus_count,
              epochs=model.epochs)

  #decreasing learning rate
  model.alpha -= 0.0002

  #fix the learning rate, no decay
  model.min_alpha = model.alpha

t2 = time.time()

print("Time: {}".format(t2-t1))

iteration 1




iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22


KeyboardInterrupt: 

In [None]:
X_train = np.array([model.doc2vecs[str(i)] for i in range(len(tagged_tr))])
y_train = train.Product

X_test = np.array([model.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))])
y_test = test.Product

In [None]:
def heatconmat(y_true, y_pred):
  sns.set_context('talk')
  sns.heatmap(confusion_matrix(y_true, y_pred),
              annot=True,
              fmt='d',
              cbar=False,
              cmap='gist_earth_r'
              yticklabels=sorted(y_test.unique()))

  plt.show()
  print(classification_report(y_true, y_pred))

In [None]:
logReg = LogisticRegression(C=5, multi_class='multinomial', solver='saga',max_iter=1000)
logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_test)
heatconmat(y_test, y_pred)