In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from collections import Counter
from scipy import sparse
from scipy.sparse.linalg import svds
import gensim
from imblearn.over_sampling import RandomOverSampler

In [2]:
complaints = pd.read_csv('../data/complaints.csv')

In [3]:
# # Replacing the X's with an empty space.
# for i in range(0, len(complaints)):
#     complaints.loc[i, 'Consumer complaint narrative'] = (
#         re.sub('X{2,}', '', complaints.loc[i, 'Consumer complaint narrative'])
#     )

In [None]:
complaints['Consumer complaint narrative'] = complaints['Consumer complaint narrative'].str.replace('X{2,}', '', regex = True)

In [4]:
# Creating the Train/Test spilt stratifying by Issue.
X = complaints[['Consumer complaint narrative']]
y = complaints['Issue']

xTrain, xTest, yTrain, yTest = train_test_split(X, y, random_state = 777, stratify = y)

**Looking at a Count Vectorizer**

In [5]:
# Fitting the count vectorizer to training data then transforming both the training and test data.
vect = CountVectorizer()

XTrainVec = vect.fit_transform(xTrain['Consumer complaint narrative'])
XTestVec = vect.transform(xTest['Consumer complaint narrative'])

In [6]:
# Checking the results of a logistic regression model.
logreg = LogisticRegression(max_iter = 1000).fit(XTrainVec, yTrain)

yPred = logreg.predict(XTestVec)

print('Accuracy Score: ', accuracy_score(yTest, yPred))

print('Confusion Matrix: ', confusion_matrix(yTest, yPred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score:  0.8713868580094615
Confusion Matrix:  [[12223   788   118  5071    91]
 [  933  3987    24   336    31]
 [  158    29  2630   251    19]
 [ 2683   105   102 54258   178]
 [   92    54    22   279  3896]]


**Looking into dealing with class imbalance**

In [7]:
oversample = RandomOverSampler()

xTrainOver, yTrainOver = oversample.fit_resample(
    xTrain, 
    yTrain
)

**Count Vectorizer into a Logistic Regression**

In [8]:
# Fitting the count vectorizer to training data then transforming both the training and test data.
vect = CountVectorizer()

XTrainVec = vect.fit_transform(xTrainOver['Consumer complaint narrative'])
XTestVec = vect.transform(xTest['Consumer complaint narrative'])

In [9]:
# Checking the results of a logistic regression model.
logreg = LogisticRegression(max_iter = 1000).fit(XTrainVec, yTrainOver)

yPred = logreg.predict(XTestVec)

print('Accuracy Score: ', accuracy_score(yTest, yPred))

print('Confusion Matrix: ', confusion_matrix(yTest, yPred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score:  0.8465107856673986
Confusion Matrix:  [[13979  1390   234  2561   127]
 [  780  4317    46   120    48]
 [  166    47  2710   126    38]
 [ 6081   571   461 49829   384]
 [   99    72    35   176  3961]]


**Linear SVC**

In [5]:
from sklearn.svm import LinearSVC

In [12]:
# Checking the results of a linear svc model.
svc = LinearSVC(max_iter = 1000).fit(XTrainVec, yTrainOver)

yPred = svc.predict(XTestVec)

print('Accuracy Score: ', accuracy_score(yTest, yPred))

print('Confusion Matrix: ', confusion_matrix(yTest, yPred))



Accuracy Score:  0.8463410217524163
Confusion Matrix:  [[13762  1359   230  2803   137]
 [  833  4177    57   201    43]
 [  181    64  2620   186    36]
 [ 5726   630   332 50317   321]
 [  115    81    33   209  3905]]


**Trying to add class weights instead of oversamplng and switching to tfidf**

In [170]:
# Creating the Train/Test spilt stratifying by Issue.
X = complaints[['Consumer complaint narrative']]
y = complaints['Issue']

xTrain, xTest, yTrain, yTest = train_test_split(X, y, random_state = 777, stratify = y)

In [161]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [171]:
# Fitting the tfidf vectorizer to training data then transforming both the training and test data.
vect = TfidfVectorizer(stop_words = 'english', ngram_range = (1,2))

xTrainVec = vect.fit_transform(xTrain['Consumer complaint narrative'])
xTestVec = vect.transform(xTest['Consumer complaint narrative'])

**Logistic regression**

In [124]:
# Checking the results of a logistic regression model.
logreg = LogisticRegression(max_iter = 1000, class_weight = 'balanced').fit(xTrainVec, yTrain)

yPred = logreg.predict(xTestVec)

print('Accuracy Score: ', accuracy_score(yTest, yPred))

print('Confusion Matrix: ', confusion_matrix(yTest, yPred))

KeyboardInterrupt: 

**Linear SVC**

In [172]:
# Checking the results of a linear svc model.
svc = LinearSVC(max_iter = 1000, class_weight = 'balanced').fit(xTrainVec, yTrain)

yPred = svc.predict(xTestVec)

print('Accuracy Score: ', accuracy_score(yTest, yPred))

print('Confusion Matrix: ', confusion_matrix(yTest, yPred))

Accuracy Score:  0.9033590619977817
Confusion Matrix:  [[14548   732   145  2751   115]
 [  832  4256    24   174    25]
 [  110    19  2791   155    12]
 [ 2862    93    79 54066   226]
 [   41    36    13    95  4158]]


In [174]:
xTrain['actual'] = yTrain

xTrain_2 = xTrain.loc[xTrain['actual'].isin(['Incorrect information on your report', 'Attempts to collect debt not owed'])]

In [176]:
xTrainVec_2 = vect.transform(xTrain_2['Consumer complaint narrative'])

In [177]:
xTest['prediciton'] = yPred

xTest_2 = xTest.loc[xTest['prediciton'].isin(['Incorrect information on your report', 'Attempts to collect debt not owed'])]

yTest_2 = yTest.loc[xTest['prediciton'].isin(['Incorrect information on your report', 'Attempts to collect debt not owed'])]

xTestVec_2 = vect.transform(xTest_2['Consumer complaint narrative'])

In [179]:
# Checking the results of a linear svc model.
svc = LinearSVC(max_iter = 1000, class_weight = 'balanced').fit(xTrainVec_2, xTrain_2['actual'])

yPred = svc.predict(xTestVec_2)

print('Accuracy Score: ', accuracy_score(yTest_2, yPred))

print('Confusion Matrix: ', confusion_matrix(yTest_2, yPred))

Accuracy Score:  0.9043287410423884
Confusion Matrix:  [[14856     0     0  2443     0]
 [  854     0     0   152     0]
 [  116     0     0   149     0]
 [ 3386     0     0 53542     0]
 [   40     0     0    96     0]]


In [151]:
yPred = svc.predict(xTrainVec)

print('Accuracy Score: ', accuracy_score(yTrain, yPred))

print('Confusion Matrix: ', confusion_matrix(yTrain, yPred))

Accuracy Score:  0.9858982774621426
Confusion Matrix:  [[ 53645    106     17   1092     12]
 [    24  15896      0     12      0]
 [     0      0   9249     11      0]
 [  2303     56     28 169519     73]
 [     2      0      0      2  13027]]


In [152]:
xTrain['prediciton'] = yPred
xTrain['actual'] = yTrain

In [153]:
incorrect_info_wrong = xTrain.loc[
    (xTrain['actual'] == 'Incorrect information on your report') & 
    (xTrain['prediciton'] == 'Attempts to collect debt not owed')
    ]

attemp_collect_debt_wrong = xTrain.loc[
    (xTrain['actual'] == 'Attempts to collect debt not owed') & 
    (xTrain['prediciton'] == 'Incorrect information on your report')
    ]

incorrect_info = xTrain.loc[
    (xTrain['actual'] == 'Incorrect information on your report') & 
    (xTrain['prediciton'] == 'Incorrect information on your report')
    ]

attemp_collect_debt = xTrain.loc[
    (xTrain['actual'] == 'Attempts to collect debt not owed') & 
    (xTrain['prediciton'] == 'Attempts to collect debt not owed')
    ]

In [154]:
from sklearn.feature_extraction.text import CountVectorizer

# select the issues with shared words and combine into a single corpus
corpus = incorrect_info_wrong['Consumer complaint narrative'].tolist() + attemp_collect_debt['Consumer complaint narrative'].tolist()

vectorizer = CountVectorizer(stop_words='english')

word_frequency_dict = {}

# process the corpus in chunks to calculate word frequencies incrementally
chunk_size = 1000
num_samples = len(corpus)

for i in range(0, num_samples, chunk_size):
    chunk = corpus[i:i+chunk_size]
    X = vectorizer.fit_transform(chunk)
    feature_names = vectorizer.get_feature_names_out()
    word_frequencies = X.toarray().sum(axis=0)
    
    # update the word frequency dictionary
    for word, frequency in zip(feature_names, word_frequencies):
        word_frequency_dict[word] = word_frequency_dict.get(word, 0) + frequency

# sort the dictionary by frequency in descending order
sorted_words = sorted(word_frequency_dict.items(), key=lambda x: x[1], reverse=True)

top_words = sorted_words[:200]
remove_words_attempt_collect_debt = []
for word, frequency in top_words:
    remove_words_attempt_collect_debt.append(word)

In [155]:
# select the issues with shared words and combine into a single corpus
corpus = incorrect_info_wrong['Consumer complaint narrative'].tolist() + incorrect_info['Consumer complaint narrative'].tolist()

vectorizer = CountVectorizer(stop_words='english')

word_frequency_dict = {}

# process the corpus in chunks to calculate word frequencies incrementally
chunk_size = 1000
num_samples = len(corpus)

for i in range(0, num_samples, chunk_size):
    chunk = corpus[i:i+chunk_size]
    X = vectorizer.fit_transform(chunk)
    feature_names = vectorizer.get_feature_names_out()
    word_frequencies = X.toarray().sum(axis=0)
    
    # update the word frequency dictionary
    for word, frequency in zip(feature_names, word_frequencies):
        word_frequency_dict[word] = word_frequency_dict.get(word, 0) + frequency

# sort the dictionary by frequency in descending order
sorted_words = sorted(word_frequency_dict.items(), key=lambda x: x[1], reverse=True)

top_words = sorted_words[:]
words_incorrect_info = []
for word, frequency in top_words:
    words_incorrect_info.append(word)

In [156]:
remove_words_attempt_collect_debt = [word for word in remove_words_attempt_collect_debt if word not in words_incorrect_info]

In [157]:
len(remove_words_attempt_collect_debt)

0

In [158]:
from sklearn.feature_extraction.text import CountVectorizer

# select the issues with shared words and combine into a single corpus
corpus = attemp_collect_debt_wrong['Consumer complaint narrative'].tolist() + incorrect_info['Consumer complaint narrative'].tolist()

vectorizer = CountVectorizer(stop_words='english')

word_frequency_dict = {}

# process the corpus in chunks to calculate word frequencies incrementally
chunk_size = 1000
num_samples = len(corpus)

for i in range(0, num_samples, chunk_size):
    chunk = corpus[i:i+chunk_size]
    X = vectorizer.fit_transform(chunk)
    feature_names = vectorizer.get_feature_names_out()
    word_frequencies = X.toarray().sum(axis=0)
    
    # update the word frequency dictionary
    for word, frequency in zip(feature_names, word_frequencies):
        word_frequency_dict[word] = word_frequency_dict.get(word, 0) + frequency

# sort the dictionary by frequency in descending order
sorted_words = sorted(word_frequency_dict.items(), key=lambda x: x[1], reverse=True)

top_words = sorted_words[:200]
remove_incorrect_info = []
for word, frequency in top_words:
    remove_incorrect_info.append(word)

In [159]:
# select the issues with shared words and combine into a single corpus
corpus = attemp_collect_debt_wrong['Consumer complaint narrative'].tolist() + attemp_collect_debt['Consumer complaint narrative'].tolist()

vectorizer = CountVectorizer(stop_words='english')

word_frequency_dict = {}

# process the corpus in chunks to calculate word frequencies incrementally
chunk_size = 1000
num_samples = len(corpus)

for i in range(0, num_samples, chunk_size):
    chunk = corpus[i:i+chunk_size]
    X = vectorizer.fit_transform(chunk)
    feature_names = vectorizer.get_feature_names_out()
    word_frequencies = X.toarray().sum(axis=0)
    
    # update the word frequency dictionary
    for word, frequency in zip(feature_names, word_frequencies):
        word_frequency_dict[word] = word_frequency_dict.get(word, 0) + frequency

# sort the dictionary by frequency in descending order
sorted_words = sorted(word_frequency_dict.items(), key=lambda x: x[1], reverse=True)

top_words = sorted_words[:]
words_attempt_to_collect_dedt = []
for word, frequency in top_words:
    words_attempt_to_collect_dedt.append(word)

In [None]:
remove_incorrect_info = [word for word in remove_incorrect_info if word not in words_attempt_to_collect_dedt]

In [None]:
len(remove_incorrect_info)

33

In [None]:
# filter the words
mask = xTrain['prediciton'] == 'Incorrect information on your report'
xTrain.loc[mask, 'Consumer complaint narrative'] = xTrain.loc[mask, 'Consumer complaint narrative'].apply(
    lambda text: ' '.join([word for word in text.split() if word not in remove_words_attempt_collect_debt])
)

In [None]:
# filter the words
mask = xTrain['prediciton'] == 'Attempts to collect debt not owed'
xTrain.loc[mask, 'Consumer complaint narrative'] = xTrain.loc[mask, 'Consumer complaint narrative'].apply(
    lambda text: ' '.join([word for word in text.split() if word not in remove_incorrect_info])
)

In [None]:
XTrainVec = vect.transform(xTrain['Consumer complaint narrative'])

In [None]:
yPred = svc.predict(xTrainVec)

print('Accuracy Score: ', accuracy_score(yTrain, yPred))

print('Confusion Matrix: ', confusion_matrix(yTrain, yPred))

Accuracy Score:  0.9858982774621426
Confusion Matrix:  [[ 53645    106     17   1092     12]
 [    24  15896      0     12      0]
 [     0      0   9249     11      0]
 [  2303     56     28 169519     73]
 [     2      0      0      2  13027]]


In [None]:
yPred = svc.predict(xTestVec)

print('Accuracy Score: ', accuracy_score(yTest, yPred))

print('Confusion Matrix: ', confusion_matrix(yTest, yPred))

Accuracy Score:  0.9033590619977817
Confusion Matrix:  [[14548   732   145  2751   115]
 [  832  4256    24   174    25]
 [  110    19  2791   155    12]
 [ 2862    93    79 54066   226]
 [   41    36    13    95  4158]]


In [None]:
xTest['prediciton'] = yPred

In [None]:
# filter the words
mask = xTest['prediciton'] == 'Incorrect information on your report'
xTest.loc[mask, 'Consumer complaint narrative'] = xTest.loc[mask, 'Consumer complaint narrative'].apply(
    lambda text: ' '.join([word for word in text.split() if word not in remove_words_attempt_collect_debt])
)

In [None]:
# filter the words
mask = xTest['prediciton'] == 'Attempts to collect debt not owed'
xTest.loc[mask, 'Consumer complaint narrative'] = xTest.loc[mask, 'Consumer complaint narrative'].apply(
    lambda text: ' '.join([word for word in text.split() if word not in remove_incorrect_info])
)

In [None]:
xTestVec = vect.transform(xTest['Consumer complaint narrative'])

In [None]:
yPred = svc.predict(xTestVec)

print('Accuracy Score: ', accuracy_score(yTest, yPred))

print('Confusion Matrix: ', confusion_matrix(yTest, yPred))

Accuracy Score:  0.9029063582244958
Confusion Matrix:  [[14503   757   154  2760   117]
 [  814  4275    24   173    25]
 [  107    19  2795   153    13]
 [ 2874    98    81 54044   229]
 [   37    36    13    95  4162]]
