In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from collections import Counter
from scipy import sparse
from scipy.sparse.linalg import svds
import gensim
from imblearn.over_sampling import RandomOverSampler

In [4]:
# Load the complaints table from a CSV file given by a relative path.
complaintsPath = '../data/complaints.csv'
complaints = pd.read_csv(complaintsPath)

In [5]:
# Remove the redaction placeholders (runs of two or more X's) from every narrative,
# replacing each run with an empty string.
# The vectorized .str accessor processes the whole column in one pass instead of
# one .loc lookup per row, does not rely on the index labels being 0..n-1, and
# leaves a missing narrative as NaN instead of raising the TypeError that re.sub
# raises on a non-string value.
complaints['Consumer complaint narrative'] = (
    complaints['Consumer complaint narrative']
    .str.replace('X{2,}', '', regex = True)
)

In [4]:
# Hold out a test set. Stratifying on Issue keeps the class proportions of the
# target the same in the training and test splits; the seed fixes the shuffle.
y = complaints['Issue']
X = complaints[['Consumer complaint narrative']]

xTrain, xTest, yTrain, yTest = train_test_split(
    X,
    y,
    stratify=y,
    random_state=777,
)

**Looking at a Count Vectorizer**

In [5]:
# Bag-of-words features: the vocabulary is learned from the training narratives
# only, and that same fitted vectorizer then encodes the test narratives.
vect = CountVectorizer()

trainNarratives = xTrain['Consumer complaint narrative']
testNarratives = xTest['Consumer complaint narrative']

XTrainVec = vect.fit_transform(trainNarratives)
XTestVec = vect.transform(testNarratives)

In [6]:
# Baseline classifier: logistic regression on the raw token counts, scored on
# the held-out split.
# NOTE: the recorded output of this cell shows the solver stopping at the
# iteration limit, so the fitted model had not fully converged at max_iter=1000.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(XTrainVec, yTrain)

yPred = logreg.predict(XTestVec)

testAccuracy = accuracy_score(yTest, yPred)
testConfusion = confusion_matrix(yTest, yPred)

print('Accuracy Score: ', testAccuracy)
print('Confusion Matrix: ', testConfusion)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score:  0.8713868580094615
Confusion Matrix:  [[12223   788   118  5071    91]
 [  933  3987    24   336    31]
 [  158    29  2630   251    19]
 [ 2683   105   102 54258   178]
 [   92    54    22   279  3896]]


**Looking into dealing with class imbalance**

In [7]:
# Balance the training classes by randomly duplicating rows of the
# under-represented Issue classes. Only the training split is resampled; the
# test split is left untouched so the evaluation reflects the real distribution.
# The sampler is seeded with the same value as the train/test split so that the
# duplicated rows, and therefore the metrics computed from them, are the same on
# every run instead of changing each time the notebook is executed.
oversample = RandomOverSampler(random_state = 777)

xTrainOver, yTrainOver = oversample.fit_resample(
    xTrain,
    yTrain
)

**Count Vectorizer into a Logistic Regression**

In [8]:
# Bag-of-words features for the oversampled experiment: a fresh vectorizer is
# fit on the oversampled training narratives and then applied to the original
# (not resampled) test narratives. This rebinds vect, XTrainVec and XTestVec.
vect = CountVectorizer()

trainNarrativesOver = xTrainOver['Consumer complaint narrative']
testNarratives = xTest['Consumer complaint narrative']

XTrainVec = vect.fit_transform(trainNarrativesOver)
XTestVec = vect.transform(testNarratives)

In [9]:
# Logistic regression trained on the oversampled token counts and scored on the
# held-out split.
# NOTE: the recorded output of this cell shows the solver stopping at the
# iteration limit, so the fitted model had not fully converged at max_iter=1000.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(XTrainVec, yTrainOver)

yPred = logreg.predict(XTestVec)

testAccuracy = accuracy_score(yTest, yPred)
testConfusion = confusion_matrix(yTest, yPred)

print('Accuracy Score: ', testAccuracy)
print('Confusion Matrix: ', testConfusion)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score:  0.8465107856673986
Confusion Matrix:  [[13979  1390   234  2561   127]
 [  780  4317    46   120    48]
 [  166    47  2710   126    38]
 [ 6081   571   461 49829   384]
 [   99    72    35   176  3961]]


**Linear SVC**

In [16]:
from sklearn.svm import LinearSVC

In [12]:
# Linear support vector classifier trained on the oversampled token counts and
# scored on the held-out split.
svc = LinearSVC(max_iter=1000)
svc.fit(XTrainVec, yTrainOver)

yPred = svc.predict(XTestVec)

testAccuracy = accuracy_score(yTest, yPred)
testConfusion = confusion_matrix(yTest, yPred)

print('Accuracy Score: ', testAccuracy)
print('Confusion Matrix: ', testConfusion)



Accuracy Score:  0.8463410217524163
Confusion Matrix:  [[13762  1359   230  2803   137]
 [  833  4177    57   201    43]
 [  181    64  2620   186    36]
 [ 5726   630   332 50317   321]
 [  115    81    33   209  3905]]


**Trying to add class weights instead of oversampling and switching to TF-IDF**

In [21]:
# Candidate extra feature: the number of characters in each complaint narrative,
# stored on the frame as a new 'length' column.
narrativeLengths = complaints['Consumer complaint narrative'].str.len()
complaints['length'] = narrativeLengths

In [22]:
# Redo the train/test split so the feature frame carries both the narrative and
# its length. Stratifying on Issue keeps the class proportions of the target the
# same in both splits; the seed fixes the shuffle.
y = complaints['Issue']
X = complaints[['Consumer complaint narrative', 'length']]

xTrain, xTest, yTrain, yTest = train_test_split(
    X,
    y,
    stratify=y,
    random_state=777,
)

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
# TF-IDF features over unigrams and bigrams, with English stop words removed.
# The vectorizer is fit on the training narratives only and then applied to
# both splits.
vect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

trainNarratives = xTrain['Consumer complaint narrative']
testNarratives = xTest['Consumer complaint narrative']

xTrainVec = vect.fit_transform(trainNarratives)
xTestVec = vect.transform(testNarratives)

In [46]:
from scipy.sparse import hstack

In [47]:
# Disabled experiment: append the narrative length as one extra column to the
# TF-IDF matrices. While the two lines below stay commented out, the 'length'
# column is not part of xTrainVec / xTestVec.
# xTrainVec = hstack((xTrainVec, sparse.csr_matrix(np.array(xTrain['length'])).reshape(-1, 1)))
# xTestVec = hstack((xTestVec, sparse.csr_matrix(np.array(xTest['length'])).reshape(-1, 1)))

**Logistic regression**

In [51]:
# Logistic regression on the TF-IDF features. Instead of oversampling, the class
# imbalance is handled with class_weight='balanced', and the model is scored on
# the held-out split.
logreg = LogisticRegression(class_weight='balanced', max_iter=1000)
logreg.fit(xTrainVec, yTrain)

yPred = logreg.predict(xTestVec)

testAccuracy = accuracy_score(yTest, yPred)
testConfusion = confusion_matrix(yTest, yPred)

print('Accuracy Score: ', testAccuracy)
print('Confusion Matrix: ', testConfusion)

Accuracy Score:  0.8732203082912696
Confusion Matrix:  [[14778  1116   261  1961   175]
 [  594  4540    46    96    35]
 [   93    25  2863    94    12]
 [ 5278   306   329 50763   650]
 [   30    45    18    38  4212]]


**Linear SVC**

In [52]:
# Linear support vector classifier on the TF-IDF features. Instead of
# oversampling, the class imbalance is handled with class_weight='balanced', and
# the model is scored on the held-out split.
svc = LinearSVC(class_weight='balanced', max_iter=1000)
svc.fit(xTrainVec, yTrain)

yPred = svc.predict(xTestVec)

testAccuracy = accuracy_score(yTest, yPred)
testConfusion = confusion_matrix(yTest, yPred)

print('Accuracy Score: ', testAccuracy)
print('Confusion Matrix: ', testConfusion)

Accuracy Score:  0.9033590619977817
Confusion Matrix:  [[14548   732   145  2751   115]
 [  832  4256    24   174    25]
 [  110    19  2791   155    12]
 [ 2862    93    79 54066   226]
 [   41    36    13    95  4158]]
