In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from collections import Counter
from scipy import sparse
from scipy.sparse.linalg import svds
import gensim
from imblearn.over_sampling import RandomOverSampler

In [2]:
complaints = pd.read_csv('../data/complaints.csv')

In [3]:
# Replacing the X's with an empty space.
for i in range(0, len(complaints)):
    complaints.loc[i, 'Consumer complaint narrative'] = (
        re.sub('X{2,}', '', complaints.loc[i, 'Consumer complaint narrative'])
    )

In [4]:
# Creating the Train/Test spilt stratifying by Issue.
X = complaints[['Consumer complaint narrative']]
y = complaints['Issue']

xTrain, xTest, yTrain, yTest = train_test_split(X, y, random_state = 777, stratify = y)

**Looking at a Count Vectorizer**

In [5]:
# Fitting the count vectorizer to training data then transforming both the training and test data.
vect = CountVectorizer()

XTrainVec = vect.fit_transform(xTrain['Consumer complaint narrative'])
XTestVec = vect.transform(xTest['Consumer complaint narrative'])

In [6]:
# Checking the results of a logistic regression model.
logreg = LogisticRegression(max_iter = 1000).fit(XTrainVec, yTrain)

yPred = logreg.predict(XTestVec)

print('Accuracy Score: ', accuracy_score(yTest, yPred))

print('Confusion Matrix: ', confusion_matrix(yTest, yPred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score:  0.8713868580094615
Confusion Matrix:  [[12223   788   118  5071    91]
 [  933  3987    24   336    31]
 [  158    29  2630   251    19]
 [ 2683   105   102 54258   178]
 [   92    54    22   279  3896]]


**Looking into dealing with class imbalance**

In [19]:
oversample = RandomOverSampler()

xTrainOver, yTrainOver = oversample.fit_resample(
    xTrain, 
    yTrain
)

**Count Vectorizer into a Logistic Regression**

In [20]:
# Fitting the count vectorizer to training data then transforming both the training and test data.
vect = CountVectorizer()

XTrainVec = vect.fit_transform(xTrain['Consumer complaint narrative'])
XTestVec = vect.transform(xTest['Consumer complaint narrative'])

In [21]:
# Checking the results of a logistic regression model.
logreg = LogisticRegression(max_iter = 1000).fit(XTrainVec, yTrain)

yPred = logreg.predict(XTestVec)

print('Accuracy Score: ', accuracy_score(yTest, yPred))

print('Confusion Matrix: ', confusion_matrix(yTest, yPred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score:  0.8713868580094615
Confusion Matrix:  [[12223   788   118  5071    91]
 [  933  3987    24   336    31]
 [  158    29  2630   251    19]
 [ 2683   105   102 54258   178]
 [   92    54    22   279  3896]]


**Linear SVC**

In [22]:
from sklearn.svm import LinearSVC

In [23]:
# Checking the results of a linear svc model.
svc = LinearSVC(max_iter = 1000).fit(XTrainVec, yTrain)

yPred = svc.predict(XTestVec)

print('Accuracy Score: ', accuracy_score(yTest, yPred))

print('Confusion Matrix: ', confusion_matrix(yTest, yPred))



Accuracy Score:  0.8676294166911881
Confusion Matrix:  [[11628   818   131  5592   122]
 [  845  3928    38   465    35]
 [  125    41  2613   282    26]
 [ 2300   137    91 54613   185]
 [   81    51    24   307  3880]]
