In [None]:
'''
Data source: https://thecleverprogrammer.com/2022/11/28/consumer-complaint-classification-with-machine-learning/

In this project it is expected to classify consumer complaint. Output of the data is
'Product' which basically explains what consumers issue is about. I applied NLP
techniques we learnt in the course, classify consumer complaints. 
Original data has over 3 million rows, I used 50k rows in my project. 
'''

In [None]:
import pandas as pd

In [79]:
df = pd.read_csv("consumercomplaints.csv")
#df = df['Consumer complaint narrative'].dropna()


In [None]:
# EDA/ DATA MANIPULATION

In [None]:
df = df.dropna(subset=['Consumer complaint narrative'])

In [80]:
df = df.sample(50000)

In [81]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
964553,964553,2020-07-13,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Was not notified of investigation status or re...,I decided to get my credit pulled due to a bil...
1010276,1010276,2017-05-20,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,"On XXXX 2017 , I paid Barclay card a settle..."
1231896,1231896,2016-05-03,Mortgage,Conventional adjustable mortgage (ARM),"Loan modification,collection,foreclosure",,My financial hardship began on XX/XX/XXXX when...
961318,961318,2019-01-25,Credit card or prepaid card,General-purpose credit card or charge card,"Other features, terms, or problems",Problem with rewards from credit card,I had an American Express Business Platinum ca...
2448317,2448317,2021-11-12,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,I am XXXX XXXX XXXX and I am submitting this c...


In [82]:
df.isnull().sum() 

Unnamed: 0                         0
Date received                      0
Product                            0
Sub-product                     2291
Issue                              0
Sub-issue                       8630
Consumer complaint narrative       0
dtype: int64

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 964553 to 2634106
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Unnamed: 0                    50000 non-null  int64 
 1   Date received                 50000 non-null  object
 2   Product                       50000 non-null  object
 3   Sub-product                   47709 non-null  object
 4   Issue                         50000 non-null  object
 5   Sub-issue                     41370 non-null  object
 6   Consumer complaint narrative  50000 non-null  object
dtypes: int64(1), object(6)
memory usage: 3.1+ MB


In [86]:
# NLP - DATA CLEANING /PREPARATION

In [88]:
# kucuk harfe donustur
df["Consumer complaint narrative"]=df["Consumer complaint narrative"].str.lower()
# noktalama isaretlerini kaldirma
df["Consumer complaint narrative"]=df["Consumer complaint narrative"].str.replace("[^\w\s]","") 
# rakamlari kaldir
df["Consumer complaint narrative"]=df["Consumer complaint narrative"].str.replace("\d+","") 
# \newline ve enter\r kaldir
df["Consumer complaint narrative"]=df["Consumer complaint narrative"].str.replace("\n"," ").replace("\r","") 
df["Consumer complaint narrative"]=df["Consumer complaint narrative"].str.replace("  "," ")

In [89]:
# strip out html tags
import html
def del_html(text):
   return html.unescape(text)

# Apply to multiple columns
df['Consumer complaint narrative'] = df['Consumer complaint narrative'].apply(del_html)

In [90]:
# Remove STOP words
# Taken from Amazon 
import re
import nltk

from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords

#import nltk resources
resources = ["wordnet", "stopwords", "punkt", \
             "averaged_perceptron_tagger", "maxent_treebank_pos_tagger"]

for resource in resources:
    try:
        nltk.data.find("tokenizers/" + resource)
    except LookupError:
        nltk.download(resource)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\haluk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haluk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\haluk\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     C:\Users\haluk\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!


In [91]:
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer

stop_words = stopwords.words("english")

stop_words = [word.replace("\'", "") for word in stop_words]

In [92]:
import six
import time

i = 0;
nan_index = []
text_wo_sw = []
for text in df['Consumer complaint narrative']:
    
    ret_str ="";
    if isinstance(text, six.string_types):
    
        splitted_text = text.split(" ");
        for word in splitted_text:
            #print(word)

            if len(word)>0 and isinstance(word, six.string_types):
                if word not in stop_words:
                    ret_str = ret_str +" "+word
                        
            #else :
                #print(word)
                #print('not string')
    else :
        #print(i) 
        #print(text)
        nan_index.append(i)
        #print('not a string unfortunately')
    
    text_wo_sw.append(ret_str)
    i=i+1;    
df['Consumer complaint narrative'] =  text_wo_sw 
df.drop(df.index[nan_index], inplace=True)

In [93]:
df['Product'].unique()

array(['Credit reporting, credit repair services, or other personal consumer reports',
       'Mortgage', 'Credit card or prepaid card', 'Credit card',
       'Vehicle loan or lease', 'Consumer Loan',
       'Money transfer, virtual currency, or money service',
       'Debt collection', 'Checking or savings account', 'Student loan',
       'Payday loan, title loan, or personal loan', 'Credit reporting',
       'Bank account or service', 'Money transfers',
       'Other financial service', 'Prepaid card', 'Payday loan',
       'Virtual currency'], dtype=object)

In [94]:
# Output product is verbal data. We quantify this data
column = 'Product';
#print(type(df[column]))
value_counts = df[column].value_counts()
num_categories = len(value_counts)
# İki alt kategorisi varsa 0 ve 1 olarak kodlayalım
if num_categories == 2:
    category_mapping = {category: index for index, category in enumerate(value_counts.index)}
    df[column] = df[column].map(category_mapping)
# İki alt kategoriden fazlaysa 1'den başlayan sıralı sayılarla kodlayalım
else:
    category_mapping = {category: index + 1 for index, category in enumerate(value_counts.index)}
    df[column] = df[column].map(category_mapping)

In [95]:
x = df['Consumer complaint narrative']
y = df['Product']

In [96]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x =vectorizer.fit_transform(x)

In [97]:
x.shape, y.shape

((50000, 43654), (50000,))

In [98]:
def algo_test(x,y):
    
    import numpy as np

    from scipy import stats
    import matplotlib.pyplot as plt
    %matplotlib inline 
    import seaborn as sns


    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
    from sklearn.tree import ExtraTreeClassifier
    from sklearn.ensemble import GradientBoostingRegressor
    from xgboost import XGBRegressor 

    from sklearn.model_selection import train_test_split

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
    from sklearn.naive_bayes import GaussianNB
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC
    from sklearn.linear_model import SGDClassifier

    import warnings
    warnings.filterwarnings('ignore')
    #plt.style.use('')

    
    G = GaussianNB()
    B = BernoulliNB()
    K = KNeighborsClassifier()
    L = LogisticRegression()
    D = DecisionTreeClassifier()
    S = SVC()
    SGD = SGDClassifier()

    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2,random_state = 13)
    
    #algos = [G,B,K,L,D,S]
    algos = [G,B,K,L,D, SGD]
    
    #algo_names = [ "Gaussian", "Bernoulli", "K-Neighbors", "Logistic Regression", "Decision Tree","SVC" ]
    algo_names = [ "Gaussian", "Bernoulli", "K-Neighbors", "Logistic Regression", "Decision Tree", \
                  'Stochastic Gradient Descent' ]
    
    ASC = [];
    
    result =  result = pd.DataFrame(columns =['Accuracy_Score'], index = algo_names)
    
    i = 0;
    for algo in algos :
        
        algo.fit(x_train,y_train)
        print (accuracy_score(  y_test, algo.predict(x_test) ))
        ASC.append(  accuracy_score(  y_test, algo.predict(x_test) ) )
        
        print( "Confusion matrix: ",format(algo_names[i]) )
        i = i+1;
        print ( confusion_matrix(algo.predict(x_test),y_test) )
        
        #algo.fit(x,y)
        #print (accuracy_score(  y, algo.predict(x) ))
        #ASC.append(  accuracy_score(  y, algo.predict(x) ) )
    
    
    result.Accuracy_Score = ASC;

    return result.sort_values('Accuracy_Score', ascending=False)

In [99]:
algo_test(x.toarray(),y)

0.2807
Confusion matrix:  Gaussian
[[985 178 107 108  49  81  47  22  19  43  15  20  14   0   3   2   1   0]
 [511 613  67  71  38  68  32  15  28  31   6  19  17   0   2   0   0   0]
 [174  88 528  54  52  20  12  15  18  18  20  15  13   1   1   0   0   0]
 [335 126  27 210  69  11  30  29  40  12  10  15   5   1   1   2   0   0]
 [106  66  34  75 115   7   5  48  13   7  26  14   2   1   2   2   0   0]
 [148  50  14  13   5 104   6   1   3   3   2   2   0   0   0   0   0   0]
 [878 113   6  15   3  11  81   0   4   3   1   2   0   0   0   0   0   0]
 [ 56  21   6  25  37   2   1  80   4   1   3   1   0   1   1   1   0   0]
 [124  44   7  55  19   3   6   3  25   3   6   1   1   1   1   0   0   0]
 [147  47  20  14  12   3   6   1   0  22   3   7   4   1   0   0   0   0]
 [ 49  23  10  27  40   1   5   7   6   1  17   3   3   0   0   2   1   0]
 [475 100  15  42  23  10  18  13   6   8  13  11  10   1   0   1   0   0]
 [575 163  16  41  17   6  36   7  10  15   5  15  13   2   0   1

0.7739
Confusion matrix:  Stochastic Gradient Descent
[[4322  328   50  142   31   29  252   21   28   73   11   41   25    0
     1    1    0    0]
 [ 192 1267   12   36   13   24   23    7   16   29    3   23   25    5
     1    0    0    0]
 [  54   25  785    3    6    6    4    2    1    9    7   15   10    1
     0    0    0    0]
 [  47   27    5  532   48    4    2   18  120    7   13    8    3    0
     0    7    0    0]
 [   9    6    2   32  358    1    0   49    5    1   80   11    1    2
     0    1    0    1]
 [  24    7    2    3    2  263    1    0    2    6    0   13    5    1
     0    0    0    0]
 [   0    0    0    0    0    0    4    0    0    0    0    0    0    0
     0    0    0    0]
 [   0    2    1    8   25    1    1  149    5    1    6    3    1    0
    11    0    1    0]
 [   0    0    0    2    0    0    0    0    1    0    0    0    0    0
     0    0    0    0]
 [   6    1    0    0    0    0    0    0    0   41    0    5   11    0
     0    0    0   

Unnamed: 0,Accuracy_Score
Logistic Regression,0.7794
Stochastic Gradient Descent,0.7739
K-Neighbors,0.6963
Decision Tree,0.6594
Bernoulli,0.6458
Gaussian,0.2807
