In [53]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/consumercomplaintssmall/connsumerComplaintsSmall.csv
/kaggle/input/consumer-complaints-identification/consumercomplaints.csv


# Import the data set from the csv file using pandas.


In [54]:
# import data set 
# consumer = pd.read_csv("/kaggle/input/consumer-complaints-identification/consumercomplaints.csv")
consumer = pd.read_csv("/kaggle/input/consumercomplaintssmall/connsumerComplaintsSmall.csv")



# Viewing a part(first 5 lines) of the dataset:

In [55]:
print(consumer.head())

   Unnamed: 0 Date received  \
0           1    21-11-2022   
1           2    21-11-2022   
2           3    21-11-2022   
3           4    21-11-2022   
4           5    15-11-2022   

                                             Product  \
0                                           Mortgage   
1  Credit reporting, credit repair services, or o...   
2  Credit reporting, credit repair services, or o...   
3  Credit reporting, credit repair services, or o...   
4  Credit reporting, credit repair services, or o...   

                  Sub-product  \
0  Conventional home mortgage   
1            Credit reporting   
2            Credit reporting   
3            Credit reporting   
4            Credit reporting   

                                               Issue  \
0                         Struggling to pay mortgage   
1                        Improper use of your report   
2                        Improper use of your report   
3  Problem with a credit reporting company's inve... 

In [56]:
print(consumer.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Unnamed: 0                    101 non-null    int64 
 1   Date received                 101 non-null    object
 2   Product                       101 non-null    object
 3   Sub-product                   101 non-null    object
 4   Issue                         101 non-null    object
 5   Sub-issue                     96 non-null     object
 6   Consumer complaint narrative  3 non-null      object
dtypes: int64(1), object(6)
memory usage: 5.6+ KB
None


# Looking at the data set it infers that there are many unnamed columns and null values .
this step needs to be done only once, since the drop() deletes the unnamed column, if you run it more than once then you'll get an error


In [63]:
consumer.columns.str.match("Unnamed")


array([False, False, False, False, False, False])

In [59]:
consumer.loc[:,~consumer.columns.str.match("Unnamed")]

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
0,21-11-2022,Mortgage,Conventional home mortgage,Struggling to pay mortgage,,
1,21-11-2022,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,
2,21-11-2022,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,
3,21-11-2022,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,
4,15-11-2022,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,
...,...,...,...,...,...,...
96,07-11-2022,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,
97,21-11-2022,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,
98,09-11-2022,Checking or savings account,Checking account,Managing an account,Deposits and withdrawals,
99,21-11-2022,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,


# Removing the unnamed columns:
using the drop() from the pandas library

In [62]:
# consumer = consumer.drop("Unnamed: 0",axis=1)


# Removing all the null values:
using the isnull() from the pandas lib to view the null values:

In [68]:
print(consumer.isnull().sum())

Date received                   0
Product                         0
Sub-product                     0
Issue                           0
Sub-issue                       0
Consumer complaint narrative    0
dtype: int64


Usind the dropna() from pandas lib to drop all the rows that contaiin all the "not available values"

In [69]:
consumer =  consumer.dropna()


Looking at the dataset the product column contains all the labels which represent the nature of the complaints reported by the consumers. 

#  Viewing all the labels and their frequency
using value_counts() from pandas lib

In [71]:
print(consumer['Product'].value_counts())

Credit reporting, credit repair services, or other personal consumer reports    2
Name: Product, dtype: int64


# Data Cleaning 
using ntlk

In [72]:
import nltk 
from nltk.corpus import stopwords
import re
import string

In [74]:
# nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))

def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
consumer["Consumer complaint narrative"] = consumer["Consumer complaint narrative"].apply(clean)

# **Training the Classification Model**

# Splitting the data into training and test sets

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [79]:
consumer = consumer[["Consumer complaint narrative", "Product"]]
x = np.array(consumer["Consumer complaint narrative"])
y = np.array(consumer["Product"])

cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42)

# Train the Machine Learning model using the Stochastic Gradient Descent classification algorithm:

In [77]:

from sklearn.linear_model import SGDClassifier

In [78]:
sgdmodel = SGDClassifier()
sgdmodel.fit(X_train,y_train)

ValueError: The number of classes has to be greater than one; got 1 class

# Using the trained model to make predictions

1st:

In [83]:
user = input("Enter a Text: ")
data = cv.transform([user]).toarray()
output = sgdmodel.predict(data)
print(output)

Enter a Text:  n XXXX/XXXX/2022, I called Citi XXXX XXXX XXXX XXXX XXXX Customer Service at XXXX. I did not want to pay {$99.00} for the next year membership and wanted to cancel my card account


IndexError: tuple index out of range

2nd:

In [None]:
user = input("Enter a Text: ")
data = cv.transform([user]).toarray()
output = sgdmodel.predict(data)
print(output)