In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import nltk
import re
from nltk.corpus import stopwords
import string

In [2]:
data = pd.read_csv('complaints/consumercomplaints.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
0,0,2022-11-11,Mortgage,Conventional home mortgage,Trouble during payment process,,
1,1,2022-11-23,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,
2,2,2022-11-16,Mortgage,VA mortgage,Trouble during payment process,,
3,3,2022-11-15,Checking or savings account,Checking account,Managing an account,Fee problem,"Hi, I have been banking with Wells Fargo for o..."
4,4,2022-11-07,Mortgage,Other type of mortgage,Trouble during payment process,,


## Processing Data

In [6]:
data = data.drop("Unnamed: 0",axis=1)

In [7]:
data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
0,2022-11-11,Mortgage,Conventional home mortgage,Trouble during payment process,,
1,2022-11-23,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Reporting company used your report improperly,
2,2022-11-16,Mortgage,VA mortgage,Trouble during payment process,,
3,2022-11-15,Checking or savings account,Checking account,Managing an account,Fee problem,"Hi, I have been banking with Wells Fargo for o..."
4,2022-11-07,Mortgage,Other type of mortgage,Trouble during payment process,,


In [8]:
data.isnull().sum()

Date received                         0
Product                               0
Sub-product                      235294
Issue                                 0
Sub-issue                        683355
Consumer complaint narrative    1987977
dtype: int64

In [9]:
data.dropna()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative
3,2022-11-15,Checking or savings account,Checking account,Managing an account,Fee problem,"Hi, I have been banking with Wells Fargo for o..."
11,2022-11-09,Debt collection,Other debt,False statements or representation,Indicated you were committing crime by not pay...,XXXX is attempting to collect funds for Valuat...
15,2022-11-14,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,Today I called to get my balance and reset my ...
51,2022-10-12,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,The Federal Trade Commission Bureau of Consume...
72,2022-10-09,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Difficulty submitting a dispute or getting inf...,Ive mailed police report called been hung up o...
...,...,...,...,...,...,...
3101950,2017-03-04,Debt collection,I do not know,Disclosure verification of debt,Not given enough info to verify debt,I have received calls and notices in regards t...
3101955,2017-01-19,Student loan,Non-federal student loan,Can't repay my loan,Can't decrease my monthly payments,"Insanely high monthly payments, with "" no opti..."
3101956,2017-01-22,Student loan,Federal student loan servicing,Dealing with my lender or servicer,Need information about my balance/terms,My loans have an extraordinarily high interest...
3101958,2017-01-26,Debt collection,Auto,Communication tactics,Called after sent written cease of comm,Received cease and desist letter from them sho...


In [10]:
data["Product"].value_counts()

Product
Credit reporting, credit repair services, or other personal consumer reports    1432096
Debt collection                                                                  452620
Mortgage                                                                         365181
Credit card or prepaid card                                                      165452
Checking or savings account                                                      141849
Credit reporting                                                                 140430
Credit card                                                                       89190
Bank account or service                                                           86206
Student loan                                                                      71530
Money transfer, virtual currency, or money service                                47641
Vehicle loan or lease                                                             37161
Consumer Loan           

## Training Consumer Complaint Classification Model

In [11]:
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))

def clean(text):
    text = str(text).lower() # Chuyển đổi thành chữ thường
    text = re.sub('\[.*?\]', '', text) # Loại bỏ các đoạn văn bản nằm trong dấu ngoặc vuông, bao gồm cả nội dung bên trong dấu ngoặc vuông
    text = re.sub('https?://\S+|www\.\S+', '', text) # Loại bỏ các liên kết URL 
    text = re.sub('<.*?>+', '', text) #  Loại bỏ các thẻ HTML
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Loại bỏ tất cả các ký tự dấu câu 
    text = re.sub('\n', '', text) # Loại bỏ các ký tự xuống dòng 
    text = re.sub('\w*\d\w*', '', text) # Loại bỏ các từ chứa chữ số
    text = [word for word in text.split(' ') if word not in stopword] # Tách văn bản thành danh sách các từ (tokenization) và loại bỏ stop word
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')] # Loại bỏ các hậu tố từ để đưa về dạng gốc
    text=" ".join(text)
    return text
data["Consumer complaint narrative"] = data["Consumer complaint narrative"].apply(clean)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HELLO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


## Train And Test

In [13]:
data = data[["Consumer complaint narrative", "Product"]]
x = np.array(data["Consumer complaint narrative"])
y = np.array(data["Product"])

cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42)

## Using the Stochastic Gradient Descent classification algorithm(CountVectorizer)

In [14]:
sgdmodel = SGDClassifier()
sgdmodel.fit(X_train,y_train)

In [22]:
# Predicting the Test set results
y_pred = sgdmodel.predict(X_test)

# Calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.5732848141454598


## Using the Multinomial Naive Bayes classifier algorithm(CountVectorizer)

In [23]:
# Training the Multinomial Naive Bayes classifier with the simulated training data
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)


In [24]:
# Predicting the Test set results
y_pred_nb = nb_model.predict(X_test)

# Calculating the accuracy
accuracy = accuracy_score(y_test, y_pred_nb)
print(f'Accuracy: {accuracy}')

Accuracy: 0.5644272944854198


## Again Training and Testing

In [26]:
# Creating a TF-IDF Vectorizer instance
tfidf_vectorizer = TfidfVectorizer()

# Since I don't have the actual dataset, I will continue using the mock dataset.
# In practice, you would fit the TF-IDF vectorizer on your actual text data.
X_tfidf = tfidf_vectorizer.fit_transform(x)

# Splitting the TF-IDF transformed data into training and test set
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.33, random_state=42)

## Stochastic Gradient Descent classification algorithm(TF-IDF)

In [27]:
sgdmodel = SGDClassifier()
sgdmodel.fit(X_train_tfidf,y_train)

In [28]:
# Predicting the Test set results
y_pred = sgdmodel.predict(X_test_tfidf)

# Calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.3575792507204611


## Using the Multinomial Naive Bayes classifier algorithm(TF-IDF)

In [29]:
# Training the Multinomial Naive Bayes classifier with the simulated training data
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

In [30]:
# Predicting the Test set results
y_pred_nb = nb_model.predict(X_test_tfidf)

# Calculating the accuracy
accuracy = accuracy_score(y_test, y_pred_nb)
print(f'Accuracy: {accuracy}')

Accuracy: 0.5486279490060079


## Use our trained model to make predictions as CountVectorizer

In [19]:
def predict_with_sgd_model():
    # Nhập văn bản từ người dùng
    user_input = input("Nhập văn bản của bạn: ")

    # Biến đổi văn bản thành dạng vector
    data = cv.transform([user_input]).toarray()

    # Dự đoán sử dụng mô hình SGD
    output = sgdmodel.predict(data)

    # In ra kết quả dự đoán
    print("Kết quả dự đoán:", output)

# Gọi hàm để thực hiện dự đoán
predict_with_sgd_model()

['Credit reporting, credit repair services, or other personal consumer reports']
