In [None]:
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
# Model building
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#dataset can be downloaded at: https://catalog.data.gov/dataset/consumer-complaint-database

In [None]:
# Load the raw consumer-complaints export into a DataFrame.
# NOTE(review): absolute, user-specific path -- this cell fails on any other machine;
# consider reading the location from a configurable data directory.
df = pd.read_csv('/Users/pawankumarkc/Downloads/complaints.csv')
# Show (rows, columns) of the raw frame.
df.shape

In [None]:
# List the column labels of the raw frame.
print(df.columns)

In [None]:
# Print pandas' per-column summary of the raw frame.
df.info()

In [None]:
# Preview the first three rows of the raw frame.
df.head(3)

In [None]:
# Only the complaint narrative and its product label are needed from here on;
# every other column is left behind.
df = df.loc[:, ['Consumer complaint narrative', 'Product']]

In [None]:
# Preview the first three rows after the column selection.
df.head(3)

In [None]:
# Count missing values in each remaining column.
df.isnull().sum()

In [None]:
# (rows, columns) before rows with missing values are removed.
df.shape

In [None]:
# Remove every row that has a missing narrative or product label.
# The result is rebound instead of using inplace=True: df was derived by column
# selection, and mutating such a frame in place makes pandas emit
# SettingWithCopyWarning. Rebinding also keeps the cell safe to re-run.
df = df.dropna(axis=0)
df.shape

In [None]:
# Remove duplicate (narrative, product) rows.
# drop_duplicates returns a new frame; the result has to be assigned back,
# otherwise the duplicates stay in df and the shape below is unchanged.
df = df.drop_duplicates(subset=['Consumer complaint narrative', 'Product'])
df.shape

In [None]:
# Keep only the first 5% of the rows of each product type so the rest of the
# notebook executes quickly.

SAMPLE_PERCENT = 5  # share of each product's rows to keep

sampled_parts = []
# sort=False walks the products in order of first appearance, the same order
# df['Product'].unique() would give; rows inside a group keep their original order.
for prod, prod_rows in df.groupby('Product', sort=False):
    count = round(len(prod_rows) / 100 * SAMPLE_PERCENT)
    print(f'Selecting {count} rows from product type {prod}')
    sampled_parts.append(prod_rows.head(count))

# Concatenate once at the end: growing a DataFrame inside the loop re-copies all
# previously collected rows on every iteration.
# With no products at all, fall back to an empty frame that still has df's columns.
reduced_df = pd.concat(sampled_parts) if sampled_parts else df.iloc[0:0]

reduced_df.shape

In [None]:
# Shorten the column names.
# Renaming by label (instead of assigning a positional list) cannot silently
# swap the two names if the column order ever changes upstream, and it is a
# no-op when the cell is run a second time.
reduced_df = reduced_df.rename(
    columns={'Consumer complaint narrative': 'complaint', 'Product': 'product'}
)
reduced_df.columns

In [None]:
# Checkpoint 1: persist the sampled, renamed frame as CSV.
# NOTE(review): absolute, user-specific output path.
reduced_df.to_csv('/Users/pawankumarkc/Documents/vscodepython/MLProjects/datasets/customer_complaints_cp1.csv')

In [None]:
# Distinct product labels present in the sample.
reduced_df['product'].unique()

In [None]:
# Horizontal bar chart of the number of complaints per product, sorted by count.
# The title used to end in a stray literal "n" -- a newline escape that had lost
# its backslash; the escape is restored so the title is followed by a blank line.
reduced_df.groupby('product').complaint.count().sort_values().plot.barh(title= 'NUMBER OF COMPLAINTS IN EACH PRODUCT CATEGORY\n')

In [None]:
# Complaint counts per product as a table, largest category first.
(
    reduced_df
    .groupby('product')
    .count()
    .sort_values(by='complaint', ascending=False)
)

In [None]:
lemmatizer = WordNetLemmatizer()
# Built once: stopwords.words() returns a list, and testing every token against
# a list is a linear scan; a frozenset gives constant-time membership checks.
STOP_WORDS = frozenset(stopwords.words('english'))

def clean_text(text):
    """Strip punctuation, lower-case, drop English stop words and lemmatize.

    Parameters
    ----------
    text : str
        Raw complaint text.

    Returns
    -------
    list of str
        The lemmatized, non-stop-word tokens in their original order.
    """
    # Raw string: '\w' / '\s' in a plain literal are invalid escape sequences.
    text = re.sub(r'[^\w\s]', '', text)
    # Split into words first; iterating over the string itself walks it one
    # character at a time, which is what the previous version did.
    tokens = text.lower().split()
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok not in STOP_WORDS]

In [None]:
lemmatizer = WordNetLemmatizer()
# Built once: stopwords.words() returns a list, and testing every token against
# a list is a linear scan; a frozenset gives constant-time membership checks.
STOP_WORDS = frozenset(stopwords.words('english'))

def clean_text2(text):
    """Lower-case, drop English stop words and lemmatize a piece of text.

    Parameters
    ----------
    text : object
        Raw text; it is passed through ``str()`` first, so non-string values
        are accepted.

    Returns
    -------
    str
        The surviving tokens, lemmatized with ``pos='a'`` and joined by single
        spaces.
    """
    words = str(text).lower().split()
    words = [lemmatizer.lemmatize(tok, pos='a') for tok in words if tok not in STOP_WORDS]
    return ' '.join(words)

In [None]:
lemmatizer = WordNetLemmatizer()
# Built once: stopwords.words() returns a list, and testing every token against
# a list is a linear scan; a frozenset gives constant-time membership checks.
STOP_WORDS = frozenset(stopwords.words('english'))

def clean_text3(text):
    """Normalise one complaint narrative for vectorisation.

    Steps: strip a fixed set of punctuation characters, remove runs of capital
    X, lower-case, drop English stop words, lemmatize with ``pos="a"``.

    Parameters
    ----------
    text : str
        Raw complaint text (must be a string; ``re.sub`` rejects other types).

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; may be empty.
    """
    #Removes special characters
    words = re.sub(r'[/,():*."\[\]]', '', text)
    # Remove runs of three or more capital X.
    # The previous pattern, (\w)\1X+, matched the same runs but also swallowed
    # any doubled character directly in front of an X (e.g. "11XX" vanished
    # entirely). Two-X runs are still left alone, as before.
    words = re.sub(r'X{3,}', '', words)
    words = words.lower().split()
    #Lemmatization and removal of stop words
    words = [lemmatizer.lemmatize(tok, pos="a") for tok in words if tok not in STOP_WORDS]
    return ' '.join(words)

In [None]:
# Preview the sample before the complaint text is cleaned.
reduced_df.head()

In [None]:
# NOTE(review): this rebinds `tqdm` from the module (imported in the first cell)
# to the tqdm class.
from tqdm import tqdm
# Enables tqdm's pandas integration; the parallel_apply call below does not go
# through it -- presumably left over from an earlier progress_apply version.
tqdm.pandas()

from pandarallel import pandarallel
# Start pandarallel with its default settings so parallel_apply is available.
pandarallel.initialize()


# Clean every complaint with clean_text3; the raw text in the 'complaint'
# column is overwritten by the cleaned text.
reduced_df['complaint'] = reduced_df['complaint'].parallel_apply(clean_text3)
reduced_df.head()


In [None]:
# Print pandas' per-column summary of the cleaned sample.
reduced_df.info()

In [None]:
# Checkpoint 2: persist the cleaned complaints as CSV.
# NOTE(review): index=False is not passed, so the index is written as well; a
# later cell drops the resulting 'Unnamed: 0' column after this file is reloaded.
reduced_df.to_csv('/Users/pawankumarkc/Documents/vscodepython/MLProjects/datasets/customer_complaints_cp2.csv')

In [None]:
# Reload checkpoint 2.
# A complaint that cleaning reduced to an empty string is stored as an empty CSV
# field and comes back as NaN, which TfidfVectorizer cannot process. Such rows
# carry no text to learn from, so they are dropped right after loading.
reduced_df = pd.read_csv(
    '/Users/pawankumarkc/Documents/vscodepython/MLProjects/datasets/customer_complaints_cp2.csv'
).dropna(subset=['complaint'])
reduced_df.columns

In [None]:
# Encode each product label as the integer code of its category.
reduced_df['product_code'] = pd.Categorical(reduced_df['product']).codes

In [None]:
# Column labels after adding product_code.
reduced_df.columns

In [None]:
# Drop the text label (now encoded in product_code) and the 'Unnamed: 0' column
# that came back from the CSV round-trip.
# Rebinding with errors='ignore' keeps the cell re-runnable: the in-place
# version raised KeyError on a second run because the columns were already
# gone. The redundant axis=1 (implied by columns=) is removed.
reduced_df = reduced_df.drop(columns=['product', 'Unnamed: 0'], errors='ignore')
reduced_df.columns

In [None]:
# Preview the modelling frame: cleaned complaint text plus product_code.
reduced_df.head(3)

In [None]:
# Inspect the row index of the modelling frame.
reduced_df.index

In [None]:
# Checkpoint 3: persist the modelling frame as CSV.
# NOTE(review): absolute, user-specific output path.
reduced_df.to_csv('/Users/pawankumarkc/Documents/vscodepython/MLProjects/datasets/customer_complaints_cp3.csv')

In [None]:
# Hold out 20% of the complaints for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    reduced_df['complaint'],
    reduced_df['product_code'],
    test_size=0.20,
    random_state=2024,
)
# Print the four shapes on one line, separated by spaces.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

In [None]:
# Display the training complaints.
x_train

In [None]:
# Learn the TF-IDF vocabulary on the training complaints and vectorise them.
vectorizer = TfidfVectorizer()
# Keep the result sparse. Calling .toarray() allocates a dense
# (documents x vocabulary) matrix that is almost entirely zeros, and
# LogisticRegression accepts the sparse matrix directly.
features = vectorizer.fit_transform(x_train)

In [None]:
# Shape of the training feature matrix produced by the vectorizer.
features.shape

In [None]:
from sklearn.metrics import accuracy_score

# Fit a default logistic-regression classifier on the training features
# (fit returns the estimator itself, so it can be bound in one step).
lr = LogisticRegression().fit(features, y_train)

# Accuracy on the data the model was trained on -- not a generalisation estimate.
y_pred_train = lr.predict(features)
print(accuracy_score(y_train, y_pred_train))

In [None]:
import joblib

# Persist the trained classifier.
joblib.dump(lr, '/Users/pawankumarkc/Documents/vscodepython/MLProjects/models/customer_complaints/lr_model.pkl')
# Persist the fitted vectorizer next to it. The classifier can only score
# vectors produced by this exact fitted vocabulary, so the model file alone
# cannot be used for inference in a fresh session.
joblib.dump(vectorizer, '/Users/pawankumarkc/Documents/vscodepython/MLProjects/models/customer_complaints/tfidf_vectorizer.pkl')

In [None]:
# Training-set accuracy again (same value as printed when the model was fitted).
print(accuracy_score(y_train, y_pred_train))

In [None]:
# NOTE(review): both names are already imported in the first cell.
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the training-set predictions.
print(confusion_matrix(y_train, y_pred_train))

In [None]:
# tfidf_v_vector = fitted_v.transform(x_train)
# print(type(tfidf_v_vector))
# print(tfidf_v_vector)

# tfidf_v_vector.toarray()

In [None]:
# reduced_df.to_csv('/Users/pawankumarkc/Documents/vscodepython/MLProjects/Capstone/datasets/customer_complaints_cp2.csv')

# #label encoding product column
# reduced_df['product'] = reduced_df['product'].astype('category').cat.codes

# #splitting data into train and test
# x_train, x_test, y_train, y_test = train_test_split(reduced_df['complaint'], reduced_df['product'], test_size=0.25, random_state=2024)

# vectorizer = TfidfVectorizer()
# x_train_transformed = vectorizer.fit_transform(x_train)


# #Convert everything to lower case
# #Remove punctuations,numbers, dates, special characters, stopwords
# #visualise

In [None]:
# Vectorise the held-out complaints with the vocabulary learned on the training
# set (transform, not fit_transform). The matrix stays sparse: predict accepts
# it directly, and a dense copy only costs memory.
test_vector = vectorizer.transform(x_test)

# Accuracy on the held-out 20%.
y_pred_test = lr.predict(test_vector)
print(accuracy_score(y_test, y_pred_test))

In [None]:
# Show the terms that make up the learned TF-IDF vocabulary.
vectorizer.get_feature_names_out()