In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import regex as re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier)
from xgboost import XGBClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv('complaints.csv')

# Check for duplicates based on 'narrative'
duplicates = df.duplicated(subset=['narrative'])
print(f"Number of duplicate rows: {duplicates.sum()}")

# Remove duplicates based on 'narrative' and keep the first occurrence
df_cleaned = df.drop_duplicates(subset=['narrative'], keep='first')

# Confirm that duplicates are removed
duplicates_after = df_cleaned.duplicated(subset=['narrative']).sum()
print(f"Number of duplicate rows after removal: {duplicates_after}")

# Show the shape of the cleaned DataFrame
print(f"Shape of DataFrame after removing duplicates: {df_cleaned.shape}")

# Display the first few rows of the cleaned dataset
print(df_cleaned.head())

Number of duplicate rows: 37948
Number of duplicate rows after removal: 0
Shape of DataFrame after removing duplicates: (124473, 3)
   Unnamed: 0           product  \
0           0       credit_card   
1           1       credit_card   
2           2    retail_banking   
3           3  credit_reporting   
4           4  credit_reporting   

                                           narrative  
0  purchase order day shipping amount receive pro...  
1  forwarded message date tue subject please inve...  
2  forwarded message cc sent friday pdt subject f...  
3  payment history missing credit report speciali...  
4  payment history missing credit report made mis...  


In [3]:
df_encoded = pd.get_dummies(df_cleaned['product'])
df_encoded.shape

(124473, 5)

In [7]:
df = pd.concat([df_cleaned,df_encoded],axis=1)
df.rename(columns={'Unnamed: 0': 'Complaint No.'}, inplace=True)
df

Unnamed: 0,Complaint No.,product,narrative,credit_card,credit_reporting,debt_collection,mortgages_and_loans,retail_banking
0,0,credit_card,purchase order day shipping amount receive pro...,True,False,False,False,False
1,1,credit_card,forwarded message date tue subject please inve...,True,False,False,False,False
2,2,retail_banking,forwarded message cc sent friday pdt subject f...,False,False,False,False,True
3,3,credit_reporting,payment history missing credit report speciali...,False,True,False,False,False
4,4,credit_reporting,payment history missing credit report made mis...,False,True,False,False,False
...,...,...,...,...,...,...,...,...
162411,162411,retail_banking,zelle suspended account without cause banking ...,False,False,False,False,True
162412,162412,debt_collection,zero contact made debt supposedly resolved fou...,False,False,True,False,False
162413,162413,mortgages_and_loans,zillow home loan nmls nmls actual quote provid...,False,False,False,True,False
162414,162414,debt_collection,zuntafi sent notice willing settle defaulted s...,False,False,True,False,False


In [9]:
df.reset_index(inplace=True)
df = df.drop('index',axis=1)

In [11]:
df.dropna(inplace=True)
df['narrative_length'] = df['narrative'].apply(lambda x: len(x))

In [13]:
df


Unnamed: 0,Complaint No.,product,narrative,credit_card,credit_reporting,debt_collection,mortgages_and_loans,retail_banking,narrative_length
0,0,credit_card,purchase order day shipping amount receive pro...,True,False,False,False,False,1705
1,1,credit_card,forwarded message date tue subject please inve...,True,False,False,False,False,904
2,2,retail_banking,forwarded message cc sent friday pdt subject f...,False,False,False,False,True,1230
3,3,credit_reporting,payment history missing credit report speciali...,False,True,False,False,False,903
4,4,credit_reporting,payment history missing credit report made mis...,False,True,False,False,False,851
...,...,...,...,...,...,...,...,...,...
124468,162411,retail_banking,zelle suspended account without cause banking ...,False,False,False,False,True,118
124469,162412,debt_collection,zero contact made debt supposedly resolved fou...,False,False,True,False,False,189
124470,162413,mortgages_and_loans,zillow home loan nmls nmls actual quote provid...,False,False,False,True,False,107
124471,162414,debt_collection,zuntafi sent notice willing settle defaulted s...,False,False,True,False,False,515


In [15]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')
ps = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\keert\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\keert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
def preprocess_text(text):
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    tokenized_words = word_tokenize(text.lower())  # Lowercase and tokenize
    tokenized_words = [ps.stem(word) for word in tokenized_words if word not in stop_words]  # Remove stopwords and stem
    return " ".join(tokenized_words)


In [19]:
df['narrative'] = df['narrative'].apply(preprocess_text)


In [20]:
tfidf = TfidfVectorizer(max_features=1000)  # Reduced max_features to prevent overfitting
X_text = tfidf.fit_transform(df['narrative']).toarray()

In [21]:
y = np.argmax(df[['credit_card', 'credit_reporting', 'debt_collection', 'mortgages_and_loans', 'retail_banking']].values, axis=1)

In [31]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.4, random_state=22)

# Initialize classifiers with regularization and reduced complexity
clfs = {
    'DecisionTree': DecisionTreeClassifier(max_depth=4, min_samples_split=20),  # Limiting the tree depth and split
    'LogisticRegression': LogisticRegression(solver='saga', penalty='l2', C=0.1, max_iter=1000, random_state=22),  # Regularization using L2
    'RandomForest': RandomForestClassifier(n_estimators=30, max_depth=5, random_state=2),  # Reduced number of estimators and limited depth
    'MultinomialNB': MultinomialNB()  # Adding Multinomial Naive Bayes
}

# Create Voting Classifier with soft voting (include MNB)
voting_clf = VotingClassifier(estimators=[('dt', clfs['DecisionTree']),
                                          ('lr', clfs['LogisticRegression']),
                                          ('rf', clfs['RandomForest']),
                                          ('mnb', clfs['MultinomialNB'])],  
                              voting='soft')

# Function to train classifier and evaluate
def train_classifier(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    return accuracy, precision


In [33]:
def cross_validate_classifier(clf, X_train, y_train):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')  # 5-fold cross-validation
    return np.mean(scores)

# Track accuracy, precision, and cross-validation scores
accuracy_scores = []
precision_scores = []
cv_scores = []

# Evaluate individual classifiers
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    cv_score = cross_validate_classifier(clf, X_train, y_train)
    print(f'{name} accuracy: {current_accuracy}')
    print(f'{name} precision: {current_precision}')
    print(f'{name} cross-validation accuracy: {cv_score}')
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)
    cv_scores.append(cv_score)

# Evaluate the Voting Classifier
voting_accuracy, voting_precision = train_classifier(voting_clf, X_train, y_train, X_test, y_test)
voting_cv_score = cross_validate_classifier(voting_clf, X_train, y_train)
print(f'Voting Classifier accuracy: {voting_accuracy}')
print(f'Voting Classifier precision: {voting_precision}')
print(f'Voting Classifier cross-validation accuracy: {voting_cv_score}')

# Compare performance
performance_df = pd.DataFrame({
    'Algorithm': list(clfs.keys()) + ['Voting Classifier'],
    'Accuracy': accuracy_scores + [voting_accuracy],
    'Precision': precision_scores + [voting_precision],
    'Cross-validation Accuracy': cv_scores + [voting_cv_score]
}).sort_values('Cross-validation Accuracy', ascending=False)

print(performance_df)

DecisionTree accuracy: 0.6382333447146961
DecisionTree precision: 0.6492972428851314
DecisionTree cross-validation accuracy: 0.6408017825913292
LogisticRegression accuracy: 0.8310871879330776
LogisticRegression precision: 0.8296371322664009
LogisticRegression cross-validation accuracy: 0.8284214626265165
RandomForest accuracy: 0.5418064231055052
RandomForest precision: 0.707051932336991
RandomForest cross-validation accuracy: 0.5497100785269207
MultinomialNB accuracy: 0.7959589467553074
MultinomialNB precision: 0.7968441067353452
MultinomialNB cross-validation accuracy: 0.7983343154980609
Voting Classifier accuracy: 0.8121271766856133
Voting Classifier precision: 0.8149256464883955
Voting Classifier cross-validation accuracy: 0.8151252686744066
            Algorithm  Accuracy  Precision  Cross-validation Accuracy
1  LogisticRegression  0.831087   0.829637                   0.828421
4   Voting Classifier  0.812127   0.814926                   0.815125
3       MultinomialNB  0.795959   0

In [34]:
def predict_complaint_category(complaint):
    processed_complaint = preprocess_text(complaint)
    complaint_tfidf = tfidf.transform([processed_complaint]).toarray()  # Apply the same TF-IDF transformation
    predictions = {}
    
    # Predict using trained classifiers
    for name, clf in clfs.items():
        pred = clf.predict(complaint_tfidf)
        predictions[name] = pred[0]  # Store the predicted class for each classifier
    
    # Return the predictions for all classifiers
    product_categories = df['product'].astype('category').cat.categories  # Mapping index to product categories
    predictions = {name: product_categories[pred] for name, pred in predictions.items()}
    
    return predictions



In [41]:
predict_complaint_category("name company paypal search find numerous complaint company fund without warning even though company may issue account holder yes transfer occurs issue account flagged account holder alerted prior action online teacher due lock down unable work various country trying get work online able based california operates branch world wide unfortunately online school remit teacher paypal paypal cloak rule regs verbose vague language designed confuse create mind account holder operates openly fairly protect consumer even close month ago received payment school mentioned account immediately locked told call customer service center complied call request find several occasion limited hour due restriction limited meaning every time called got automated message never spoke human began sending email ceo paypal respond noticed customer service activity increased began receiving direct email instead copy paste regs least attempt paypal tried call service center available called also issue transferring fund bank pp constantly kicked back transfer saying something wrong bank account wrong never indicated instead listed different account finally fund got transferred week receiving received another payment school online teaching time paypal immediately locked account apparent reason deleted picture id file flagged account verification needed verification good service sold needed well sure confusing online teacher selling good service must communication side paypal complied request immediately sent email ceo detailing latest fandango required satisfy additionally sent multiple complaint paypal customer service center regarding issue today paypal sent email notifying account restored however fund still frozen understand could long day acceptable messaged complaint entity already mentioned offered get fund faster providing detail product service sold product service online teacher appears paypal want freeze fund election america acceptable necessary")

{'DecisionTree': 'credit_reporting',
 'LogisticRegression': 'retail_banking',
 'RandomForest': 'retail_banking',
 'MultinomialNB': 'retail_banking'}

In [43]:
import pickle

#Save the trained voting classifier
with open('voting_classifier_model.pkl', 'wb') as model_file:
    pickle.dump(voting_clf, model_file)

# Save the fitted TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
