In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

# Download required NLTK data (safe to re-run).
#   punkt / punkt_tab : tokenizer models used by word_tokenize. Newer NLTK
#                       releases load 'punkt_tab' instead of 'punkt', so both
#                       are requested to work across versions.
#   stopwords         : English stop-word list.
#   wordnet           : data for WordNetLemmatizer.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load the dataset
df = pd.read_csv('complaints.csv')

# Define mapping for target categories (Product label -> integer class id)
product_to_category = {
    "Credit reporting, credit repair services, or other personal consumer reports": 0,
    "Debt collection": 1,
    "Consumer Loan": 2,
    "Mortgage": 3
}

# Row filters: a narrative must be present and the product must be one of the
# target categories.
has_narrative = df['Consumer complaint narrative'].notna()
is_target_product = df['Product'].isin(list(product_to_category))

# Build the modelling frame in one chained expression. `assign` returns a new
# frame, so the 'target' column is never written onto a filtered slice of the
# original (which would raise SettingWithCopyWarning). Rows whose mapping
# failed (if any) are dropped, and the class ids are stored as integers so
# they can be used directly as list indices later on.
df = (
    df[has_narrative & is_target_product]
    .assign(target=lambda frame: frame['Product'].map(product_to_category))
    .dropna(subset=['target'])
    .astype({'target': int})
)

# EDA: Display count of complaints for each category
category_names = ["Credit reporting, repair, or other", "Debt collection", "Consumer Loan", "Mortgage"]

# value_counts() omits categories that have no rows. Reindexing on every class
# id (0 .. len(category_names) - 1) reports such a category as 0 and keeps the
# bars in the chart aligned with the tick labels applied below.
category_counts = (
    df['target']
    .value_counts()
    .reindex(range(len(category_names)), fill_value=0)
)
print("Count of complaints per category:")
for i, name in enumerate(category_names):
    print(f"{name}: {category_counts.get(i, 0)}")

# Create a bar chart for class distribution
fig, ax = plt.subplots(figsize=(10, 6))
category_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Complaints by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Complaints')
ax.set_xticks(range(len(category_names)))
ax.set_xticklabels(category_names, rotation=45)
fig.tight_layout()
plt.show()

# Analyze class imbalance: share of each category as a percentage of all rows
total_samples = len(df)
imbalance_ratios = category_counts / total_samples * 100
print("\nPercentage distribution:")
for i, name in enumerate(category_names):
    print(f"{name}: {imbalance_ratios.get(i, 0):.2f}%")

print("\nClass Imbalance Analysis:")
print("The dataset shows class imbalance, with some categories (e.g., Credit reporting) likely having more samples than others (e.g., Consumer Loan).")
print("This can lead to biased models favoring majority classes. We will use stratified splitting and evaluate with balanced metrics like F1-score.")

# Define features (raw narrative text) and target (integer class id)
X = df['Consumer complaint narrative']
y = df['target']

In [None]:
# Define preprocessing function
# The stop-word set and the lemmatizer are created once here and reused by
# every call to preprocess_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize a raw complaint narrative into a cleaned, space-joined string.

    Steps: lowercase, replace every character that is not an ASCII letter or
    whitespace with a space, tokenize with ``word_tokenize``, drop tokens
    found in ``stop_words``, and lemmatize the rest with ``lemmatizer``.

    Args:
        text: Raw text. Non-string input (e.g. ``None`` or a NaN from a
            missing CSV cell) is treated as an empty document.

    Returns:
        str: The remaining lemmatized tokens joined by single spaces
        (the empty string when nothing remains).
    """
    # Guard against missing values so .lower() does not raise AttributeError.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Replace punctuation and numbers with a space instead of deleting them,
    # so words separated only by punctuation remain separate tokens
    # (e.g. "and/or" -> "and or" rather than "andor").
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stop words and lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Join back into string
    return ' '.join(tokens)

# Clean every narrative with preprocess_text
X_preprocessed = X.apply(preprocess_text)

# Print the first cleaned narrative as a quick sanity check
print("Sample preprocessed text:", X_preprocessed.iloc[0], sep="\n")

In [None]:
# Hold out 20% of the samples for testing; stratify on y so the split
# preserves the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Candidate classifiers, keyed by the display name used in later reports
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Linear SVC': LinearSVC(max_iter=1000, random_state=42),
}

# Fit one TF-IDF (unigrams + bigrams, at most 5000 features) + classifier
# pipeline per candidate and keep the fitted pipelines by name.
pipelines = {}
for name, model in models.items():
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
    pipeline = Pipeline(steps=[('tfidf', tfidf), ('clf', model)])
    pipelines[name] = pipeline.fit(X_train, y_train)

print("Models trained successfully.")

In [None]:
# Evaluate models
# Class ids in the same order as category_names. Passing them explicitly keeps
# report rows and confusion-matrix axes aligned with the names even when a
# class is missing from y_test / y_pred; without `labels`,
# classification_report raises if the number of observed classes differs from
# len(target_names). zero_division=0 reports undefined precision/recall as 0
# without emitting a warning.
class_labels = list(range(len(category_names)))
report_kwargs = dict(labels=class_labels, target_names=category_names, zero_division=0)

results = {}
for name, pipeline in pipelines.items():
    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Dict form of the report is used only to pull out the macro-averaged F1.
    report = classification_report(y_test, y_pred, output_dict=True, **report_kwargs)
    f1_macro = report['macro avg']['f1-score']

    results[name] = {'Accuracy': accuracy, 'F1-macro': f1_macro}

    print(f"\n--- {name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, **report_kwargs))

    # Confusion Matrix (rows = true class, columns = predicted class)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=category_names, yticklabels=category_names)
    plt.title(f'Confusion Matrix - {name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

# Summary table: one row per model, columns 'Accuracy' and 'F1-macro'
results_df = pd.DataFrame(results).T
print("\nModel Comparison Summary:")
print(results_df)

# Declare best model: the one with the highest macro-averaged F1
best_model = results_df['F1-macro'].idxmax()
print(f"\nBest performing model based on F1-macro: {best_model}")

In [None]:
# Fitted pipeline of the model selected above; used for all predictions below
best_pipeline = pipelines[best_model]

# Prediction function
def predict_category(raw_text):
    """Return the category name predicted for one raw complaint text.

    The text is cleaned with ``preprocess_text``, classified by the
    module-level ``best_pipeline``, and the predicted class id is looked up
    in ``category_names``.
    """
    cleaned = preprocess_text(raw_text)
    predicted_ids = best_pipeline.predict([cleaned])
    return category_names[predicted_ids[0]]

# Example complaints
examples = [
    "My credit report has errors that are affecting my score and I can't get them fixed.",
    "The debt collector is harassing me with calls about a debt I don't owe.",
    "I took out a personal loan and the interest rates are higher than promised.",
    "The bank is foreclosing on my home without proper notice."
]

print("Predictions on example complaints:")
for example in examples:
    prediction = predict_category(example)
    # Show at most 100 characters; append "..." only when the text was
    # actually cut, so short complaints are not displayed as truncated.
    preview = example if len(example) <= 100 else f"{example[:100]}..."
    print(f"Complaint: {preview}")
    print(f"Predicted Category: {prediction}\n")