In [5]:
# %%
"""
Fixed version: indentation error corrected for clean_text() function
Jupyter-ready script for Consumer Complaint text classification
"""

# %%
import os, re, gc, joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk.download('stopwords')
# nltk.download('wordnet')

DATA_PATH = "D:\csv\complaints.csv"
SAMPLE_SIZE = 50000
RANDOM_STATE = 42
LABEL_MAPPING = {
    'Credit reporting': 0,
    'Credit reporting, credit repair services, or other': 0,
    'Debt collection': 1,
    'Consumer Loan': 2,
    'Mortgage': 3,
}

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
contraction_mapping = {"ain't": "is not", "can't": "cannot", "won't": "will not", "i'm": "i am", "it's": "it is"}

# fixed indentation
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    for k, v in contraction_mapping.items():
        text = text.replace(k, v)
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'http\S+|www\.\S+', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = [w for w in text.split() if len(w) > 1 and w not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

# %% Load data sample
usecols = ['Product', 'Issue', 'Consumer complaint narrative']
chunks = []
for chunk in pd.read_csv(DATA_PATH, usecols=usecols, chunksize=100000):
    def map_label(p):
        if not isinstance(p, str):
            return None
        for k, v in LABEL_MAPPING.items():
            if k.lower() in p.lower():
                return v
        return None
    chunk['label'] = chunk['Product'].apply(map_label)
    filtered = chunk[chunk['label'].notnull()]
    chunks.append(filtered)
    print('Loaded chunk:', len(filtered))

data = pd.concat(chunks, ignore_index=True)
data = data.groupby('label', group_keys=False).apply(lambda x: x.sample(min(len(x), SAMPLE_SIZE//4), random_state=RANDOM_STATE))
data['text'] = data['Issue'].fillna('') + ' ' + data['Consumer complaint narrative'].fillna('')
data['clean_text'] = data['text'].apply(clean_text)

X = data['clean_text']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)

# %% Train models
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, n_jobs=-1),
    'MultinomialNB': MultinomialNB(),
    'LinearSVC': LinearSVC(max_iter=5000)
}

for name, clf in models.items():
    print(f"\nTraining {name}...")
    pipe = Pipeline([('tfidf', tfidf), ('clf', clf)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    print(name, 'Accuracy:', accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))

print("Done.")

Loaded chunk: 92282
Loaded chunk: 89586
Loaded chunk: 89062
Loaded chunk: 88925
Loaded chunk: 89143
Loaded chunk: 89040
Loaded chunk: 89054
Loaded chunk: 88978
Loaded chunk: 89088
Loaded chunk: 88880
Loaded chunk: 88711
Loaded chunk: 89058
Loaded chunk: 88938
Loaded chunk: 89047
Loaded chunk: 88933
Loaded chunk: 89036
Loaded chunk: 88843
Loaded chunk: 89163
Loaded chunk: 89148
Loaded chunk: 89015
Loaded chunk: 89297
Loaded chunk: 89008
Loaded chunk: 89301
Loaded chunk: 88634
Loaded chunk: 88224
Loaded chunk: 88727
Loaded chunk: 88448
Loaded chunk: 88473
Loaded chunk: 88408
Loaded chunk: 88428
Loaded chunk: 89189
Loaded chunk: 89175
Loaded chunk: 89062
Loaded chunk: 88879
Loaded chunk: 88728
Loaded chunk: 89116
Loaded chunk: 88905
Loaded chunk: 89028
Loaded chunk: 88661
Loaded chunk: 88736
Loaded chunk: 88787
Loaded chunk: 88561
Loaded chunk: 88790
Loaded chunk: 88386
Loaded chunk: 88319
Loaded chunk: 88437
Loaded chunk: 87711
Loaded chunk: 88492
Loaded chunk: 88818
Loaded chunk: 88865


  data = data.groupby('label', group_keys=False).apply(lambda x: x.sample(min(len(x), SAMPLE_SIZE//4), random_state=RANDOM_STATE))



Training LogisticRegression...
LogisticRegression Accuracy: 0.9873
              precision    recall  f1-score   support

         0.0       0.98      0.99      0.98      2500
         1.0       0.99      0.99      0.99      2500
         2.0       0.99      0.99      0.99      2500
         3.0       0.99      0.98      0.99      2500

    accuracy                           0.99     10000
   macro avg       0.99      0.99      0.99     10000
weighted avg       0.99      0.99      0.99     10000


Training MultinomialNB...
MultinomialNB Accuracy: 0.9768
              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97      2500
         1.0       0.98      0.97      0.98      2500
         2.0       0.98      0.98      0.98      2500
         3.0       0.98      0.98      0.98      2500

    accuracy                           0.98     10000
   macro avg       0.98      0.98      0.98     10000
weighted avg       0.98      0.98      0.98     10000


Tra