In [None]:
# LABEL ENCODING + Model Training
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib


# Load the training complaints and keep only the rows that have a narrative.
# Texts and labels are both selected through the same index (valid_idx), so
# they stay row-aligned by construction.
df = pd.read_csv("downloads/complaints_train.csv")
valid_idx = df["narrative"].dropna().index
texts = df.loc[valid_idx, "narrative"].astype(str).tolist()

# NOTE(review): astype(str) turns a missing product into the literal label
# "nan", which LabelEncoder would then treat as a class of its own -- confirm
# the "product" column has no missing values for these rows.
labels = df.loc[valid_idx, "product"].astype(str).tolist()

# Map product names to integer class ids; le.classes_ holds the names in
# id order.
le = LabelEncoder()
y_encoded = le.fit_transform(labels)

# Fail loudly on misalignment. A bare `len(texts) == len(y_encoded)`
# expression only displays a value in a notebook and enforces nothing.
if len(texts) != len(y_encoded):
    raise ValueError(
        f"Mismatch in texts and labels: {len(texts)} texts, "
        f"{len(y_encoded)} labels"
    )

# Save label encoder (only after the data passed the check above)
joblib.dump(le, 'label_encoder.joblib')



In [None]:
# Load features
# Presumably one precomputed feature row per labelled narrative, in the same
# row order as y_encoded -- only the row count can be verified here.
X = np.load('X_train_full.npy')

# Explicit raise instead of `assert`: assertions are removed when Python runs
# with -O, which would let a mismatched matrix reach clf.fit().
if X.shape[0] != len(y_encoded):
    raise ValueError(
        "Mismatch in features and labels: "
        f"X has {X.shape[0]} rows, y_encoded has {len(y_encoded)} entries"
    )

In [None]:
# Train model
# Multiclass logistic regression fitted on the full feature matrix; no rows
# are held out. max_iter is raised to 1000 to give lbfgs more iterations than
# the library default.
clf = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)
clf.fit(X, y_encoded)

# Save model
joblib.dump(clf, 'logreg_model.joblib')

# Optional: quick check
# The predictions below are made on the same rows the model was fitted on, so
# this is an in-sample sanity check, not an estimate of performance on unseen
# complaints. The header line makes that explicit in the printed output.
y_pred = clf.predict(X)
print("Training-set metrics (in-sample, no held-out data):")
print(classification_report(y_encoded, y_pred, target_names=le.classes_))

#### Model trained on 129,927 samples; weighted F1 score of 0.82, measured on the training set.

| Predicted ↓ / Actual → | Credit Card | Credit Reporting | Debt Collection | Mortgages & Loans | Retail Banking |
| ---------------------- | ----------- | ---------------- | --------------- | ----------------- | -------------- |
| **Credit Card**        | 8,332       | 2,131            | 1,107           | 420               | 462            |
| **Credit Reporting**   | 1,337       | 66,938           | 3,145           | 1,208             | 1,309          |
| **Debt Collection**    | 1,278       | 3,067            | 11,522          | 854               | 1,797          |
| **Mortgages & Loans**  | 323         | 765              | 645             | 11,681            | 1,778          |
| **Retail Banking**     | 1,182       | 1,036            | 2,099           | 1,029             | 8,482          |


#### Classification Report

| Class                | Precision | Recall | F1-Score | Support |
|----------------------|-----------|--------|----------|---------|
| Credit Card          | 0.72      | 0.67   | 0.69     | 12,452  |
| Credit Reporting     | 0.87      | 0.92   | 0.89     | 72,937  |
| Debt Collection      | 0.74      | 0.62   | 0.67     | 18,518  |
| Mortgages & Loans    | 0.79      | 0.77   | 0.78     | 15,192  |
| Retail Banking       | 0.79      | 0.79   | 0.79     | 10,828  |

| Metric              | Value  |
|---------------------|--------|
| Accuracy            | 0.82   |
| Macro Avg F1        | 0.77   |
| Weighted Avg F1     | 0.82   |

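*Trained on 129,927 complaints without a validation split; all metrics above are computed on the training data itself, so they likely overstate performance on unseen complaints.*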

