In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn import naive_bayes #import GaussianNB, MultinomialNB
from sklearn.linear_model import SGDClassifier # simulate the behavior of logistic regression using SGDClassifier(loss='log')
from sklearn.metrics import accuracy_score,balanced_accuracy_score,average_precision_score, classification_report

from sklearn.utils import shuffle
from collections import Counter

In [2]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
train_df = pd.read_csv('cfpb_train.csv')
test_df = pd.read_csv('cfpb_test.csv')
dev_df = pd.read_csv('cfpb_dev.csv')

train_df['Consumer complaint narrative'] = train_df['Consumer complaint narrative'].fillna('').astype(str)
test_df['Consumer complaint narrative'] = test_df['Consumer complaint narrative'].fillna('').astype(str)
dev_df['Consumer complaint narrative'] = dev_df['Consumer complaint narrative'].fillna('').astype(str)

train_df['debt_collection'] = (train_df['Product'] == 'Debt collection').astype(int)
test_df['debt_collection'] = (test_df['Product'] == 'Debt collection').astype(int)
dev_df['debt_collection'] = (dev_df['Product'] == 'Debt collection').astype(int)

train_df_sample = train_df.sample(10000)

  train_df = pd.read_csv('cfpb_train.csv')
  dev_df = pd.read_csv('cfpb_dev.csv')


In [4]:
# Load the trained vectorizer
with open('tfidf_vectorizer_train_split_33k.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)
len(loaded_vectorizer.get_feature_names_out())

30288

In [5]:
%%time
# Vectorize the text data with pre-tuned vectorizer
X_train = loaded_vectorizer.transform(train_df['Consumer complaint narrative'])
y_train = train_df['debt_collection']

X_train_sample = loaded_vectorizer.transform(train_df_sample['Consumer complaint narrative'])
y_train_sample = train_df_sample['debt_collection']

X_dev = loaded_vectorizer.transform(dev_df['Consumer complaint narrative'])
y_dev = dev_df['debt_collection']

X_test = loaded_vectorizer.transform(test_df['Consumer complaint narrative'])
y_test = test_df['debt_collection']

Wall time: 3min 50s


In [6]:
# %%time
# selector = SelectKBest(chi2, k=30000)
# X_train = selector.fit_transform(X_train, y_train)
# X_train_sample = selector.transform(X_train_sample)
# X_dev = selector.transform(X_dev)
# X_test = selector.transform(X_test)

In [7]:
%%time
# some balancing
# from imblearn.over_sampling import SMOTE 
# sm = SMOTE(random_state=42)
# X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
# # Shuffle your data
# X_train_res, y_train_res = shuffle(X_train_res, y_train_res)
# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')
# fit and apply the transform
X_train_res, y_train_res = oversample.fit_resample(X_train, y_train)

# Shuffle your data
X_train_res, y_train_res = shuffle(X_train_res, y_train_res)

Wall time: 780 ms


### SGD Linear SVM

In [8]:
# Initialize a LogisticRegression model, ‘log_loss’ gives logistic regression, a probabilistic classifier.
clf= SGDClassifier(loss='hinge',random_state=42, alpha=1e-4, n_iter_no_change=3, early_stopping=False)

# The partial fit if you ran out of RAM
batch_size = 50000
n_batches = X_train_res.shape[0] // batch_size

# partial fitting
for i in tqdm(range(n_batches)):
    start = i * batch_size
    end = (i + 1) * batch_size
    batch_X = X_train_res[start:end]
    batch_y = y_train_res[start:end]
    if i == 0:
        clf.partial_fit(batch_X.A, batch_y, classes=np.unique(y_train_res)) 
        # The `.A` here converts the sparse matrix to a dense matrix.
        # This is necessary because GaussianNB doesn't support sparse matrices.
        # We also specify the classes parameter in the first call to partial_fit.
    else:
        clf.partial_fit(batch_X.A, batch_y)

100%|██████████████████████████████████████████████████████████████████████████████████| 27/27 [05:23<00:00, 11.98s/it]


In [9]:
# get the results
y_pred = clf.predict(X_train_sample.toarray())
# y_pred_proba = clf.predict_proba(X_train_sample.toarray())

# Convert lists to arrays for further use
y_pred = np.array(y_pred)
# y_pred_proba = np.array(y_pred_proba)

print("Train")
print("accuracy_score",accuracy_score(y_train_sample, y_pred))
print("balanced_accuracy_score",balanced_accuracy_score(y_train_sample, y_pred))
print("average_precision_score",average_precision_score(y_train_sample, y_pred))
print(classification_report(y_train_sample, y_pred))

# c = Counter(y_pred)
# print("Prediction", c.most_common(2))
# c = Counter(y_train_sample)
# print("Ground Truth",c.most_common(2))

# prediction = pd.DataFrame(y_pred_proba)
# prediction['result'] = y_pred

# df = prediction.copy()
# df.columns = ['neg', 'pos', 'class']
# df['true'] = y_train_sample

# fig, axs = plt.subplots(ncols=2, figsize=(15, 5)) # Adjust figsize as needed

# # Plot Probability Density for Different True Classes
# for class_label in df['true'].unique():
#     sns.kdeplot(df[df['true'] == class_label]['pos'], label=class_label, ax=axs[0])

# axs[0].set_title('Probability Density for Different True Classes')
# axs[0].set_xlabel('Probability')
# axs[0].set_ylabel('Density')
# axs[0].legend(title='True Class')

# # Plot Probability Density for Different Predicted Classes
# for class_label in df['class'].unique():
#     sns.kdeplot(df[df['class'] == class_label]['pos'], label=class_label, ax=axs[1])

# axs[1].set_title('Probability Density for Different Predicted Classes')
# axs[1].set_xlabel('Probability')
# axs[1].set_ylabel('Density')
# axs[1].legend(title='Predicted Class')

# plt.tight_layout()
# plt.show()

Train
accuracy_score 0.9103
balanced_accuracy_score 0.8800004452770154
average_precision_score 0.617592840990541
              precision    recall  f1-score   support

           0       0.96      0.93      0.94      8253
           1       0.71      0.83      0.76      1747

    accuracy                           0.91     10000
   macro avg       0.83      0.88      0.85     10000
weighted avg       0.92      0.91      0.91     10000



In [10]:
# get the results
y_pred = clf.predict(X_dev.toarray())
# y_pred_proba = clf.predict_proba(X_dev.toarray())

# Convert lists to arrays for further use
y_pred = np.array(y_pred)
# y_pred_proba = np.array(y_pred_proba)

print("Train")
print("accuracy_score",accuracy_score(y_dev, y_pred))
print("balanced_accuracy_score",balanced_accuracy_score(y_dev, y_pred))
print("average_precision_score",average_precision_score(y_dev, y_pred))
print(classification_report(y_dev, y_pred))

# c = Counter(y_pred)
# print("Prediction", c.most_common(2))
# c = Counter(y_dev)
# print("Ground Truth",c.most_common(2))

# prediction = pd.DataFrame(y_pred_proba)
# prediction['result'] = y_pred

# df = prediction.copy()
# df.columns = ['neg', 'pos', 'class']
# df['true'] = y_dev

# fig, axs = plt.subplots(ncols=2, figsize=(15, 5)) # Adjust figsize as needed

# # Plot Probability Density for Different True Classes
# for class_label in df['true'].unique():
#     sns.kdeplot(df[df['true'] == class_label]['pos'], label=class_label, ax=axs[0])

# axs[0].set_title('Probability Density for Different True Classes')
# axs[0].set_xlabel('Probability')
# axs[0].set_ylabel('Density')
# axs[0].legend(title='True Class')

# # Plot Probability Density for Different Predicted Classes
# for class_label in df['class'].unique():
#     sns.kdeplot(df[df['class'] == class_label]['pos'], label=class_label, ax=axs[1])

# axs[1].set_title('Probability Density for Different Predicted Classes')
# axs[1].set_xlabel('Probability')
# axs[1].set_ylabel('Density')
# axs[1].legend(title='Predicted Class')

# plt.tight_layout()
# plt.show()

Train
accuracy_score 0.9080992736077482
balanced_accuracy_score 0.8746854623937779
average_precision_score 0.6118697432874209
              precision    recall  f1-score   support

           0       0.96      0.93      0.94    136036
           1       0.71      0.82      0.76     29164

    accuracy                           0.91    165200
   macro avg       0.83      0.87      0.85    165200
weighted avg       0.92      0.91      0.91    165200



### Non-Linear SVM

In [11]:
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

In [12]:
# Standardize the features to have mean=0 and variance=1
scaler = StandardScaler(with_mean=False).fit(X_train_res)
X_train_scaled = scaler.transform(X_train_res)
X_train_sample_scaled = scaler.transform(X_train_sample)
X_dev_scaled = scaler.transform(X_dev)

In [13]:
# Use the Nystroem method to approximate a radial basis function (RBF) kernel
nystroem = Nystroem(kernel='rbf', gamma=0.2, random_state=42, n_components=1500)
nystroem.fit(X_train_scaled)
X_train_transformed = nystroem.transform(X_train_scaled)
X_train_sample_transformed = nystroem.transform(X_train_sample_scaled)
X_dev_transformed = nystroem.transform(X_dev_scaled)

In [14]:
# Initialize a LogisticRegression model, ‘log_loss’ gives logistic regression, a probabilistic classifier.
clf= SGDClassifier(loss='hinge',random_state=42, alpha=1e-4, n_iter_no_change=3, early_stopping=False)

# The partial fit if you ran out of RAM
batch_size = 50000
n_batches = X_train_transformed.shape[0] // batch_size

# partial fitting
for i in tqdm(range(n_batches)):
    start = i * batch_size
    end = (i + 1) * batch_size
    batch_X = X_train_transformed[start:end]
    batch_y = y_train_res[start:end]
    if i == 0:
        clf.partial_fit(batch_X, batch_y, classes=np.unique(y_train_res)) 
        # The `.A` here converts the sparse matrix to a dense matrix.
        # This is necessary because GaussianNB doesn't support sparse matrices.
        # We also specify the classes parameter in the first call to partial_fit.
    else:
        clf.partial_fit(batch_X, batch_y)

100%|██████████████████████████████████████████████████████████████████████████████████| 27/27 [00:18<00:00,  1.46it/s]


In [15]:
# get the results
y_pred = clf.predict(X_train_sample_transformed)
# y_pred_proba = clf.predict_proba(X_train_sample.toarray())

# Convert lists to arrays for further use
y_pred = np.array(y_pred)
# y_pred_proba = np.array(y_pred_proba)

print("Train")
print("accuracy_score",accuracy_score(y_train_sample, y_pred))
print("balanced_accuracy_score",balanced_accuracy_score(y_train_sample, y_pred))
print("average_precision_score",average_precision_score(y_train_sample, y_pred))
print(classification_report(y_train_sample, y_pred))

Train
accuracy_score 0.1871
balanced_accuracy_score 0.5072867988334852
average_precision_score 0.17682681511138804
              precision    recall  f1-score   support

           0       0.99      0.02      0.03      8253
           1       0.18      1.00      0.30      1747

    accuracy                           0.19     10000
   macro avg       0.58      0.51      0.17     10000
weighted avg       0.85      0.19      0.08     10000



In [16]:
# get the results
y_pred = clf.predict(X_dev_transformed)
# y_pred_proba = clf.predict_proba(X_dev.toarray())

# Convert lists to arrays for further use
y_pred = np.array(y_pred)
# y_pred_proba = np.array(y_pred_proba)

print("Train")
print("accuracy_score",accuracy_score(y_dev, y_pred))
print("balanced_accuracy_score",balanced_accuracy_score(y_dev, y_pred))
print("average_precision_score",average_precision_score(y_dev, y_pred))
print(classification_report(y_dev, y_pred))


Train
accuracy_score 0.18854721549636805
balanced_accuracy_score 0.5071709670194323
average_precision_score 0.17864737135014022
              precision    recall  f1-score   support

           0       1.00      0.01      0.03    136036
           1       0.18      1.00      0.30     29164

    accuracy                           0.19    165200
   macro avg       0.59      0.51      0.17    165200
weighted avg       0.85      0.19      0.08    165200

