In [9]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn import naive_bayes #import GaussianNB, MultinomialNB
from sklearn.linear_model import SGDClassifier # simulate the behavior of logistic regression using SGDClassifier(loss='log')
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score,balanced_accuracy_score,average_precision_score, classification_report, f1_score
 
from sklearn.utils import shuffle
from collections import Counter
from itertools import product

In [2]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# Load the trained vectorizer
with open('tfidf_vectorizer_train_split_33k.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)
len(loaded_vectorizer.get_feature_names_out())

cfpb_df = pd.read_csv('cfpb_train.csv')
# some cleaning just ot make sure
cfpb_df['Consumer complaint narrative'] = cfpb_df['Consumer complaint narrative'].fillna('').astype(str)
cfpb_df['debt_collection'] = (cfpb_df['Product'] == 'Debt collection').astype(int)
cv_df = cfpb_df[['Consumer complaint narrative','debt_collection']]

  cfpb_df = pd.read_csv('cfpb_train.csv')


## 5-Fold CV

In [4]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10)

### Logistic Regression

In [5]:
%%time
f1_scores = []  # to store the f1_scores for each fold

for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # Logistic Regression Params:
    sample_size = 200000
    chi2_features = 22500
    alpha = 0.0001
    loss = 'log_loss'
    penalty=  'elasticnet'
    n_iter_no_change = 3
    early_stopping = False
    learning_rate = 'constant'
    eta0 =  0.01
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train_fold_sample, y_train_fold_sample = oversample.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = SGDClassifier(loss=loss, 
                        penalty=penalty, 
                        alpha=alpha,
                        n_iter_no_change=n_iter_no_change,
                        early_stopping=early_stopping,
                        learning_rate=learning_rate,
                        eta0=eta0,
                        random_state=42)
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.8951089588377724
balanced_accuracy_score 0.8538980364340482
average_precision_score 0.5691223953833455
f1_score 0.7270665322580645
              precision    recall  f1-score   support

           0       0.95      0.92      0.94    135990
           1       0.67      0.79      0.73     29210

    accuracy                           0.90    165200
   macro avg       0.81      0.85      0.83    165200
weighted avg       0.90      0.90      0.90    165200

accuracy_score 0.8970332750198245
balanced_accuracy_score 0.8569319842905218
average_precision_score 0.5753254956927794
f1_score 0.731897991993191
              precision    recall  f1-score   support

           0       0.95      0.92      0.94    135990
           1       0.68      0.79      0.73     29209

    accuracy                           0.90    165199
   macro avg       0.82      0.86      0.83    165199
weighted avg       0.91      0.90      0.90    165199

accuracy_score 0.8952112300921918
balanced_accuracy

### Naive Bayes

In [8]:
%%time
f1_scores = []  # to store the f1_scores for each fold

for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # Logistic Regression Params:
    sample_size = 150000
    chi2_features = 27500
    alpha =  0.001
    fit_prior =  True
    class_prior =  None
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = naive_bayes.MultinomialNB(
        alpha=alpha,
        fit_prior=fit_prior, 
        class_prior=class_prior
    )
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.9176937046004843
balanced_accuracy_score 0.8498070497712402
average_precision_score 0.6259304729207066
f1_score 0.7619030941916052
              precision    recall  f1-score   support

           0       0.95      0.95      0.95    135990
           1       0.78      0.74      0.76     29210

    accuracy                           0.92    165200
   macro avg       0.86      0.85      0.86    165200
weighted avg       0.92      0.92      0.92    165200

accuracy_score 0.9183893364971943
balanced_accuracy_score 0.8498357092577018
average_precision_score 0.6281607965116728
f1_score 0.7631911754373638
              precision    recall  f1-score   support

           0       0.95      0.96      0.95    135990
           1       0.78      0.74      0.76     29209

    accuracy                           0.92    165199
   macro avg       0.86      0.85      0.86    165199
weighted avg       0.92      0.92      0.92    165199

accuracy_score 0.9160648672207459
balanced_accurac

### SVM NON-LINEAR

In [10]:
%%time
f1_scores = []  # to store the f1_scores for each fold

for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # SVM Params:
    sample_size =400000
    chi2_features = 30000
    alpha = 0.0001
    loss = 'squared_hinge'
    penalty=  'elasticnet'
    n_iter_no_change = 3
    early_stopping = False
    learning_rate = 'constant'
    eta0 =  0.01
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train_fold_sample, y_train_fold_sample = oversample.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = SGDClassifier(
                    loss=loss, 
                    penalty=penalty, 
                    alpha=alpha, 
                    n_iter_no_change=n_iter_no_change, 
                    early_stopping=early_stopping, 
                    learning_rate=learning_rate, 
                    eta0=eta0, 
                    random_state=42
                )
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.905817191283293
balanced_accuracy_score 0.8822836256839575
average_precision_score 0.6116184341435366
f1_score 0.760538668718738
              precision    recall  f1-score   support

           0       0.97      0.92      0.94    135990
           1       0.69      0.85      0.76     29210

    accuracy                           0.91    165200
   macro avg       0.83      0.88      0.85    165200
weighted avg       0.92      0.91      0.91    165200

accuracy_score 0.9079715978910284
balanced_accuracy_score 0.8851490945324623
average_precision_score 0.6184577167163845
f1_score 0.7655630773026569
              precision    recall  f1-score   support

           0       0.97      0.92      0.94    135990
           1       0.70      0.85      0.77     29209

    accuracy                           0.91    165199
   macro avg       0.83      0.89      0.85    165199
weighted avg       0.92      0.91      0.91    165199

accuracy_score 0.9032500196732426
balanced_accuracy_

### SVM Linear

In [11]:
%%time
f1_scores = []  # to store the f1_scores for each fold

for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # SVM Linear
    sample_size =100000
    chi2_features = 20000
    alpha = 0.0001
    loss = 'hinge'
    penalty=  'elasticnet'
    n_iter_no_change = 3
    early_stopping = False
    learning_rate = 'adaptive'
    eta0 =  0.01
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train_fold_sample, y_train_fold_sample = oversample.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = SGDClassifier(
                    loss=loss, 
                    penalty=penalty, 
                    alpha=alpha, 
                    n_iter_no_change=n_iter_no_change, 
                    early_stopping=early_stopping, 
                    learning_rate=learning_rate, 
                    eta0=eta0, 
                    random_state=42
                )
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.9049818401937046
balanced_accuracy_score 0.8592630887257127
average_precision_score 0.5953194043288259
f1_score 0.7458510758868578
              precision    recall  f1-score   support

           0       0.95      0.93      0.94    135990
           1       0.71      0.79      0.75     29210

    accuracy                           0.90    165200
   macro avg       0.83      0.86      0.84    165200
weighted avg       0.91      0.90      0.91    165200

accuracy_score 0.9054836893685797
balanced_accuracy_score 0.8608818828224665
average_precision_score 0.597525618962122
f1_score 0.7476484468435853
              precision    recall  f1-score   support

           0       0.95      0.93      0.94    135990
           1       0.71      0.79      0.75     29209

    accuracy                           0.91    165199
   macro avg       0.83      0.86      0.84    165199
weighted avg       0.91      0.91      0.91    165199

accuracy_score 0.9047815059413192
balanced_accuracy

### Decision Tree

In [14]:
%%time
f1_scores = []  # to store the f1_scores for each fold
{'sample_size': 300000, 'chi2_features': 30000, 'clf__n_estimators': 50, 'clf__criterion': 'entropy', 'clf__max_depth': 50, 
                                         'clf__min_samples_split': 100, 'clf__min_samples_leaf': 50, 'clf__class_weight': None, 'clf__bootstrap': False}
for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # Tree Params:
    sample_size = 300000
    chi2_features = 30000
    criterion='entropy'
    max_depth=60
    min_samples_split=50
    min_samples_leaf=50
    class_weight=None
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train_fold_sample, y_train_fold_sample = oversample.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = DecisionTreeClassifier(
                    criterion=criterion, 
                    max_depth=max_depth, 
                    min_samples_split=min_samples_split, 
                    min_samples_leaf=min_samples_leaf, 
                    class_weight=class_weight,
                    random_state=42
                )
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.8526331719128329
balanced_accuracy_score 0.7909348523547468
average_precision_score 0.4488863674817464
f1_score 0.6253174297806848
              precision    recall  f1-score   support

           0       0.93      0.89      0.91    135990
           1       0.57      0.70      0.63     29210

    accuracy                           0.85    165200
   macro avg       0.75      0.79      0.77    165200
weighted avg       0.87      0.85      0.86    165200

accuracy_score 0.856361115987385
balanced_accuracy_score 0.7927509500811115
average_precision_score 0.4554442147700546
f1_score 0.6309125694109596
              precision    recall  f1-score   support

           0       0.93      0.89      0.91    135990
           1       0.58      0.69      0.63     29209

    accuracy                           0.86    165199
   macro avg       0.75      0.79      0.77    165199
weighted avg       0.87      0.86      0.86    165199

accuracy_score 0.8617425044945792
balanced_accuracy

### Random Forest

In [12]:
%%time
f1_scores = []  # to store the f1_scores for each fold

for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # RF Params
    sample_size = 300000
    chi2_features = 30000
    n_estimators = 50
    criterion ='entropy'
    max_depth = 50
    min_samples_split = 100
    min_samples_leaf = 50
    class_weight = None
    bootstrap = False
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train_fold_sample, y_train_fold_sample = oversample.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = RandomForestClassifier(
        n_estimators=n_estimators, 
        criterion=criterion, 
        max_depth=max_depth, 
        min_samples_split=min_samples_split, 
        min_samples_leaf=min_samples_leaf, 
        class_weight=class_weight,
        bootstrap=bootstrap,
        random_state=42,
        n_jobs = -1
    )
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.8843523002421307
balanced_accuracy_score 0.8243136936962383
average_precision_score 0.5264748829073798
f1_score 0.6910325867227298
              precision    recall  f1-score   support

           0       0.94      0.92      0.93    135990
           1       0.65      0.73      0.69     29210

    accuracy                           0.88    165200
   macro avg       0.80      0.82      0.81    165200
weighted avg       0.89      0.88      0.89    165200

accuracy_score 0.8851385298942488
balanced_accuracy_score 0.8288732188654833
average_precision_score 0.5312346175737108
f1_score 0.6954791288857505
              precision    recall  f1-score   support

           0       0.94      0.92      0.93    135990
           1       0.65      0.74      0.70     29209

    accuracy                           0.89    165199
   macro avg       0.80      0.83      0.81    165199
weighted avg       0.89      0.89      0.89    165199

accuracy_score 0.8828867002826893
balanced_accurac