In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn import naive_bayes #import GaussianNB, MultinomialNB
from sklearn.linear_model import SGDClassifier # simulate the behavior of logistic regression using SGDClassifier(loss='log')
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score,balanced_accuracy_score,average_precision_score, classification_report, f1_score
 
from sklearn.utils import shuffle
from collections import Counter
from itertools import product

In [2]:
from IPython.display import HTML, display

# Inject a CSS override so the notebook's `.container` element spans the full
# browser width; purely cosmetic, nothing downstream reads this.
full_width_css = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_css)

In [3]:
# Load the trained vectorizer
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load this artifact if it comes from a trusted source.
# NOTE(review): the vectorizer is fitted outside the cross-validation loops below;
# if it was fitted on rows that later land in a validation fold, the reported
# scores are slightly optimistic -- confirm how the pickle was produced.
with open('tfidf_vectorizer_train_split_33k.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)
# A bare expression in the middle of a cell is discarded, so print the count explicitly.
print('Vectorizer feature count:', len(loaded_vectorizer.get_feature_names_out()))

cfpb_df = pd.read_csv('cfpb_train.csv')
# some cleaning just to make sure: missing narratives become empty strings and
# every narrative is a str, so the vectorizer never sees NaN
cfpb_df['Consumer complaint narrative'] = cfpb_df['Consumer complaint narrative'].fillna('').astype(str)
# binary target: 1 when the complaint's Product is 'Debt collection', else 0
cfpb_df['debt_collection'] = (cfpb_df['Product'] == 'Debt collection').astype(int)
# the cross-validation cells only need the text and the target
cv_df = cfpb_df[['Consumer complaint narrative','debt_collection']]

  cfpb_df = pd.read_csv('cfpb_train.csv')


## 10-Fold CV

In [4]:
from sklearn.model_selection import StratifiedKFold

# Shared splitter for every cross-validation cell below: 10 folds, each keeping
# the class ratio of the target. No shuffle/random_state is passed, so
# scikit-learn's defaults apply.
skf = StratifiedKFold(
    n_splits=10,
)

### Logistic Regression

In [5]:
%%time
f1_scores = []  # to store the f1_scores for each fold

for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # Logistic Regression Params:
    sample_size = 200000
    chi2_features = 22500
    alpha = 0.0001
    loss = 'log_loss'
    penalty=  'elasticnet'
    n_iter_no_change = 3
    early_stopping = False
    learning_rate = 'constant'
    eta0 =  0.01
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train_fold_sample, y_train_fold_sample = oversample.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = SGDClassifier(loss=loss, 
                        penalty=penalty, 
                        alpha=alpha,
                        n_iter_no_change=n_iter_no_change,
                        early_stopping=early_stopping,
                        learning_rate=learning_rate,
                        eta0=eta0,
                        random_state=42)
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.8952058111380146
balanced_accuracy_score 0.8538224574430139
average_precision_score 0.5692808542175429
f1_score 0.727163840383282
              precision    recall  f1-score   support

           0       0.95      0.92      0.94     67995
           1       0.67      0.79      0.73     14605

    accuracy                           0.90     82600
   macro avg       0.81      0.85      0.83     82600
weighted avg       0.90      0.90      0.90     82600

accuracy_score 0.8950847457627119
balanced_accuracy_score 0.8545553637004191
average_precision_score 0.5695110813197592
f1_score 0.7274499937099006
              precision    recall  f1-score   support

           0       0.95      0.92      0.94     67995
           1       0.67      0.79      0.73     14605

    accuracy                           0.90     82600
   macro avg       0.81      0.85      0.83     82600
weighted avg       0.90      0.90      0.90     82600

accuracy_score 0.8946489104116223
balanced_accuracy

### Naive Bayes

In [6]:
%%time
f1_scores = []  # to store the f1_scores for each fold

for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # Logistic Regression Params:
    sample_size = 100000#150000
    chi2_features = 25000#27500
    alpha =  0.001
    fit_prior =  True
    class_prior =  None
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = naive_bayes.MultinomialNB(
        alpha=alpha,
        fit_prior=fit_prior, 
        class_prior=class_prior
    )
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.9158474576271186
balanced_accuracy_score 0.8446668790893987
average_precision_score 0.61788161387257
f1_score 0.7553067905797867
              precision    recall  f1-score   support

           0       0.94      0.95      0.95     67995
           1       0.78      0.73      0.76     14605

    accuracy                           0.92     82600
   macro avg       0.86      0.84      0.85     82600
weighted avg       0.91      0.92      0.91     82600

accuracy_score 0.916271186440678
balanced_accuracy_score 0.8449511323241818
average_precision_score 0.6193445836033826
f1_score 0.756255727074082
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     67995
           1       0.78      0.73      0.76     14605

    accuracy                           0.92     82600
   macro avg       0.86      0.84      0.85     82600
weighted avg       0.91      0.92      0.92     82600

accuracy_score 0.917360774818402
balanced_accuracy_sco

In [13]:
%%time
f1_scores = []  # to store the f1_scores for each fold

for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # Logistic Regression Params:
    sample_size = 200000#150000
    chi2_features = 25000#27500
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    sm = SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = naive_bayes.GaussianNB()
    clf.fit(X_train_fold_sample.toarray(), y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold.toarray())
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.5658595641646489
balanced_accuracy_score 0.7133205693402502
average_precision_score 0.27586340758692496
f1_score 0.43402777777777773
              precision    recall  f1-score   support

           0       0.97      0.49      0.65     67995
           1       0.28      0.94      0.43     14605

    accuracy                           0.57     82600
   macro avg       0.63      0.71      0.54     82600
weighted avg       0.85      0.57      0.61     82600

accuracy_score 0.5761380145278451
balanced_accuracy_score 0.7210959134956633
average_precision_score 0.28147696405894546
f1_score 0.44094211576846304
              precision    recall  f1-score   support

           0       0.98      0.50      0.66     67995
           1       0.29      0.95      0.44     14605

    accuracy                           0.58     82600
   macro avg       0.63      0.72      0.55     82600
weighted avg       0.86      0.58      0.62     82600

accuracy_score 0.5679782082324455
balanced_acc

### Linear SVM (squared hinge loss)

In [14]:
%%time
f1_scores = []  # to store the f1_scores for each fold

for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # SVM Params:
    sample_size =200000
    chi2_features = 20000
    alpha = 0.0001
    loss = 'squared_hinge'
    penalty=  'elasticnet'
    n_iter_no_change = 3
    early_stopping = False
    learning_rate = 'constant'
    eta0 =  0.01
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train_fold_sample, y_train_fold_sample = oversample.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = SGDClassifier(
                    loss=loss, 
                    penalty=penalty, 
                    alpha=alpha, 
                    n_iter_no_change=n_iter_no_change, 
                    early_stopping=early_stopping, 
                    learning_rate=learning_rate, 
                    eta0=eta0, 
                    random_state=42
                )
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.9027966101694915
balanced_accuracy_score 0.8790779821270363
average_precision_score 0.6026798383879122
f1_score 0.7539757928604258
              precision    recall  f1-score   support

           0       0.96      0.92      0.94     67995
           1       0.68      0.84      0.75     14605

    accuracy                           0.90     82600
   macro avg       0.82      0.88      0.85     82600
weighted avg       0.91      0.90      0.91     82600

accuracy_score 0.9066949152542373
balanced_accuracy_score 0.8781125135089705
average_precision_score 0.6110415246973934
f1_score 0.7596444721659129
              precision    recall  f1-score   support

           0       0.96      0.92      0.94     67995
           1       0.70      0.83      0.76     14605

    accuracy                           0.91     82600
   macro avg       0.83      0.88      0.85     82600
weighted avg       0.92      0.91      0.91     82600

accuracy_score 0.9065859564164649
balanced_accurac

### SVM Linear

In [8]:
%%time
f1_scores = []  # to store the f1_scores for each fold

for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # SVM Linear
    sample_size =100000
    chi2_features = 20000
    alpha = 0.0001
    loss = 'hinge'
    penalty=  'elasticnet'
    n_iter_no_change = 3
    early_stopping = False
    learning_rate = 'adaptive'
    eta0 =  0.01
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train_fold_sample, y_train_fold_sample = oversample.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = SGDClassifier(
                    loss=loss, 
                    penalty=penalty, 
                    alpha=alpha, 
                    n_iter_no_change=n_iter_no_change, 
                    early_stopping=early_stopping, 
                    learning_rate=learning_rate, 
                    eta0=eta0, 
                    random_state=42
                )
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.9050968523002422
balanced_accuracy_score 0.8584055395659492
average_precision_score 0.5950870126526203
f1_score 0.7455118007986234
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     67995
           1       0.71      0.79      0.75     14605

    accuracy                           0.91     82600
   macro avg       0.83      0.86      0.84     82600
weighted avg       0.91      0.91      0.91     82600

accuracy_score 0.9059927360774819
balanced_accuracy_score 0.8603475284232467
average_precision_score 0.5984769687383495
f1_score 0.7481594395615088
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     67995
           1       0.71      0.79      0.75     14605

    accuracy                           0.91     82600
   macro avg       0.83      0.86      0.85     82600
weighted avg       0.91      0.91      0.91     82600

accuracy_score 0.9045762711864407
balanced_accurac

### Decision Tree

In [9]:
%%time
f1_scores = []  # to store the f1_scores for each fold
{'sample_size': 300000, 'chi2_features': 30000, 'clf__n_estimators': 50, 'clf__criterion': 'entropy', 'clf__max_depth': 50, 
                                         'clf__min_samples_split': 100, 'clf__min_samples_leaf': 50, 'clf__class_weight': None, 'clf__bootstrap': False}
for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # Tree Params:
    sample_size = 300000
    chi2_features = 30000
    criterion='entropy'
    max_depth=60
    min_samples_split=50
    min_samples_leaf=50
    class_weight=None
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train_fold_sample, y_train_fold_sample = oversample.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = DecisionTreeClassifier(
                    criterion=criterion, 
                    max_depth=max_depth, 
                    min_samples_split=min_samples_split, 
                    min_samples_leaf=min_samples_leaf, 
                    class_weight=class_weight,
                    random_state=42
                )
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.8560532687651332
balanced_accuracy_score 0.794127787806054
average_precision_score 0.4561119240065308
f1_score 0.6317517343904856
              precision    recall  f1-score   support

           0       0.93      0.89      0.91     67995
           1       0.58      0.70      0.63     14605

    accuracy                           0.86     82600
   macro avg       0.75      0.79      0.77     82600
weighted avg       0.87      0.86      0.86     82600

accuracy_score 0.8576029055690073
balanced_accuracy_score 0.7895583527989137
average_precision_score 0.4547008366389653
f1_score 0.6295433070866142
              precision    recall  f1-score   support

           0       0.93      0.89      0.91     67995
           1       0.58      0.68      0.63     14605

    accuracy                           0.86     82600
   macro avg       0.76      0.79      0.77     82600
weighted avg       0.87      0.86      0.86     82600

accuracy_score 0.8557384987893463
balanced_accuracy

### Random Forest

In [10]:
%%time
f1_scores = []  # to store the f1_scores for each fold

for train_index, val_index in skf.split(cv_df['Consumer complaint narrative'], cv_df['debt_collection']):
    # RF Params
    sample_size = 300000
    chi2_features = 30000
    n_estimators = 50
    criterion ='entropy'
    max_depth = 50
    min_samples_split = 100
    min_samples_leaf = 50
    class_weight = None
    bootstrap = False
    
    
    # split the data
    X_train_fold, X_val_fold = cv_df['Consumer complaint narrative'][train_index], cv_df['Consumer complaint narrative'][val_index]
    y_train_fold, y_val_fold = cv_df['debt_collection'][train_index], cv_df['debt_collection'][val_index]
    
    # Further Sampling Since we don't need all that
    X_train_fold_sample = X_train_fold.sample(sample_size)
    y_train_fold_sample = y_train_fold.loc[X_train_fold_sample.index]  # Get the corresponding labels
    
    # Vectorize the training and validation sets
    X_train_fold_sample = loaded_vectorizer.transform(X_train_fold_sample)
    X_val_fold = loaded_vectorizer.transform(X_val_fold)
    
    # Feature selection
    selector = SelectKBest(chi2, k=chi2_features)
    X_train_fold_sample = selector.fit_transform(X_train_fold_sample, y_train_fold_sample)
    X_val_fold = selector.transform(X_val_fold)
    
    # Oversampling
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_train_fold_sample, y_train_fold_sample = oversample.fit_resample(X_train_fold_sample, y_train_fold_sample)
    
    # Shuffle your data
    X_train_fold_sample, y_train_fold_sample = shuffle(X_train_fold_sample, y_train_fold_sample)
    
    # Define the classifier:
    clf = RandomForestClassifier(
        n_estimators=n_estimators, 
        criterion=criterion, 
        max_depth=max_depth, 
        min_samples_split=min_samples_split, 
        min_samples_leaf=min_samples_leaf, 
        class_weight=class_weight,
        bootstrap=bootstrap,
        random_state=42,
        n_jobs = -1
    )
    clf.fit(X_train_fold_sample, y_train_fold_sample)
    
    # Predict on the validation fold
    y_val_pred = clf.predict(X_val_fold)
    
    # Calculate the f1_score and append to the list of f1_scores
    f1 = f1_score(y_val_fold, y_val_pred)
    f1_scores.append(f1)
    
    print("accuracy_score",accuracy_score(y_val_fold, y_val_pred))
    print("balanced_accuracy_score",balanced_accuracy_score(y_val_fold, y_val_pred))
    print("average_precision_score",average_precision_score(y_val_fold, y_val_pred))
    print("f1_score",f1)
    print(classification_report(y_val_fold, y_val_pred))
    
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)
print('Mean F1:', mean_f1, 'Std F1:', std_f1)

accuracy_score 0.884818401937046
balanced_accuracy_score 0.8250672267094574
average_precision_score 0.5279367807836642
f1_score 0.692242996700524
              precision    recall  f1-score   support

           0       0.94      0.92      0.93     67995
           1       0.66      0.73      0.69     14605

    accuracy                           0.88     82600
   macro avg       0.80      0.83      0.81     82600
weighted avg       0.89      0.88      0.89     82600

accuracy_score 0.879225181598063
balanced_accuracy_score 0.8256214793569185
average_precision_score 0.517571829554726
f1_score 0.685001578781181
              precision    recall  f1-score   support

           0       0.94      0.91      0.93     67995
           1       0.64      0.74      0.69     14605

    accuracy                           0.88     82600
   macro avg       0.79      0.83      0.81     82600
weighted avg       0.89      0.88      0.88     82600

accuracy_score 0.8872154963680388
balanced_accuracy_sco