In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
# from category_encoders import TargetEncoder
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, 
    roc_auc_score, roc_curve, auc
)
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE
from sklearn.preprocessing import RobustScaler,MinMaxScaler,StandardScaler
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization 
from tensorflow.keras.optimizers import Adam
import tensorflow_addons as tfa



In [None]:
# Load the pre-split train and test sets from ../data.
# Per the original note, these files still contain outliers.
train_data = pd.read_csv("../data/train.csv")
test_data = pd.read_csv("../data/test.csv")

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
train_data.tail()

In [None]:
len(test_data.columns), len(train_data.columns)

In [None]:
train_data.shape, test_data.shape

In [None]:
train_data.info()

In [None]:
# Name of the raw target column present in both frames.
target_column = 'loan_status'

# Binary target: 1 marks a charged-off loan, 0 a fully paid one.
class_mapping = {'Fully Paid': 0, 'Charged Off': 1}

# Add the numeric 'Label' column to both frames. Any status that is not a key
# of `class_mapping` ends up as NaN in 'Label'.
for frame in (train_data, test_data):
    frame['Label'] = frame[target_column].map(class_mapping)

In [None]:
# Shape of the training frame, then its class balance in percent.
print(train_data.shape)
train_data['Label'].value_counts(normalize=True).mul(100)


In [None]:
# Shape of the test frame, then its class balance in percent.
print(test_data.shape)
test_data['Label'].value_counts(normalize=True).mul(100)

Note: we leave 'term (months)' as a numeric column.

In [None]:
# 'Label' now carries the target, so the raw status column is dropped.
train_data = train_data.drop(columns=['loan_status'])
test_data = test_data.drop(columns=['loan_status'])

In [None]:
# Names of the object/category-typed columns in the training frame.
categorical_columns = list(
    train_data.select_dtypes(include=['object', 'category']).columns
)
categorical_columns

In [None]:
# Report the number of distinct values of every categorical column.
for column, unique_values in train_data[categorical_columns].nunique().items():
    print(f"Column '{column}' has {unique_values} unique values.")

TODO: `home_ownership`, `verification_status`, `purpose` and `application_type` each have roughly 10 or fewer categories, so we use one-hot encoding on them.
Target encoding is planned for `sub_grade` and `addr_state`; for now the cell below one-hot encodes these two columns as well.


In [None]:
# Categorical features that are one-hot encoded.
cat_columns_ohe = [
    'home_ownership',
    'verification_status',
    'application_type',
    'addr_state',
    'sub_grade',
    'purpose',
]

# Disabled alternative: one-hot encode three columns and target-encode the rest.
# cat_columns_ohe = ['home_ownership', 'verification_status', 'application_type']
# cat_target_encoding = ['addr_state', 'sub_grade', 'purpose']



In [None]:
train_data.head()

In [None]:
# encoder = TargetEncoder(cols=cat_target_encoding)

# encoder.fit(train_data[cat_target_encoding], train_data['Label'])


In [None]:
# X_train_encoded = encoder.transform(train_data[cat_target_encoding])
# X_test_encoded = encoder.transform(test_data[cat_target_encoding])

# # Replace original categorical columns with encoded columns in X_train and X_test
# X_train_encoded.columns = [f"{col}_encoded" for col in cat_target_encoding]
# X_test_encoded.columns = [f"{col}_encoded" for col in cat_target_encoding]

# # Drop original categorical columns from X_train and X_test
# train_data.drop(columns=cat_target_encoding, inplace=True)
# test_data.drop(columns=cat_target_encoding, inplace=True)

# # Concatenate encoded columns with remaining features
# train_data = pd.concat([train_data, X_train_encoded], axis=1)
# test_data = pd.concat([test_data, X_test_encoded], axis=1)

In [None]:
# Stack train on top of test so that both receive the same dummy columns.
combined_df = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# The prefix handed to get_dummies is the column name plus a trailing underscore.
custom_prefixes = {name: f"{name}_" for name in cat_columns_ohe}

encoded_combined_df = pd.get_dummies(
    combined_df,
    columns=cat_columns_ohe,
    prefix=custom_prefixes,
    drop_first=True,
).reset_index(drop=True)


In [None]:
# The first `n_train` rows of the stacked frame are the training rows; the
# remainder is the test set. Both get a fresh 0..n-1 index.
n_train = len(train_data)
train_data = encoded_combined_df.iloc[:n_train].reset_index(drop=True)
test_data = encoded_combined_df.iloc[n_train:].reset_index(drop=True)


In [None]:
train_data.head()

In [None]:
# Numeric feature columns, i.e. every numeric column except the target.
numerical_columns = [
    col
    for col in train_data.select_dtypes(include=['number']).columns
    if col != 'Label'
]
numerical_columns

In [None]:
# Separate the numeric features (to be scaled) from the remaining columns.
train_numerical_data = train_data[numerical_columns]
test_numerical_data = test_data[numerical_columns]

train_data = train_data.drop(columns=numerical_columns)
test_data = test_data.drop(columns=numerical_columns)

# Build one (train, test) pair per scaler. The loop variables keep the values
# of the last (MinMax) iteration once the loop has finished.
scaled_frames = {}
for scaler_name, scaler in (
    ("robust", RobustScaler()),
    ("standard", StandardScaler()),
    ("minmax", MinMaxScaler()),
):
    # Fit on the training rows only, then apply the same transform to test.
    scaled_train_data = scaler.fit_transform(train_numerical_data)
    scaled_test_data = scaler.transform(test_numerical_data)
    scaled_train_df = pd.DataFrame(scaled_train_data, columns=numerical_columns)
    scaled_test_df = pd.DataFrame(scaled_test_data, columns=numerical_columns)
    # Re-attach the unscaled columns next to the scaled numeric ones.
    scaled_frames[scaler_name] = (
        pd.concat([train_data, scaled_train_df], axis=1),
        pd.concat([test_data, scaled_test_df], axis=1),
    )

robust_train_data, robust_test_data = scaled_frames["robust"]
standard_train_data, standard_test_data = scaled_frames["standard"]
minmax_train_data, minmax_test_data = scaled_frames["minmax"]

In [None]:
robust_train_data.head()



In [None]:
# Drop training rows whose *raw* dti is negative.
# The mask has to come from the unscaled values in `train_numerical_data`:
# the 'dti' column inside the scaled frames has already been transformed, so
# comparing it with 0 would discard valid rows and would pick different rows
# for each scaler.
valid_dti = train_numerical_data['dti'] >= 0

# `reindex` aligns the mask with whatever rows the frame currently holds, so
# re-running this cell is harmless. `.copy()` gives independent frames that
# later cells can assign into.
robust_train_data = robust_train_data.loc[valid_dti.reindex(robust_train_data.index)].copy()
standard_train_data = standard_train_data.loc[valid_dti.reindex(standard_train_data.index)].copy()
minmax_train_data = minmax_train_data.loc[valid_dti.reindex(minmax_train_data.index)].copy()


In [None]:
# train_data = train_data.drop('issue_d', axis = 1)
# test_data = test_data.drop('issue_d', axis = 1)

In [None]:
# Boolean columns among the columns that were not scaled.
boolean_columns = train_data.select_dtypes(include=[bool]).columns

# Cast those columns to integers in every scaled train/test frame.
for frame in (
    robust_train_data, robust_test_data,
    standard_train_data, standard_test_data,
    minmax_train_data, minmax_test_data,
):
    frame[boolean_columns] = frame[boolean_columns].astype(int)





In [None]:
robust_train_data.head()



In [None]:
# Numeric features plus the target, used for the correlation heatmaps.
# A new list is built so that `numerical_columns` itself stays unchanged.
numerical_cols = [*numerical_columns, "Label"]
numerical_cols

In [None]:
# Correlation heatmap of the RobustScaler-scaled numeric features and the label.
fig, ax = plt.subplots(figsize=(20, 20))
robust_corr = robust_train_data[numerical_cols].corr()
sns.heatmap(robust_corr, cmap="RdYlGn", ax=ax)
plt.show()

In [None]:
# Correlation heatmap of the StandardScaler-scaled numeric features and the label.
fig, ax = plt.subplots(figsize=(20, 20))
standard_corr = standard_train_data[numerical_cols].corr()
sns.heatmap(standard_corr, cmap="RdYlGn", ax=ax)
plt.show()

In [None]:
# Correlation heatmap of the MinMaxScaler-scaled numeric features and the label.
fig, ax = plt.subplots(figsize=(20, 20))
minmax_corr = minmax_train_data[numerical_cols].corr()
sns.heatmap(minmax_corr, cmap="RdYlGn", ax=ax)
plt.show()

In [None]:
def _split_xy(frame, target="Label"):
    """Return `(features, target)` for `frame` without modifying `frame`."""
    return frame.drop(columns=target), frame[target]


# One feature matrix / label vector per scaler and per split.
X_train_robust, y_train_robust = _split_xy(robust_train_data)
X_test_robust, y_test_robust = _split_xy(robust_test_data)

X_train_standard, y_train_standard = _split_xy(standard_train_data)
X_test_standard, y_test_standard = _split_xy(standard_test_data)

X_train_minmax, y_train_minmax = _split_xy(minmax_train_data)
X_test_minmax, y_test_minmax = _split_xy(minmax_test_data)

In [None]:
X_train_robust.shape

In [None]:
# from sklearn.preprocessing import PowerTransformer

# # Select only the specified numerical columns
# # selected_numerical_columns = [col for col in numerical_columns if col not in ['Label', 'mort_acc', 'pub_rec']]
# selected_numerical_columns = [col for col in numerical_columns if col not in ['Label']]

# # Apply Box-Cox Transform after handling zero values
# train_data_transformed = X_train[selected_numerical_columns].apply(lambda x: x + 0.000001 if (x <= 0).any() else x)
# pt = PowerTransformer(method='box-cox')
# train_data_transformed[selected_numerical_columns] = pt.fit_transform(train_data_transformed)

# # Replace original numerical columns in X_train with their normalized versions
# X_train[selected_numerical_columns] = train_data_transformed[selected_numerical_columns]


In [None]:
# pd.DataFrame({'cols':selected_numerical_columns,'box_cox_lambdas':pt.lambdas_})

In [None]:
# X_train.columns[X_train.isna().any()].tolist()

In [None]:
# X_test.shape,y_test.shape, X_train.shape, y_train.shape

In [None]:
def print_score(true, pred, train=True):
    """Print accuracy, classification report and confusion matrix.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels.
    train : bool, default True
        A truthy value prints under a "Train Result" header, a value equal to
        ``False`` under a "Test Result" header. Any other value (e.g. ``None``)
        prints nothing.
    """
    if train:
        header = "Train Result:\n================================================"
    elif train == False:
        header = "Test Result:\n================================================"
    else:
        return

    divider = "_______________________________________________"
    clf_report = pd.DataFrame(classification_report(true, pred, output_dict=True))

    print(header)
    print(f"Accuracy Score: {accuracy_score(true, pred) * 100:.2f}%")
    print(divider)
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print(divider)
    print(f"Confusion Matrix: \n {confusion_matrix(true, pred)}\n")

The SGDClassifier estimator in scikit-learn implements linear classifiers (SVM, logistic regression, and others) with stochastic gradient descent (SGD) training. A particular linear classifier is chosen through the `loss` hyperparameter. Because we want to predict the probability of charge-off, we choose logistic regression (a probabilistic classifier) by setting `loss='log_loss'` (this option was called `'log'` in older scikit-learn releases).

In [None]:
# Untuned baseline: logistic regression trained with SGD, default settings.
# A fixed seed makes the baseline scores reproducible after a kernel restart.
naive_model = SGDClassifier(loss='log_loss', random_state=1)


In [None]:
# Fit the baseline on the RobustScaler-scaled training set.
naive_model.fit(X_train_robust, y_train_robust)

In [None]:
y_train_pred_robust = naive_model.predict(X_train_robust)
y_pred_robust = naive_model.predict(X_test_robust)

In [None]:
print_score(y_train_robust, y_train_pred_robust, train=True)
print_score(y_test_robust, y_pred_robust, train=False)


In [None]:
# Fit the same `naive_model` object again, now on the StandardScaler-scaled
# training set; from here on it reflects this fit, not the previous one.
naive_model.fit(X_train_standard, y_train_standard)

In [None]:
y_train_pred_standard = naive_model.predict(X_train_standard)
y_pred_standard = naive_model.predict(X_test_standard)

In [None]:
print_score(y_train_standard, y_train_pred_standard, train=True)
print_score(y_test_standard, y_pred_standard, train=False)


In [None]:
# Fit the same `naive_model` object once more, on the MinMaxScaler-scaled
# training set; from here on it reflects this fit, not the previous ones.
naive_model.fit(X_train_minmax, y_train_minmax)

In [None]:
y_train_pred_minmax = naive_model.predict(X_train_minmax)
y_pred_minmax = naive_model.predict(X_test_minmax)

In [None]:
print_score(y_train_minmax, y_train_pred_minmax, train=True)
print_score(y_test_minmax, y_pred_minmax, train=False)


In [None]:
from sklearn.base import clone
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))

# A precision-recall curve needs a continuous score per row. With the hard
# 0/1 labels returned by `predict` the curve has only one real operating point.
# `naive_model` only holds its most recent fit, so a fresh clone (same
# hyper-parameters) is fitted per scaling to obtain decision scores.
# NOTE(review): unless the baseline is seeded, these refits are not the exact
# models scored in the cells above.
for X_fit, y_fit, X_eval, y_true, label in [
    (X_train_robust, y_train_robust, X_test_robust, y_test_robust, 'Robust'),
    (X_train_standard, y_train_standard, X_test_standard, y_test_standard, 'Standard'),
    (X_train_minmax, y_train_minmax, X_test_minmax, y_test_minmax, 'MinMax'),
]:
    # Signed distance to the decision boundary; higher means "more likely class 1".
    scores = clone(naive_model).fit(X_fit, y_fit).decision_function(X_eval)

    # Compute precision-recall curve
    precision, recall, _ = precision_recall_curve(y_true, scores)

    # Plot precision-recall curve
    plt.plot(recall, precision, label=label)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Basic model Precision-Recall Curve')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Base estimator for the grid searches: SGD-trained logistic regression with
# class weighting.
model = SGDClassifier(
    loss='log_loss',
    class_weight='balanced',
    max_iter=1000,
    tol=1e-3,
    warm_start=True,
    random_state=1,
)

# Five stratified folds, taken without shuffling.
kf = StratifiedKFold(n_splits=5, shuffle=False)


In [None]:
# Search space: three regularisation strengths and two penalty types.
param_grid_sgdlogreg = dict(
    alpha=[10**-5, 10**-2, 10**1],
    penalty=['l1', 'l2'],
)

In [None]:
# Settings shared by the three searches; each search object is later fitted on
# a differently scaled copy of the training data.
plain_search_kwargs = dict(
    estimator=model,
    param_grid=param_grid_sgdlogreg,
    scoring='f1_weighted',  # weighted F1 is the selection metric
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    cv=kf,
    verbose=1,
    return_train_score=False,
)

grid_sgdlogreg_robust = GridSearchCV(**plain_search_kwargs)
grid_sgdlogreg_standard = GridSearchCV(**plain_search_kwargs)
grid_sgdlogreg_minmax = GridSearchCV(**plain_search_kwargs)

In [None]:
grid_sgdlogreg_robust.fit(X_train_robust, y_train_robust)

In [None]:
grid_sgdlogreg_robust.best_score_

In [None]:
grid_sgdlogreg_robust.best_params_

In [None]:
grid_sgdlogreg_standard.fit(X_train_standard, y_train_standard)

In [None]:
grid_sgdlogreg_standard.best_score_

In [None]:
grid_sgdlogreg_standard.best_params_

In [None]:
grid_sgdlogreg_minmax.fit(X_train_minmax, y_train_minmax)

In [None]:
grid_sgdlogreg_minmax.best_score_

In [None]:
grid_sgdlogreg_minmax.best_params_

In [None]:
y_train_pred_robust = grid_sgdlogreg_robust.predict(X_train_robust)
y_pred_robust = grid_sgdlogreg_robust.predict(X_test_robust)

In [None]:
print_score(y_train_robust, y_train_pred_robust, train=True)
print_score(y_test_robust, y_pred_robust, train=False)


In [None]:
y_train_pred_standard = grid_sgdlogreg_standard.predict(X_train_standard)
y_pred_standard = grid_sgdlogreg_standard.predict(X_test_standard)

In [None]:
print_score(y_train_standard, y_train_pred_standard, train=True)
print_score(y_test_standard, y_pred_standard, train=False)

In [None]:
y_train_pred_minmax = grid_sgdlogreg_minmax.predict(X_train_minmax)
y_pred_minmax = grid_sgdlogreg_minmax.predict(X_test_minmax)

In [None]:
print_score(y_train_minmax, y_train_pred_minmax, train=True)
print_score(y_test_minmax, y_pred_minmax, train=False)

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))

# A precision-recall curve needs a continuous score per row. With the hard
# 0/1 labels returned by `predict` the curve has only one real operating point.
# Each fitted search therefore supplies decision scores for its own test set.
for search, X_eval, y_true, label in [
    (grid_sgdlogreg_robust, X_test_robust, y_test_robust, 'Robust'),
    (grid_sgdlogreg_standard, X_test_standard, y_test_standard, 'Standard'),
    (grid_sgdlogreg_minmax, X_test_minmax, y_test_minmax, 'MinMax'),
]:
    # Scores come from the estimator the search refitted with its best parameters.
    scores = search.decision_function(X_eval)

    # Compute precision-recall curve
    precision, recall, _ = precision_recall_curve(y_true, scores)

    # Plot precision-recall curve
    plt.plot(recall, precision, label=label)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True)
plt.show()


## With undersampling

In [None]:
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import RandomUnderSampler

In [None]:
rus = RandomUnderSampler(random_state=42)

In [None]:
# Resample each scaled training set with the shared `rus` sampler.
# NOTE(review): in the visible part of the notebook the X_under_*/y_under_*
# results are only used for the class-count printout below; the grid searches
# that follow are fitted on X_train_* with a sampler inside their pipeline.
# Confirm whether all three are still needed.
X_under_robust, y_under_robust = rus.fit_resample(X_train_robust, y_train_robust)
X_under_minmax, y_under_minmax = rus.fit_resample(X_train_minmax, y_train_minmax)
X_under_standard, y_under_standard = rus.fit_resample(X_train_standard, y_train_standard)

In [None]:
# Class counts and percentages after undersampling (robust-scaled copy).
under_counts = y_under_robust.value_counts()
under_total = len(y_under_robust)
for class_name, class_label in (('Fully Paid:', 0), ('Charged Off:', 1)):
    class_count = under_counts[class_label]
    print(class_name, class_count, '/', round(class_count / under_total * 100, 2), '% of the dataset')




In [None]:
# Search space for the pipeline; the `sgdclassifier__` prefix routes each
# parameter to the SGDClassifier step.
param_grid_sgdlogreg = dict(
    sgdclassifier__alpha=[0.00001, 0.001, 10],
    sgdclassifier__penalty=['l1', 'l2'],
)

In [None]:
# Random undersampling followed by the SGD logistic-regression classifier.
pipeline = make_pipeline(
    RandomUnderSampler(random_state=42),
    SGDClassifier(
        loss='log_loss',
        class_weight='balanced',
        max_iter=1000,
        tol=1e-3,
        warm_start=True,
        random_state=1,
    ),
)



In [None]:
# Settings shared by the three pipeline searches; each search object is later
# fitted on a differently scaled copy of the training data.
pipeline_search_kwargs = dict(
    estimator=pipeline,
    param_grid=param_grid_sgdlogreg,
    scoring='f1',  # F1 is the selection metric
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    cv=kf,
    verbose=1,
    return_train_score=True,
)

grid_sgdlogreg_robust = GridSearchCV(**pipeline_search_kwargs)
grid_sgdlogreg_standard = GridSearchCV(**pipeline_search_kwargs)
grid_sgdlogreg_minmax = GridSearchCV(**pipeline_search_kwargs)

In [None]:
grid_sgdlogreg_robust.fit(X_train_robust, y_train_robust)

In [None]:
grid_sgdlogreg_robust.best_score_

In [None]:
grid_sgdlogreg_robust.best_params_

In [None]:
y_train_pred_robust = grid_sgdlogreg_robust.predict(X_train_robust)
y_pred_robust = grid_sgdlogreg_robust.predict(X_test_robust)

In [None]:
print_score(y_train_robust, y_train_pred_robust, train=True)
print_score(y_test_robust, y_pred_robust, train=False)


In [None]:
grid_sgdlogreg_standard.fit(X_train_standard, y_train_standard)

In [None]:
grid_sgdlogreg_standard.best_score_

In [None]:
grid_sgdlogreg_standard.best_params_

In [None]:
y_train_pred_standard = grid_sgdlogreg_standard.predict(X_train_standard)
y_pred_standard = grid_sgdlogreg_standard.predict(X_test_standard)

In [None]:
print_score(y_train_standard, y_train_pred_standard, train=True)
print_score(y_test_standard, y_pred_standard, train=False)

In [None]:
grid_sgdlogreg_minmax.fit(X_train_minmax, y_train_minmax)

In [None]:
grid_sgdlogreg_minmax.best_score_

In [None]:
grid_sgdlogreg_minmax.best_params_

In [None]:
y_train_pred_minmax = grid_sgdlogreg_minmax.predict(X_train_minmax)
y_pred_minmax = grid_sgdlogreg_minmax.predict(X_test_minmax)

In [None]:
print_score(y_train_minmax, y_train_pred_minmax, train=True)
print_score(y_test_minmax, y_pred_minmax, train=False)

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))

# A precision-recall curve needs a continuous score per row. With the hard
# 0/1 labels returned by `predict` the curve has only one real operating point.
# Each fitted search therefore supplies decision scores for its own test set.
for search, X_eval, y_true, label in [
    (grid_sgdlogreg_robust, X_test_robust, y_test_robust, 'Robust'),
    (grid_sgdlogreg_standard, X_test_standard, y_test_standard, 'Standard'),
    (grid_sgdlogreg_minmax, X_test_minmax, y_test_minmax, 'MinMax'),
]:
    # Scores come from the pipeline the search refitted with its best parameters.
    scores = search.decision_function(X_eval)

    # Compute precision-recall curve
    precision, recall, _ = precision_recall_curve(y_true, scores)

    # Plot precision-recall curve
    plt.plot(recall, precision, label=label)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
def evaluate_nn(true, pred, train=True):
    """Print accuracy, classification report and confusion matrix for a model.

    The report is exactly the one produced by `print_score`, so this function
    delegates to it instead of keeping a second copy of the formatting code.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted class labels.
    train : bool, default True
        Selects the "Train Result" (truthy) or "Test Result" (``False``) header.
    """
    print_score(true, pred, train=train)
        
def plot_learning_evolution(r):
    """Plot training and validation loss and F1 score per epoch.

    Parameters
    ----------
    r : object with a ``history`` mapping
        ``r.history`` must provide the keys ``'loss'``, ``'val_loss'``,
        ``'f1_score'`` and ``'val_f1_score'``, each a per-epoch sequence
        (presumably the object returned by ``model.fit`` — confirm with caller).
    """
    plt.figure(figsize=(12, 8))

    plt.subplot(2, 2, 1)
    plt.plot(r.history['loss'], label='Loss')
    plt.plot(r.history['val_loss'], label='val_Loss')
    plt.title('Loss evolution during trainig')
    plt.legend()

    plt.subplot(2, 2, 2)
    plt.plot(r.history['f1_score'], label='f1_score')
    # The validation curve needs its own legend entry; both lines used to be
    # labelled 'f1_score', which made them indistinguishable.
    plt.plot(r.history['val_f1_score'], label='val_f1_score')
    plt.title('f1 score evolution during trainig')
    plt.legend();

def nn_model(num_columns, num_labels, hidden_units, dropout_rates, learning_rate):
    """Build and compile a fully connected sigmoid-output classifier.

    Architecture: input -> BatchNorm -> Dropout, then for every entry of
    `hidden_units` a Dense(relu) -> BatchNorm -> Dropout block, and finally a
    Dense layer with `num_labels` sigmoid units.

    Parameters
    ----------
    num_columns : int
        Number of input features.
    num_labels : int
        Number of output units.
    hidden_units : sequence of int
        Width of each hidden Dense layer.
    dropout_rates : sequence of float
        ``dropout_rates[0]`` is applied to the input, ``dropout_rates[i + 1]``
        after hidden layer ``i``; at least ``len(hidden_units) + 1`` entries.
    learning_rate : float
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    A compiled Keras ``Model`` with binary cross-entropy loss and an F1 metric
    named ``'f1_score'``.

    Raises
    ------
    ValueError
        If `dropout_rates` has fewer than ``len(hidden_units) + 1`` entries.
    """
    if len(dropout_rates) < len(hidden_units) + 1:
        raise ValueError(
            "dropout_rates needs one rate for the input plus one per hidden layer"
        )

    inp = tf.keras.layers.Input(shape=(num_columns, ))
    x = BatchNormalization()(inp)
    x = Dropout(dropout_rates[0])(x)
    for units, rate in zip(hidden_units, dropout_rates[1:]):
        x = Dense(units, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(rate)(x)
    x = Dense(num_labels, activation='sigmoid')(x)

    model = Model(inputs=inp, outputs=x)
    # threshold=0.5 turns the sigmoid output into a 0/1 decision. Without a
    # threshold F1Score takes the argmax over the outputs, which is
    # meaningless when there is a single output unit.
    f1_metric = tfa.metrics.F1Score(name='f1_score', num_classes=num_labels, threshold=0.5)
    model.compile(
        optimizer=Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=[f1_metric],
    )
    return model

In [None]:
# Network hyper-parameters.
num_columns = len(X_train_robust.columns)
num_labels = 1
hidden_units = [150] * 3
dropout_rates = [0.1, 0, 0.1, 0]  # input dropout, then one rate per hidden layer
learning_rate = 1e-3

model = nn_model(num_columns, num_labels, hidden_units, dropout_rates, learning_rate)

# Train on the robust-scaled data; the test split serves as validation data.
r = model.fit(
    x=X_train_robust,
    y=y_train_robust,
    validation_data=(X_test_robust, y_test_robust),
    batch_size=32,
    epochs=20,
)