In [None]:
import gc

gc.collect()

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os

import xgboost as xgb

import shap 


import warnings
warnings.filterwarnings('ignore')

shap.initjs()

# Resolve the project root: the directory one level above the notebook's
# current working directory.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))

# Make the project root importable. The membership check keeps re-running this
# cell from stacking duplicate entries on sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Input data directory. The trailing slash is kept on purpose: the data-loading
# cell builds file paths by plain string concatenation onto data_dir.
data_dir = parent_dir + "/data/"

# Full path to the output directory
Output = "results/"
output_dir = os.path.join(parent_dir, Output)

# Create the output directory if it doesn't exist. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Path to the 'src' directory
src_path = os.path.join(parent_dir, 'src')



In [None]:
# Put every immediate sub-directory of 'src' on the import path so that the
# modules inside them can be imported by bare name. Plain files are skipped.
src_entries = (os.path.join(src_path, entry) for entry in os.listdir(src_path))
sys.path.extend(candidate for candidate in src_entries if os.path.isdir(candidate))

In [None]:
# Load the cleaned dataset from the data directory.
data = pd.read_csv(data_dir + "cleaned/no_nan_ordinal_encoded.csv")

# Cast every column except the two monetary amounts to int.
# np.setdiff1d (set difference) replaces np.setxor1d (symmetric difference):
# with setxor1d, an amount column that is absent from the CSV would end up in
# int_cols and the indexing below would fail with a KeyError.
amount_cols = np.array(['AMT_CREDIT', 'AMT_INCOME_TOTAL'])
int_cols = np.setdiff1d(np.array(data.columns), amount_cols)
data[int_cols] = data[int_cols].astype(int)
data.head()

In [None]:
# Treat columns with more than 2 and at most 10 distinct values as categorical;
# two-valued columns are left as they are.
distinct_counts = data.nunique()
is_low_cardinality = (distinct_counts > 2) & (distinct_counts <= 10)
selected_columns = distinct_counts.index[is_low_cardinality].tolist()

# Convert the selected columns to pandas' 'category' dtype.
data[selected_columns] = data[selected_columns].astype('category')

In [None]:
# One-hot encode the categorical columns with the project helper, then map any
# True/False values in the result to 1/0.
from fn_encoding import one_hot_encode

data = one_hot_encode(data, selected_columns).replace({True: 1, False: 0})
data.head()

# Training

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except "TARGET" (the label) and "STATUS".
features = data.drop(columns=["TARGET", "STATUS"])
labels = data["TARGET"]

# 80 % of the rows go to training, the rest to testing; the seed is fixed so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    train_size=0.8,
    random_state=42,
)

# Drop the references to the full frames; only the split pieces are used below.
del data, features, labels
X_train.shape, X_test.shape, y_train.head()

In [None]:

# Directory for this notebook's model-run artifacts.
# The sub-folder was 'randomforest', but this notebook trains XGBoost models
# and saves its ROC figure under 'model_runs/xgboost', so that is the folder
# that has to exist.
run_dir = os.path.join(parent_dir, 'model_runs', 'xgboost')

# exist_ok=True makes the cell safe to re-run and avoids a separate
# exists() check.
os.makedirs(run_dir, exist_ok=True)

In [None]:
# Baseline XGBoost classifier, trained on the training split and scored with
# recall on the test split.
# recall_score is imported here because this is its first use in the notebook;
# without the import the cell raises NameError on a fresh kernel.
from sklearn.metrics import recall_score

# NOTE(review): learning_rate=0.9 together with max_depth=1000 is a very
# aggressive setting -- confirm it is intentional for this baseline.
xg_cl = xgb.XGBClassifier(learning_rate = 0.9, max_depth = 1000, n_estimators = 1000)

# Fit the classifier to the training set
xg_cl.fit(X_train, y_train)

# Predict the labels of the test set
predictions = xg_cl.predict(X_test)

# Recall of the predicted labels against the true test labels
recall = recall_score(y_test, predictions)

print("Recall:", recall)

# Hyperparameter Tuning

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats

# Distributions sampled by the randomized search. Only the learning rate is
# searched at the moment; the commented-out entries are alternatives that are
# currently switched off.
# (A separate `param_grid` dict of distributions used to be defined here but
# was never passed to the search, so it has been dropped as dead code.)
param_dist = {
    #'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.01, 0.9),  # uniform over [0.01, 0.91]
    #'subsample': stats.uniform(0.5, 0.5),
    #'n_estimators':stats.randint(50, 200)
}

# Create the XGBoost model object
xgb_model = xgb.XGBClassifier(n_estimators=1000)

# 10 sampled candidates, each scored by 5-fold cross-validated recall.
# random_state pins the sampled candidates so the search is reproducible
# across kernel restarts (same seed as the train/test split).
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='recall',
    random_state=42,
)

# Fit the RandomizedSearchCV object to the training data
random_search.fit(X_train, y_train)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the exhaustive search.
# The learning-rate axis is capped at 1.0: XGBoost documents the valid range
# of eta / learning_rate as [0, 1], and the previous upper bound of 10 put part
# of the grid outside that range.
# Cost: 20 n_estimators values x 20 learning rates x 5 folds = 2000 fits
# (plus the final refit), with up to 9600 trees each -- expect a long run.
param_grid = {
    #'max_depth': range(3, 10, 1),
    'n_estimators': range(100, 10000, 500),
    'learning_rate': np.logspace(np.log10(0.01), np.log10(1.0), num=20),
    #'subsample': [0.5, 0.7, 1]
    }

# Create the XGBoost model object
xgb_model = xgb.XGBClassifier()

# Create the GridSearchCV object (5-fold CV, candidates ranked by recall)
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='recall')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Performances

In [None]:
from sklearn.metrics import roc_curve, auc, f1_score, confusion_matrix, recall_score, precision_score

# Best hyperparameters found by the grid search.
params = grid_search.best_params_

# Build the final model from the tuned parameters.
# n_estimators=1000 is only a fallback: when the search grid includes
# 'n_estimators', best_params_ already carries it, and passing it again as an
# explicit keyword raises "TypeError: got multiple values for keyword
# argument 'n_estimators'". Merging into one dict lets the tuned value win.
model_kwargs = {"n_estimators": 1000, **params}
model = xgb.XGBClassifier(**model_kwargs)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Recall of the predicted labels against the true test labels
recall = recall_score(y_test, y_pred)

print("Recall:", recall)

In [None]:
from PerformanceMetrics import plot_roc

# Folder that receives the figures of this experiment. It is created here
# because nothing earlier in the notebook guarantees it exists, and savefig
# fails when the target directory is missing.
experiment_path = os.path.join(parent_dir, 'model_runs/xgboost')
os.makedirs(experiment_path, exist_ok=True)

# Open a 10x6 figure before calling the project's ROC helper.
# NOTE(review): presumably plot_roc draws on the current figure and returns an
# object exposing savefig() -- confirm against PerformanceMetrics.plot_roc.
fig = plt.figure(figsize=(10, 6))
f = plot_roc(model, X_test, y_test)

# Save the ROC curve as a high-resolution PNG, then display it.
f.savefig(
    os.path.join(experiment_path, 'roc_curve-gridsearch0.png'),
    dpi=600,
    format='png',
    bbox_inches='tight',
)
plt.show()
