In [1]:
import pandas as pd

# Locations of the base tables for the train and test splits.
train_base_csv = '/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_base.csv'
test_base_csv = '/kaggle/input/home-credit-credit-risk-model-stability/csv_files/test/test_base.csv'

# Read each base table into its own DataFrame.
train = pd.read_csv(train_base_csv)
test = pd.read_csv(test_base_csv)

In [2]:
# Print a summary of the training frame: columns, non-null counts, dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   case_id        1526659 non-null  int64 
 1   date_decision  1526659 non-null  object
 2   MONTH          1526659 non-null  int64 
 3   WEEK_NUM       1526659 non-null  int64 
 4   target         1526659 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 58.2+ MB


In [3]:
# Print a summary of the test frame: columns, non-null counts, dtypes and memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   case_id        10 non-null     int64 
 1   date_decision  10 non-null     object
 2   MONTH          10 non-null     int64 
 3   WEEK_NUM       10 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 448.0+ bytes


In [4]:
# Keep only the int64/float64 columns of each frame (as sub-DataFrames).
numeric_dtypes = ['int64', 'float64']
numerical_columns_train, numerical_columns_test = (
    frame.select_dtypes(include=numeric_dtypes) for frame in (train, test)
)

In [5]:
# Define a function to cap outliers based on lower/upper percentiles
# (1st and 99th by default).
def cap_outliers(df, columns, lower_q=0.01, upper_q=0.99):
    """Clip each listed column of ``df`` to its own quantile range.

    For every column, the ``lower_q`` and ``upper_q`` quantiles are computed
    from that column's current values, and values outside that range are
    replaced by the nearest bound.

    Note: ``df`` is modified in place (the clipped column is assigned back
    onto it). Pass ``df.copy()`` if the original must be preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are clipped.
    columns : iterable of str
        Names of the columns to clip; each must exist in ``df``.
    lower_q : float, default 0.01
        Quantile used as the lower clipping bound.
    upper_q : float, default 0.99
        Quantile used as the upper clipping bound.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after clipping.

    Raises
    ------
    ValueError
        If the quantiles do not satisfy ``0 <= lower_q <= upper_q <= 1``.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            f"quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q}, upper_q={upper_q}"
        )

    for column in columns:
        lower_bound = df[column].quantile(lower_q)
        upper_bound = df[column].quantile(upper_q)
        df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
    return df

# Cap outliers in the numerical feature columns of the train and test datasets.
#
# The column names are derived directly from `train` / `test` so this cell can
# be re-run safely: rebinding `numerical_columns_*` from a DataFrame to its
# `.columns` raises AttributeError on a second run, because the name then
# already refers to an Index.
#
# `case_id` is an identifier and `target` is the label, so neither is clipped.
non_feature_columns = ['case_id', 'target']

numerical_columns_train = (
    train.select_dtypes(include=['int64', 'float64'])
    .columns.drop(non_feature_columns, errors='ignore')
)
numerical_columns_test = (
    test.select_dtypes(include=['int64', 'float64'])
    .columns.drop(non_feature_columns, errors='ignore')
)

# Copies are passed because cap_outliers assigns the clipped columns back onto
# the frame it receives.
# NOTE(review): the test bounds are computed from the test frame itself, not
# from train — confirm this is intended.
train_capped = cap_outliers(train.copy(), numerical_columns_train)
test_capped = cap_outliers(test.copy(), numerical_columns_test)

In [6]:
# Perform one-hot encoding for categorical columns in train and test datasets.
train_encoded = pd.get_dummies(train_capped, columns=['date_decision'], drop_first=True)

# Encode the test frame against the date levels seen in train, so both frames
# end up with the same dummy columns. Encoding test on its own would create
# columns only for the dates present in test and drop a different "first"
# level. Test dates not seen in train become all-zero rows.
train_date_levels = sorted(train_capped['date_decision'].unique())
test_dates = pd.Categorical(test_capped['date_decision'], categories=train_date_levels)
test_encoded = pd.get_dummies(
    test_capped.assign(date_decision=test_dates),
    columns=['date_decision'],
    drop_first=True,
)

In [7]:
# Model inputs are the MONTH and WEEK_NUM columns; the label is `target`,
# which is taken from the train frame only.
feature_names = ['MONTH', 'WEEK_NUM']
X_train = train_encoded.loc[:, feature_names]
X_test = test_encoded.loc[:, feature_names]
y_train = train_encoded.loc[:, 'target']

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the training rows, keeping the class ratio of `y_train`
# in both parts; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=42,
    stratify=y_train,
)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

# Show the shape of every part as the cell output.
tuple(part.shape for part in split_parts)

((1144994, 2), (381665, 2), (1144994,), (381665,))

In [9]:
%%time

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import numpy as np

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the scaler
scaler = StandardScaler()

# Initialize the model
model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train)))

gini_scores = []

for train_index, val_index in kf.split(X_train):
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Scale the features
    X_train_fold_scaled = scaler.fit_transform(X_train_fold)
    X_val_fold_scaled = scaler.transform(X_val_fold)

    # Train the model
    model.fit(X_train_fold_scaled, y_train_fold)

    # Make predictions on the validation set
    y_val_pred_proba = model.predict_proba(X_val_fold_scaled)

    # Evaluate Gini coefficient (2 * AUC - 1)
    auc = roc_auc_score(y_val_fold, y_val_pred_proba[:, 1])  # Use probabilities for the positive class
    gini = 2 * auc - 1
    gini_scores.append(gini)

# Calculate the average Gini coefficient
average_gini = np.mean(gini_scores)
average_gini

CPU times: user 2min 37s, sys: 433 ms, total: 2min 38s
Wall time: 41.4 s


0.14290845304092686

In [10]:
import pickle

# Serialize the current `model` object to disk.
# Note: at this point in the notebook `model` is the fit from the last
# cross-validation fold; the fit on the full training set happens in a later cell.
model_pickle_path = 'home_credit_risk_xgb_model.pkl'
with open(model_pickle_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [11]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Fit the scaler on the entire training data first. The test data must be
# transformed with this same fitted scaler; transforming it before the refit
# would use the statistics left over from the last cross-validation fold.
X_train_scaled = scaler.fit_transform(X_train)

# Train the model on the entire training data
model.fit(X_train_scaled, y_train)

# Scale the test data with the scaler the final model was trained with
X_test_scaled = scaler.transform(X_test)

# Predict on the test data
y_test_pred = model.predict(X_test_scaled)

In [14]:
# Calculate the percentage of clients predicted to default on their loans.
#
# `y_val_pred_proba` holds the class probabilities from the last
# cross-validation fold, one column per class. Comparing the whole array to 1
# only counts probabilities that are exactly 1.0, so the result does not
# reflect predicted classes. Instead, threshold the positive-class column and
# take the share of rows above the threshold.
default_threshold = 0.5
predicted_default = y_val_pred_proba[:, 1] > default_threshold
default_probability = predicted_default.mean() * 100
default_probability

0.0

- The percentage of clients predicted to default on their loans is 0%.