In [1]:
import pandas as pd

# Paths of the competition's train and test base tables.
TRAIN_BASE_CSV = '/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_base.csv'
TEST_BASE_CSV = '/kaggle/input/home-credit-credit-risk-model-stability/csv_files/test/test_base.csv'

train = pd.read_csv(TRAIN_BASE_CSV)
test = pd.read_csv(TEST_BASE_CSV)

In [2]:
# Summarize the training base table: column names, non-null counts, dtypes and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   case_id        1526659 non-null  int64 
 1   date_decision  1526659 non-null  object
 2   MONTH          1526659 non-null  int64 
 3   WEEK_NUM       1526659 non-null  int64 
 4   target         1526659 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 58.2+ MB


In [3]:
# Summarize the test base table: column names, non-null counts, dtypes and memory use.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   case_id        10 non-null     int64 
 1   date_decision  10 non-null     object
 2   MONTH          10 non-null     int64 
 3   WEEK_NUM       10 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 448.0+ bytes


In [4]:
# Sub-frames holding only the 64-bit integer and 64-bit float columns of each table.
NUMERIC_DTYPES = ['int64', 'float64']
numerical_columns_train = train.select_dtypes(include=NUMERIC_DTYPES)
numerical_columns_test = test.select_dtypes(include=NUMERIC_DTYPES)

In [5]:
# Define a function to cap outliers based on quantile bounds (1st/99th percentiles by default)
def cap_outliers(df, columns, lower_q=0.01, upper_q=0.99, reference=None):
    """Clip the given columns of ``df`` to quantile-based bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are clipped. It is modified in place, so pass a
        copy if the caller needs to keep the original values.
    columns : iterable of str
        Names of the columns to clip. An empty iterable leaves ``df`` untouched.
    lower_q, upper_q : float, default 0.01 and 0.99
        Quantiles (between 0 and 1) used as the lower and upper clipping bounds.
    reference : pandas.DataFrame, optional
        Frame the bounds are computed from. Defaults to ``df`` itself. Pass the
        training frame here to cap a test frame with training-derived bounds.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the requested columns clipped.

    Raises
    ------
    ValueError
        If the quantiles do not satisfy ``0 <= lower_q <= upper_q <= 1``.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            f"quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q}, upper_q={upper_q}"
        )

    # Bounds come from the reference frame when one is given, else from df itself.
    bounds_source = df if reference is None else reference

    for column in columns:
        lower_bound = bounds_source[column].quantile(lower_q)
        upper_bound = bounds_source[column].quantile(upper_q)
        df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
    return df

# Cap outliers in numerical columns for train and test datasets.
# `case_id` is an identifier and `target` is the label, so neither is clipped.
# The column lists get their own names rather than rebinding
# `numerical_columns_train` / `numerical_columns_test`; rebinding those from a
# DataFrame to an Index made this cell fail when re-run (an Index has no `.columns`).
NON_FEATURE_COLUMNS = ('case_id', 'target')
columns_to_cap_train = [
    column for column in numerical_columns_train.columns
    if column not in NON_FEATURE_COLUMNS
]
columns_to_cap_test = [
    column for column in numerical_columns_test.columns
    if column not in NON_FEATURE_COLUMNS
]

# NOTE(review): the test bounds are computed from the test rows themselves,
# independently of train; confirm that is intended.
train_capped = cap_outliers(train.copy(), columns_to_cap_train)
test_capped = cap_outliers(test.copy(), columns_to_cap_test)

In [6]:
# Perform one-hot encoding for categorical columns in train and test datasets.
# Both frames are encoded against the date levels seen in train, so they get
# the same dummy columns and drop the same baseline level. Encoding each frame
# on its own gives mismatched columns and a different dropped level per frame.
# Test dates that never occur in train become all-zero rows.
date_levels = sorted(train_capped['date_decision'].unique())

train_encoded = pd.get_dummies(
    train_capped.assign(
        date_decision=pd.Categorical(train_capped['date_decision'], categories=date_levels)
    ),
    columns=['date_decision'],
    drop_first=True,
)
test_encoded = pd.get_dummies(
    test_capped.assign(
        date_decision=pd.Categorical(test_capped['date_decision'], categories=date_levels)
    ),
    columns=['date_decision'],
    drop_first=True,
)

In [7]:
# Model inputs are the two calendar columns; the label is the `target` column.
FEATURE_COLUMNS = ['MONTH', 'WEEK_NUM']

X_train = train_encoded[FEATURE_COLUMNS]
X_test = test_encoded[FEATURE_COLUMNS]
y_train = train_encoded['target']

In [8]:
from sklearn.model_selection import train_test_split

# Stratified 75/25 hold-out split of the training features and labels.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=42,
)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

# Show the shape of each of the four pieces.
tuple(part.shape for part in split_parts)

((1144994, 2), (381665, 2), (1144994,), (381665,))

In [9]:
%%time

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import set_random_seed
import numpy as np


def build_model(input_dim):
    """Build and compile the binary-classification network.

    Architecture: Dense(64, relu) -> Dropout(0.3) -> Dense(32, relu) ->
    Dense(1, sigmoid), compiled with Adam and binary cross-entropy.

    Parameters
    ----------
    input_dim : int
        Number of input features.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The compiled, untrained model.
    """
    model = Sequential()
    # An explicit Input layer replaces `input_dim=` on the first Dense layer,
    # which Keras warns about.
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across runs.
set_random_seed(42)

# Initialize the scaler (re-fitted on the training part of every fold)
scaler = StandardScaler()

# Define K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions and their true labels, appended in fold order
all_predictions = []
all_true_labels = []

# Perform K-Fold cross-validation
for train_index, test_index in kf.split(X_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Standardize the features: fit on the training part only, then apply to the held-out part
    X_train_scaled = scaler.fit_transform(X_train_fold)
    X_test_scaled = scaler.transform(X_test_fold)

    # Build a fresh model for this fold
    model = build_model(input_dim=X_train_scaled.shape[1])

    # Early stopping callback
    # NOTE(review): the held-out fold is both the early-stopping validation set
    # and the data the fold is scored on, so the out-of-fold scores may be
    # slightly optimistic.
    early_stop = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

    # Train the model
    model.fit(
        X_train_scaled,
        y_train_fold,
        epochs=5,
        batch_size=32,
        validation_data=(X_test_scaled, y_test_fold),
        callbacks=[early_stop],
        verbose=1,
    )

    # Predict probabilities on the held-out fold
    y_pred_proba = model.predict(X_test_scaled).flatten()

    # Store the predictions and true labels
    all_predictions.extend(y_pred_proba)
    all_true_labels.extend(y_test_fold.values)

# Convert to numpy arrays
y_pred_proba_all = np.array(all_predictions)
y_true_all = np.array(all_true_labels)

# Mean absolute error between out-of-fold probabilities and true labels.
# The variable keeps its `stability_metric` name for later cells, but this is a
# plain MAE, not a stability measure over time.
stability_metric = np.mean(np.abs(y_pred_proba_all - y_true_all))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 2ms/step - accuracy: 0.9676 - loss: 0.1449 - val_accuracy: 0.9687 - val_loss: 0.1387
Epoch 2/5
[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 2ms/step - accuracy: 0.9688 - loss: 0.1386 - val_accuracy: 0.9687 - val_loss: 0.1387
[1m9542/9542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step
Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2ms/step - accuracy: 0.9685 - loss: 0.1439 - val_accuracy: 0.9686 - val_loss: 0.1392
Epoch 2/5
[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 2ms/step - accuracy: 0.9685 - loss: 0.1395 - val_accuracy: 0.9686 - val_loss: 0.1391
Epoch 3/5
[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 2ms/step - accuracy: 0.9684 - loss: 0.1398 - val_accuracy: 0.9686 - val_loss: 0.1396
[1m9542/9542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step
Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 2ms/step - accuracy: 0.9677 - loss: 0.1435 - val_accuracy: 0.9682 - val_loss: 0.1409
Epoch 2/5
[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 2ms/step - accuracy: 0.9687 - loss: 0.1388 - val_accuracy: 0.9682 - val_loss: 0.1407
Epoch 3/5
[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 2ms/step - accuracy: 0.9688 - loss: 0.1386 - val_accuracy: 0.9682 - val_loss: 0.1404
Epoch 4/5
[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 2ms/step - accuracy: 0.9688 - loss: 0.1385 - val_accuracy: 0.9682 - val_loss: 0.1405
[1m9542/9542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step
Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 2ms/step - accuracy: 0.9684 - loss: 0.1437 - val_accuracy: 0.9687 - val_loss: 0.1387
Epoch 2/5
[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 2ms/step - accuracy: 0.9688 - loss: 0.1384 - val_accuracy: 0.9687 - val_loss: 0.1387
Epoch 3/5
[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 2ms/step - accuracy: 0.9686 - loss: 0.1393 - val_accuracy: 0.9687 - val_loss: 0.1389
[1m9542/9542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step
Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 2ms/step - accuracy: 0.9684 - loss: 0.1436 - val_accuracy: 0.9687 - val_loss: 0.1388
Epoch 2/5
[1m38167/38167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2ms/step - accuracy: 0.9685 - loss: 0.1398 - val_accuracy: 0.9687 - val_loss: 0.1388
[1m9542/9542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step
CPU times: user 20min 47s, sys: 3min 19s, total: 24min 6s
Wall time: 15min 35s


In [14]:
# Display the mean absolute difference between out-of-fold probabilities and true labels.
stability_metric

0.0604986230444254

In [25]:
# Predict test data
# `model` is the network from the last cross-validation fold and was trained on
# standardized features, so the test features must go through the same fitted
# `scaler`. Raw MONTH / WEEK_NUM values would be on the wrong scale.
y_pred = model.predict(scaler.transform(X_test))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


In [11]:
import pickle

# Save the trained model into a pickle file
with open('home_credit_risk.pkl', 'wb') as file:
    pickle.dump(model, file)

# The model expects standardized inputs, so also save the fitted scaler.
# Without it the pickled model cannot reproduce its predictions on new data.
with open('home_credit_risk_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [27]:
from sklearn.metrics import confusion_matrix, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

# Hard 0/1 labels: 1 where the out-of-fold probability is at least 0.5, else 0.
is_positive = y_pred_proba_all >= 0.5
y_pred_binary = is_positive.astype(int)

def calculate_gini(y_true, y_pred_proba):
    """Return the Gini score, defined as ``2 * AUC - 1``.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_pred_proba : array-like
        Predicted scores or probabilities for the positive class.

    Returns
    -------
    float
        ``2 * roc_auc_score(y_true, y_pred_proba) - 1``.
    """
    return 2 * roc_auc_score(y_true, y_pred_proba) - 1

# Gini over the pooled out-of-fold predictions from all cross-validation folds.
gini_score = calculate_gini(y_true=y_true_all, y_pred_proba=y_pred_proba_all)
print(f"Gini Score: {gini_score:.4f}")

Gini Score: 0.1151
