In [1]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import gc
# Run a full garbage-collection pass before any data is loaded. Because this is
# the last expression in the cell, its return value (the number of unreachable
# objects found) is what the notebook shows as the cell output.
gc.collect()

0

In [2]:
# Load the test and train tables (in that order); the first CSV column of each
# file is used as the DataFrame index.
test_data, train_data = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('test_data_EDA.csv', 'train_data_EDA.csv')
)

In [3]:
# Random Forest with group-aware 5-fold cross-validation.
#
# Produces:
#   oof_preds  - out-of-fold fraud probabilities for every training row
#   roc_auc    - ROC-AUC of oof_preds against the true labels
#   test_preds - test-set fraud probabilities, averaged over the fold models
#   test_data['predictions_RF'] - the same test predictions stored on the frame

# Initialize GroupKFold with 5 splits. Rows are grouped by client_id (see the
# split call below), so all rows of one client fall into the same fold.
skf = GroupKFold(n_splits=5)
n_folds = skf.get_n_splits()

# List of features to be used in training (make sure all categorical features are encoded)
# NOTE(review): 'client_id' is not in this exclusion list, so the grouping key
# is also fed to the model as a feature - confirm that this is intended.
features = [col for col in train_data.columns if col not in ['gender','address','description','expires','acct_open_date','card_brand','card_type','errors','zip','merchant_state','merchant_city','use_chip','date','isFraud']]

# Target variable
target = 'isFraud'

# Initialize arrays for storing predictions
oof_preds = np.zeros(len(train_data))   # Out-of-fold predictions for validation
test_preds = np.zeros(len(test_data))   # Running mean of the fold models' test predictions

# Select the test feature matrix once; it is the same for every fold.
X_test = test_data[features]

# Group KFold loop for training and validation
for train_idx, val_idx in skf.split(train_data, train_data[target], groups=train_data['client_id']):
    # Split into training and validation sets
    X_train, X_val = train_data.iloc[train_idx][features], train_data.iloc[val_idx][features]
    y_train, y_val = train_data.iloc[train_idx][target], train_data.iloc[val_idx][target]

    # Initialize your model (e.g., Random Forest, XGBoost, or others)
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict on validation data (column 1 = probability of the positive class)
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]

    # Predict the test set with this fold's model and add its share to the
    # average. Using only the model left over from the last loop iteration
    # would make the test predictions depend on a single, arbitrarily chosen
    # fold that was trained on just part of the training data.
    test_preds += model.predict_proba(X_test)[:, 1] / n_folds

    # Free the per-fold frames before the next iteration allocates new ones
    del X_train, X_val, y_train, y_val

del X_test

# Evaluate ROC-AUC on out-of-fold predictions
roc_auc = roc_auc_score(train_data[target], oof_preds)
print(f"Out-of-Fold ROC-AUC: {roc_auc}")

# Save the test set predictions for later analysis in the XAI and causal discovery
test_data['predictions_RF'] = test_preds

Out-of-Fold ROC-AUC: 0.9730428651365423


In [6]:
# Write the out-of-fold and the test predictions to CSV, one file each.
# Each array is wrapped in a single-column DataFrame and saved with the default
# to_csv settings. After the loop `df` holds the test-prediction frame.
for values, csv_name in (
    (oof_preds, 'oof_preds_rf.csv'),
    (test_preds, 'test_preds_rf.csv'),
):
    df = pd.DataFrame(values)
    df.to_csv(csv_name)