In [1]:
from pprint import pprint
import pandas as pd
import numpy as np
import gc
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


In [16]:
# # Data Directory
# data_dir = 'dataset/'

# # kaggle Data Directory
data_dir = '/kaggle/input/train-test/dataset/'

# Load the data
train_1 = pd.read_csv(data_dir + 'train_1.csv')
train_2_1 = pd.read_csv(data_dir + 'train_2_1.csv')
train_2_2 = pd.read_csv(data_dir + 'train_2_2.csv')
test_1 = pd.read_csv(data_dir + 'test_1.csv')
test_2_1 = pd.read_csv(data_dir + 'test_2_1.csv')
test_2_2 = pd.read_csv(data_dir + 'test_2_2.csv')

  train_1 = pd.read_csv(data_dir + 'train_1.csv')
  train_2_1 = pd.read_csv(data_dir + 'train_2_1.csv')
  train_2_2 = pd.read_csv(data_dir + 'train_2_2.csv')


In [17]:
# Step 1: Remove duplicates based on 'id' in the additional information DataFrames
train_2_1 = train_2_1.drop_duplicates(subset='id')
train_2_2 = train_2_2.drop_duplicates(subset='id')
test_2_1 = test_2_1.drop_duplicates(subset='id')
test_2_2 = test_2_2.drop_duplicates(subset='id')

# Step 2: Identify loans with and without bureau data
# Create 'train_with_bureau': Loans present in both train_1, train_2_1, and train_2_2
train_with_bureau = pd.merge(train_1, train_2_1, on='id', how='inner')
train_with_bureau = pd.merge(train_with_bureau, train_2_2, on='id', how='inner')

# Create 'train_without_bureau': Loans present only in train_1 (no additional bureau data)
train_without_bureau = train_1[~train_1['id'].isin(train_with_bureau['id'])].copy()

# Step 3: Merge the test DataFrames similarly to how you handled train_with_bureau
# Create 'test_with_bureau': Loans present in both test_1, test_2_1, and test_2_2
test_with_bureau = pd.merge(test_1, test_2_1, on='id', how='inner')
test_with_bureau = pd.merge(test_with_bureau, test_2_2, on='id', how='inner')

# Create 'test_without_bureau': Loans present only in test_1 (no additional bureau data)
test_without_bureau = test_1[~test_1['id'].isin(test_with_bureau['id'])].copy()

# Step 4: Replace "NR" with NaN in both datasets
train_with_bureau.replace("NR", np.nan, inplace=True)
train_without_bureau.replace("NR", np.nan, inplace=True)
test_with_bureau.replace("NR", np.nan, inplace=True)
test_without_bureau.replace("NR", np.nan, inplace=True)

In [18]:
from scipy import stats

# Identify numerical and categorical columns for `train_with_bureau`
numerical_cols_train_with_bureau = train_with_bureau.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_train_with_bureau = train_with_bureau.select_dtypes(include=['object']).columns

# Identify numerical and categorical columns for `train_without_bureau`
numerical_cols_train_without_bureau = train_without_bureau.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_train_without_bureau = train_without_bureau.select_dtypes(include=['object']).columns

# Identify numerical and categorical columns for `test_with_bureau`
numerical_cols_test_with_bureau = test_with_bureau.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_test_with_bureau = test_with_bureau.select_dtypes(include=['object']).columns

# Identify numerical and categorical columns for `test_without_bureau`
numerical_cols_test_without_bureau = test_without_bureau.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_test_without_bureau = test_without_bureau.select_dtypes(include=['object']).columns


In [19]:
# Convert categorical columns to numeric if possible
def convert_to_numeric(df, col):
    try:
        df[col] = pd.to_numeric(df[col])  # 'coerce' will replace invalid parsing with NaN
    except ValueError:
        pass

# For `train_with_bureau`
for col in categorical_cols_train_with_bureau:
    convert_to_numeric(train_with_bureau, col)

# For `train_without_bureau`
for col in categorical_cols_train_without_bureau:
    convert_to_numeric(train_without_bureau, col)

# For `test_with_bureau`
for col in categorical_cols_test_with_bureau:
    convert_to_numeric(test_with_bureau, col)

# For `test_without_bureau`
for col in categorical_cols_test_without_bureau:
    convert_to_numeric(test_without_bureau, col)


In [20]:
# Import necessary libraries
import ast

# Function to convert string representations to lists
def convert_to_list(value):
    if pd.isna(value):
        return np.nan
    try:
        value = value.strip('"')  # Remove quotes
        return ast.literal_eval(value)  # Convert string to list
    except (ValueError, SyntaxError):
        return np.nan

# Handle NaN by converting floats to empty lists
def handle_NaN(value):
    if isinstance(value, float):
        return []  # Convert NaN to empty list for consistency
    return value

# Columns to process (specific to datasets with bureau data)
columns_to_convert = ['add_671_x', 'add_671_y']

# Apply transformations to `train_with_bureau`
for col in columns_to_convert:
    train_with_bureau[col] = train_with_bureau[col].apply(convert_to_list)
    train_with_bureau[col] = train_with_bureau[col].apply(handle_NaN)

# Apply transformations to `test_with_bureau`
for col in columns_to_convert:
    test_with_bureau[col] = test_with_bureau[col].apply(convert_to_list)
    test_with_bureau[col] = test_with_bureau[col].apply(handle_NaN)

In [21]:
# Function to flatten list columns and create separate features
def flatten_lists(df1, df2, col):
    # Get the maximum length of the lists in both DataFrames
    max_length = df1[col].apply(len).max()
    max_length = max(max_length, df2[col].apply(len).max())
    # Iterate through the max length and create new columns for each element in the list
    for i in range(max_length):
        name = col + "_" + str(i)
        # For df1 (train_with_bureau)
        df1[name] = df1[col].apply(lambda x: x[i] if i < len(x) else np.nan)
        convert_to_numeric(df1, name)
        # For df2 (test_with_bureau)
        df2[name] = df2[col].apply(lambda x: x[i] if i < len(x) else np.nan)
        convert_to_numeric(df2, name)
    # Drop the original list column as it has been flattened
    df1.drop(columns=[col], inplace=True)
    df2.drop(columns=[col], inplace=True)
    return max_length

# Apply flatten_lists only to datasets with bureau data
flatten_lists(train_with_bureau, test_with_bureau, 'add_671_x')
flatten_lists(train_with_bureau, test_with_bureau, 'add_671_y')


5

In [22]:
# Identify numerical and categorical columns for each dataset

# Train with bureau data
numerical_cols_train_with_bureau = train_with_bureau.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_train_with_bureau = train_with_bureau.select_dtypes(include=['object']).columns

# Train without bureau data
numerical_cols_train_without_bureau = train_without_bureau.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_train_without_bureau = train_without_bureau.select_dtypes(include=['object']).columns

# Test with bureau data
numerical_cols_test_with_bureau = test_with_bureau.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_test_with_bureau = test_with_bureau.select_dtypes(include=['object']).columns

# Test without bureau data
numerical_cols_test_without_bureau = test_without_bureau.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_test_without_bureau = test_without_bureau.select_dtypes(include=['object']).columns


In [23]:
# # Print categorical columns and distinct values for each dataset

# # Train with bureau
# print('Categorical columns (train_with_bureau): ', categorical_cols_train_with_bureau)
# print("\nDistinct values in categorical columns (train_with_bureau):")
# for col in categorical_cols_train_with_bureau:
#     print(f"{col}: {train_with_bureau[col].nunique()}")

# # Train without bureau
# print('Categorical columns (train_without_bureau): ', categorical_cols_train_without_bureau)
# print("\nDistinct values in categorical columns (train_without_bureau):")
# for col in categorical_cols_train_without_bureau:
#     print(f"{col}: {train_without_bureau[col].nunique()}")

# # Test with bureau
# print('Categorical columns (test_with_bureau): ', categorical_cols_test_with_bureau)
# print("\nDistinct values in categorical columns (test_with_bureau):")
# for col in categorical_cols_test_with_bureau:
#     print(f"{col}: {test_with_bureau[col].nunique()}")

# # Test without bureau
# print('Categorical columns (test_without_bureau): ', categorical_cols_test_without_bureau)
# print("\nDistinct values in categorical columns (test_without_bureau):")
# for col in categorical_cols_test_without_bureau:
#     print(f"{col}: {test_without_bureau[col].nunique()}")


In [24]:
from sklearn.impute import SimpleImputer

imputer_num = SimpleImputer(strategy='mean')
imputer_cat = SimpleImputer(strategy='constant', fill_value='NA')

dfs = [train_with_bureau, train_without_bureau, test_with_bureau, test_without_bureau]

for df in dfs:
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    cat_cols = df.select_dtypes(include=['object']).columns
    
    df[num_cols] = imputer_num.fit_transform(df[num_cols])
    df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

In [11]:
# # Release memory by deleting unnecessary variables and forcing garbage collection
# del  test_2_1, test_2_2, train_1, train_2_1, train_2_2
# gc.collect()

In [26]:
# Select relevant features for train_with_bureau
features_with_bureau = train_with_bureau.columns.drop(['id', 'label', 'loan_id'])
X_train_with_bureau = train_with_bureau[features_with_bureau]
y_train_with_bureau = train_with_bureau['label']

# Select relevant features for train_without_bureau
features_without_bureau = train_without_bureau.columns.drop(['id', 'label', 'loan_id'])
X_train_without_bureau = train_without_bureau[features_without_bureau]
y_train_without_bureau = train_without_bureau['label']

# Select relevant features for test_with_bureau
features_test_with_bureau = test_with_bureau.columns.drop(['id', 'loan_id'])
X_test_with_bureau = test_with_bureau[features_test_with_bureau]

# Select relevant features for test_without_bureau
features_test_without_bureau = test_without_bureau.columns.drop(['id', 'loan_id'])
X_test_without_bureau = test_without_bureau[features_test_without_bureau]


In [None]:
# import gc

# # Release memory by deleting unnecessary variables
# del train_with_bureau, train_without_bureau, test_with_bureau, test_without_bureau

# # Force garbage collection
# gc.collect()

In [27]:
import re

# Function to sanitize column names by removing special characters
def sanitize_column_names(df):
    # Replace or remove invalid characters (anything not a letter, number, or underscore)
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '_', col) for col in df.columns]
    return df

# Function to process datasets
def process_dataset(X_train, X_test, categorical_cols):
    # Remove 'loan_id' and 'id' from the list of categorical columns if they exist
    categorical_cols = [col for col in categorical_cols if col not in ['loan_id', 'id']]

    # Remove 'loan_id' and 'id' from the features if they exist
    X_train = X_train.drop(columns=['loan_id', 'id'], errors='ignore')
    X_test = X_test.drop(columns=['loan_id', 'id'], errors='ignore')

    # Convert categorical columns to numerical using one-hot encoding
    X_train = pd.get_dummies(X_train, columns=categorical_cols)
    X_test = pd.get_dummies(X_test, columns=categorical_cols)

    # Apply sanitization to both train and test datasets
    X_train = sanitize_column_names(X_train)
    X_test = sanitize_column_names(X_test)

    # Ensure the same columns in train and test after one-hot encoding
    X_train, X_test = X_train.align(X_test, join='inner', axis=1, fill_value=0)

    return X_train, X_test

# Process datasets with bureau data
X_train_with_bureau, X_test_with_bureau = process_dataset(X_train_with_bureau, X_test_with_bureau, categorical_cols_train_with_bureau)

# Process datasets without bureau data
X_train_without_bureau, X_test_without_bureau = process_dataset(X_train_without_bureau, X_test_without_bureau, categorical_cols_train_without_bureau)


In [47]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Function to train the model
def train_model(X_train, y_train):
    print(f"Trainig with {X_train}")
    
    # Split data for validation
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    model = xgb.XGBClassifier(
        booster='dart',
        reg_lambda=0.00005633433783615297,
        alpha=0.000017886579960049807,
        subsample=0.7695036259589584,
        colsample_bytree=0.6145464426199919,
        max_depth=6,
        learning_rate=0.01,
        n_estimators=637,
        min_child_weight=9,
        gamma=0.34168672204351896,
        eval_metric='auc',
        random_state=42,
        tree_method='hist',
        device='cuda'
    )

    model.fit(
        X_train_split, y_train_split,
        eval_set=[(X_val, y_val)],
        verbose=True
    )
    
    return model




In [40]:
# Train the model with bureau data
model_with_bureau = train_model(X_train_with_bureau, y_train_with_bureau)


Trainig with           col_1  col_2  col_3  col_4  col_5     col_6     col_7     col_8  \
0      0.004214  16.99    0.0    0.0    0.0  0.000766  0.000213  0.003886   
1      0.004880  28.00    0.0    0.0    0.0  0.001947  0.001069  0.001620   
2      0.005392  34.00    0.0    0.0    0.0  0.002217  0.001044  0.002108   
3      0.000901  30.00    0.0    0.0    0.0  0.000253  0.000034  0.001592   
4      0.000850  36.00    0.0    0.0    0.0  0.000632  0.000052  0.002150   
...         ...    ...    ...    ...    ...       ...       ...       ...   
45364  0.141421  14.62    0.0    0.0   37.0  0.205002  0.191226  0.002089   
45365  0.038488  14.22    0.0    0.0  203.0  0.032655  0.028844  0.013361   
45366  0.614863  12.47    0.0    0.0  234.0  0.749999  0.698917  0.010655   
45367  0.139209  12.72    0.0    0.0  234.0  0.145601  0.133762  0.003242   
45368  0.020997  14.22    0.0    0.0  234.0  0.006304  0.004095  0.004451   

          col_9    col_10  ...  add_677_x__  add_677_x_NA  add

In [None]:

# Train the model without bureau data
model_without_bureau = train_model(X_train_without_bureau, y_train_without_bureau)

In [None]:
# Function to make predictions and create a mapping
def make_predictions_with_mapping(model, X_test, loan_ids):
    # Make predictions on the test set
    predictions = model.predict_proba(X_test)[:, 1]  # Get probabilities for class 1

    # Create a dictionary to map loan_id to its prediction
    prediction_mapping = {loan_id: prediction for loan_id, prediction in zip(loan_ids, predictions)}

    return prediction_mapping

# Make predictions for the test set with bureau data
predictions_with_bureau_mapping = make_predictions_with_mapping(model_with_bureau, X_test_with_bureau, test_with_bureau['loan_id'])

# Make predictions for the test set without bureau data
predictions_without_bureau_mapping = make_predictions_with_mapping(model_without_bureau, X_test_without_bureau, test_without_bureau['loan_id'])

# Combine the mappings
combined_predictions_mapping = {**predictions_with_bureau_mapping, **predictions_without_bureau_mapping}

# Create the submission DataFrame using the combined mapping
submission = pd.DataFrame({
    'loan_id': combined_predictions_mapping.keys(),
    'prob': combined_predictions_mapping.values()
})

# Save the predictions to a single submission file
submission.to_csv('submission.csv', index=False)

In [91]:
from lightgbm import LGBMClassifier
def train_lgb_model(X_train, y_train):
    print(f"Training with {X_train.shape} shaped data")
    
    # Split data for validation
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    
    # Initialize LGBMClassifier with specified parameters
    model = LGBMClassifier(
        device='gpu',
        gpu_platform_id=0,
        gpu_device_id=0,
        n_estimators=1000,
        learning_rate=0.01,
        num_leaves=50,
        max_depth= 12,
        min_child_samples=30,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        # Additional parameters that might be useful
        boosting_type='gbdt',
        objective='binary',
        metric='auc',
        
    )
    
    # Train the model with validation set
    model.fit(
        X_train_split, 
        y_train_split,
        eval_set=[(X_train_split, y_train_split), (X_val, y_val)],
        eval_names=['train', 'valid'],
        eval_metric='auc',
        callbacks=[
            lgb.callback.early_stopping(stopping_rounds=50),
            lgb.callback.log_evaluation(period=100)
        ]
    )
    
#     # Get validation score
#     val_preds = model.predict_proba(X_val)[:, 1]
#     val_score = model.best_score_['valid_0']['auc']  # Extract the specific value
#     print(f"\nBest validation AUC: {val_score:.4f}")

    # Return the model and validation score
    return model, val_score




In [92]:
# Train the model with bureau data
model_with_bureau_lgb = train_lgb_model(X_train_with_bureau, y_train_with_bureau)



Training with (45369, 1615) shaped data
[LightGBM] [Info] Number of positive: 1925, number of negative: 34370
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 116616
[LightGBM] [Info] Number of data points in the train set: 36295, number of used features: 1005
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 617 dense feature groups (21.46 MB) transferred to GPU in 0.018761 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.053038 -> initscore=-2.882258
[LightGBM] [Info] Start training from score -2.882258
Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.908995	valid's auc: 0.838428
[200]	train's auc: 0.951222	valid's auc: 0.854506
[300]	train's au

KeyError: 'auc'

In [87]:
# Train the model without bureau data
model_without_bureau_lgb = train_lgb_model(X_train_without_bureau, y_train_without_bureau)


Training with (54631, 254) shaped data
[LightGBM] [Info] Number of positive: 1408, number of negative: 42296
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 28876
[LightGBM] [Info] Number of data points in the train set: 43704, number of used features: 230
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 82 dense feature groups (3.50 MB) transferred to GPU in 0.003566 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.032217 -> initscore=-3.402522
[LightGBM] [Info] Start training from score -3.402522
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 82 dense feature groups (2.79 MB) transferred to GPU in 0.003067 secs. 1 sparse feature groups
Training until valid

TypeError: unsupported format string passed to collections.OrderedDict.__format__

In [98]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def train_rf_model(X_train, y_train):
    print(f"Training with {X_train.shape} shaped data")
    
    # Split data for validation
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    
    # Initialize RandomForestClassifier with parameters
    model = RandomForestClassifier(
        n_estimators=300,
        max_depth=12,           # Similar to LightGBM
        min_samples_leaf=30,    # Similar to min_child_samples
        max_features=0.8,       # Similar to colsample_bytree
        max_samples=0.8,        # Similar to subsample
        random_state=42,
        n_jobs=-1,             # Use all CPU cores
        class_weight='balanced', # Handle imbalanced datasets
        # Additional parameters
        min_samples_split=10,
        bootstrap=True,
        verbose=1
    )
    
    # Train the model
    model.fit(X_train_split, y_train_split)
    
    # Calculate validation scores
    train_preds = model.predict_proba(X_train_split)[:, 1]
    val_preds = model.predict_proba(X_val)[:, 1]
    
    train_score = roc_auc_score(y_train_split, train_preds)
    val_score = roc_auc_score(y_val, val_preds)
    
    print(f"\nTraining AUC: {train_score:.4f}")
    print(f"Validation AUC: {val_score:.4f}")
    
#     # Optional: Print feature importances
#     if hasattr(X_train, 'columns'):
#         importances = pd.DataFrame({
#             'feature': X_train.columns,
#             'importance': model.feature_importances_
#         })
#         print("\nTop 10 Most Important Features:")
#         print(importances.sort_values('importance', ascending=False).head(10))
    
    return model, val_score



In [99]:
# Example usage:
rf_model, rf_score = train_rf_model(X_train_with_bureau, y_train_with_bureau)

Training with (45369, 1615) shaped data


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    0.3s finished



Training AUC: 0.9634
Validation AUC: 0.8438


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    0.1s finished


In [None]:
# Function to make predictions
def make_predictions(model, X_test):
    # Make predictions on the test set
    return model.predict_proba(X_test)[:, 1]  # Get probabilities for class 1

# Make predictions for the test set with bureau data
predictions_with_bureau = make_predictions(model_with_bureau, X_test_with_bureau)

# Make predictions for the test set without bureau data
predictions_without_bureau = make_predictions(model_without_bureau, X_test_without_bureau)
# Combine loan_ids from both test sets
loan_ids = pd.concat([test_with_bureau['loan_id'], test_without_bureau['loan_id']], ignore_index=True)

# Combine predictions from both models
predictions = np.concatenate([predictions_with_bureau, predictions_without_bureau])

# Create the submission DataFrame
submission = pd.DataFrame({
    'loan_id': loan_ids,
    'prob': predictions
})
# Save the predictions to a single submission file
submission.to_csv('submission.csv', index=False)


In [None]:
# Get the number of rows
num_rows = submission.shape[0]

print("Number of rows in the DataFrame:", num_rows)

In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score

# Split data for validation
X_train_with_bureau, X_valid_with_bureau, y_train_with_bureau, y_valid_with_bureau = train_test_split(
    X_train_with_bureau, y_train_with_bureau, test_size=0.2, random_state=42
)

X_train_without_bureau, X_valid_without_bureau, y_train_without_bureau, y_valid_without_bureau = train_test_split(
    X_train_without_bureau, y_train_without_bureau, test_size=0.2, random_state=42
)

# Objective function for Optuna
def objective(trial, X_train, y_train, X_valid, y_valid):
    # Sample hyperparameters
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        # 'tree_method': 'hist',  # Use GPU
        # 'device': 'cuda',       # Use GPU
        'eval_metric': 'auc',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),
        'n_estimators': trial.suggest_int('n_estimators', 100, 700),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 5)
    }

    # Create XGBoost DMatrix for train and validation sets
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    cv_results = xgb.cv(
        params=param,
        dtrain=dtrain,
        nfold=5,  # 5-fold cross-validation
        num_boost_round= 50,
        early_stopping_rounds=5,
        metrics='auc',
        seed=42,
        verbose_eval=True
    )
    # Extract the best score from cross-validation results
    mean_auc = cv_results['test-auc-mean'].max()
    return mean_auc


In [None]:
# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')  # We want to maximize the ROC AUC score

n_trails = 100
timeout = 3600*3
# Optimize the study for the model with bureau data
study.optimize(lambda trial: objective(trial, X_train_with_bureau, y_train_with_bureau, X_valid_with_bureau, y_valid_with_bureau), n_trials=n_trails, timeout=timeout)

# Get the best hyperparameters for the model with bureau data
best_params_with_bureau = study.best_params
print("Best hyperparameters (with bureau data): ", best_params_with_bureau)



In [None]:
import json
# Define the file path
file_path = 'best_params_with_bureau.json'

# Write the best hyperparameters to a JSON file
with open(file_path, 'w') as file:
    json.dump(best_params_with_bureau, file, indent=4)

print(f"Best hyperparameters saved to {file_path}")

In [None]:
# Reset and optimize for the model without bureau data
study = optuna.create_study(direction='maximize')  # Resetting the study for the next optimization
study.optimize(lambda trial: objective(trial, X_train_without_bureau, y_train_without_bureau, X_valid_without_bureau, y_valid_without_bureau), n_trials=n_trails, timeout=timeout)

# Get the best hyperparameters for the model without bureau data
best_params_without_bureau = study.best_params
print("Best hyperparameters (without bureau data): ", best_params_without_bureau)

In [None]:
# Define the file path
file_path = 'best_params_without_bureau.json'

# Write the best hyperparameters to a JSON file
with open(file_path, 'w') as file:
    json.dump(best_params_without_bureau, file, indent=4)

print(f"Best hyperparameters saved to {file_path}")