In [1]:
from pprint import pprint
import pandas as pd
import numpy as np
import gc
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


In [None]:
# Data Directory
data_dir = 'dataset/'

# # kaggle Data Directory
# data_dir = '/kaggle/input/train-test/dataset/'


def _read_table(file_name):
    """Read one CSV file located directly under ``data_dir``."""
    return pd.read_csv(data_dir + file_name)


# Each split has a base table (*_1) and two side tables (*_2_1, *_2_2).
train_1 = _read_table('train_1.csv')
train_2_1 = _read_table('train_2_1.csv')
train_2_2 = _read_table('train_2_2.csv')
test_1 = _read_table('test_1.csv')
test_2_1 = _read_table('test_2_1.csv')
test_2_2 = _read_table('test_2_2.csv')

In [3]:
# Keep only the first row per 'id' in every side table so the left joins
# below cannot multiply rows of the base tables.
train_2_1, train_2_2, test_2_1, test_2_2 = (
    side.drop_duplicates(subset='id')
    for side in (train_2_1, train_2_2, test_2_1, test_2_2)
)

# Left-join both side tables onto the base table of each split, keyed on 'id'.
train = (
    train_1
    .merge(train_2_1, on='id', how='left')
    .merge(train_2_2, on='id', how='left')
)
test = (
    test_1
    .merge(test_2_1, on='id', how='left')
    .merge(test_2_2, on='id', how='left')
)

# The literal string "NR" is treated as a missing value.
train = train.replace("NR", np.nan)
test = test.replace("NR", np.nan)

In [5]:
from scipy import stats

# Split column labels by dtype: int64/float64 are "numerical", object is "categorical".
numerical_cols_train, categorical_cols_train = (
    train.select_dtypes(include=dtypes).columns
    for dtypes in (['int64', 'float64'], ['object'])
)
numerical_cols_test, categorical_cols_test = (
    test.select_dtypes(include=dtypes).columns
    for dtypes in (['int64', 'float64'], ['object'])
)

In [6]:
# Convert a column to numeric when possible
def convert_to_numeric(df, col):
    """Cast ``df[col]`` to a numeric dtype in place, if every value parses.

    This is a best-effort conversion: when ``pd.to_numeric`` rejects the
    column, the column is left exactly as it was.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    col : hashable
        Label of the column to convert.

    Returns
    -------
    None
    """
    try:
        df[col] = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        # ValueError: a string that does not parse as a number.
        # TypeError: a non-scalar cell (e.g. a nested list), which would
        # otherwise abort the whole best-effort pass.
        pass

# Try the numeric cast on every object-typed column of each split.
for frame, object_cols in (
    (train, categorical_cols_train),
    (test, categorical_cols_test),
):
    for col in object_cols:
        convert_to_numeric(frame, col)

In [7]:
# Convert string to list
import ast

def convert_to_list(value):
    """Parse a stringified Python literal such as ``"[1, 2, 3]"``.

    Parameters
    ----------
    value : object
        Cell value. Strings are stripped of surrounding double quotes and
        handed to ``ast.literal_eval``.

    Returns
    -------
    object
        * ``value`` itself when it is already a list (so re-applying the
          function to an already-parsed column is harmless);
        * the evaluated literal when ``value`` is a parseable string;
        * ``np.nan`` for missing values, non-string scalars, and strings
          that are not valid literals.
    """
    # Must be checked before anything else: pd.isna() is element-wise on a
    # list and its result cannot be used as a plain boolean.
    if isinstance(value, list):
        return value
    # Covers NaN/None as well as any other scalar that has no .strip().
    if not isinstance(value, str):
        return np.nan
    try:
        return ast.literal_eval(value.strip('"'))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Every exception literal_eval is documented to raise on malformed input.
        return np.nan
def handle_NaN(value):
    """Replace a float cell with an empty list; pass anything else through.

    The check is on the type only: every ``float`` instance (NaN included)
    becomes ``[]``.
    """
    return [] if isinstance(value, float) else value
# Parse the two stringified-list columns, then turn the float placeholders
# left behind for missing/unparseable cells into empty lists.
for frame in (train, test):
    for col in ('add_671_x', 'add_671_y'):
        frame[col] = frame[col].apply(convert_to_list).apply(handle_NaN)


In [None]:
def flatten_lists(df1,df2, col):
    """Expand the list-valued column ``col`` into one column per position, in place.

    For each position ``i`` below the longest list found in either frame, a
    column named ``col + "_" + str(i)`` is added to both frames. It holds the
    i-th list element (NaN where the list is shorter) and is then passed
    through ``convert_to_numeric``. Finally ``col`` itself is dropped from
    both frames.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to modify in place; both must contain ``col``.
    col : str
        Name of the column whose cells support ``len`` and indexing.

    Returns
    -------
    The longest list length across both frames.
    """
    widest = max(df1[col].apply(len).max(), df2[col].apply(len).max())

    def element_at(position):
        # Getter for the element at `position`, NaN when the list is too short.
        return lambda items: items[position] if position < len(items) else np.nan

    for position in range(widest):
        new_col = col + "_" + str(position)
        for frame in (df1, df2):
            frame[new_col] = frame[col].apply(element_at(position))
            convert_to_numeric(frame, new_col)

    for frame in (df1, df2):
        frame.drop(columns=[col], inplace=True)
    return widest
# Expand both list columns; the last call's return value is the cell output.
flatten_lists(train, test, col='add_671_x')
flatten_lists(train, test, col='add_671_y')


In [None]:
# Re-derive the numerical / categorical column lists from the current dtypes.
numerical_cols_train, categorical_cols_train = (
    train.select_dtypes(include=dtypes).columns
    for dtypes in (['int64', 'float64'], ['object'])
)
numerical_cols_test, categorical_cols_test = (
    test.select_dtypes(include=dtypes).columns
    for dtypes in (['int64', 'float64'], ['object'])
)

print('Categorical columns train: ', categorical_cols_train)
print('Categorical columns test: ', categorical_cols_test)

# Report the cardinality of every categorical column, train first, then test.
for split_name, frame, object_cols in (
    ('train', train, categorical_cols_train),
    ('test', test, categorical_cols_test),
):
    print(f"\nDistinct values in categorical columns ({split_name}):")
    for col in object_cols:
        print(f"{col}: {frame[col].nunique()}")


In [10]:
# Impute missing values.
#
# Numerical columns are filled with column means computed on the TRAIN split
# only, and the same fill value is reused for the test split. Filling test
# with its own means would encode the same raw "missing" differently in the
# two splits, so the model would score test rows on values it never saw
# paired with missingness during training.
train_means = train[numerical_cols_train].mean()

for col in numerical_cols_train:
    train[col] = train[col].fillna(train_means[col])
for col in categorical_cols_train:
    train[col] = train[col].fillna('NA')

for col in numerical_cols_test:
    # Fall back to the test mean only for a column that is not numeric in train.
    fill_value = train_means[col] if col in train_means.index else test[col].mean()
    test[col] = test[col].fillna(fill_value)
for col in categorical_cols_test:
    test[col] = test[col].fillna('NA')

In [None]:
# Drop the raw train table and all side tables, then force a collection.
# test_1 is not deleted here: later cells read test_1['loan_id'].
del train_1, train_2_1, train_2_2, test_2_1, test_2_2
gc.collect()

In [12]:
# Every column except the identifiers and the target is used as a feature.
non_feature_cols = ['id', 'label', 'loan_id']
features = train.columns.drop(non_feature_cols)

X_train, y_train = train[features], train['label']
X_test = test[features]

In [None]:
# The merged frames are no longer needed once X_train / y_train / X_test exist.
del test, train
gc.collect()

In [14]:
# Identifier columns must not be one-hot encoded or used as features.
id_like = {'loan_id', 'id'}
categorical_cols_train = [name for name in categorical_cols_train if name not in id_like]
categorical_cols_test = [name for name in categorical_cols_test if name not in id_like]

# errors='ignore' makes this a no-op when the identifiers are already gone.
X_train = X_train.drop(columns=['loan_id', 'id'], errors='ignore')
X_test = X_test.drop(columns=['loan_id', 'id'], errors='ignore')

# One-hot encode the categorical columns of each split independently ...
X_train = pd.get_dummies(X_train, columns=categorical_cols_train)
X_test = pd.get_dummies(X_test, columns=categorical_cols_test)

# ... then keep only the columns that exist in both splits.
X_train, X_test = X_train.align(X_test, join='inner', axis=1, fill_value=0)

# Strip the characters '[', ']', '<' and '>' from every column name.
bracket_stripper = str.maketrans('', '', '[]<>')
X_train.columns = [str(name).translate(bracket_stripper) for name in X_train.columns]
X_test.columns = [str(name).translate(bracket_stripper) for name in X_test.columns]



In [None]:

import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from zmq import device

# Hold out 20% of the training data for validation.
# The split is bound to its own names (X_fit / y_fit) instead of overwriting
# X_train / y_train: re-running this cell no longer shrinks the data each
# time, and later cells that read X_train still receive the full training set.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# XGBoost classifier with fixed hyperparameters.
model = xgb.XGBClassifier(
    reg_lambda=9.462550627558011e-06,   # L2 regularisation
    reg_alpha=0.0004394598576628656,    # L1 regularisation (wrapper name for native `alpha`)
    subsample=0.933188231971851,
    colsample_bytree=0.5047365676319157,
    max_depth=10,
    learning_rate=0.019850318615112727,
    n_estimators=331,
    min_child_weight=8,
    gamma=4.346407258134641,
    eval_metric='auc',
    random_state=42,
    booster='dart',
    tree_method='hist',
    device='cuda'
)
# AUC on the held-out set is reported after every boosting round.
model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=True
)


In [None]:
# Score the test set; column 1 of predict_proba is the probability of class 1.
predictions = model.predict_proba(X_test)[:, 1]

# One row per test_1 row: its loan_id next to the predicted probability.
submission = test_1[['loan_id']].assign(prob=predictions)
submission.to_csv('submission.csv', index=False)

In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score

# 80/20 split; note that X_train / y_train are rebound to the 80% part.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


# Objective function for Optuna
def objective(trial):
    """Score one hyperparameter sample by 5-fold cross-validated AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Highest mean test-fold AUC reached over the boosting rounds.

    Notes
    -----
    Reads the module-level ``X_train`` / ``y_train``. The number of rounds
    kept by early stopping is stored on the trial under the user attribute
    ``'best_num_boost_round'``.
    """
    # Sample hyperparameters
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        # 'tree_method': 'hist',  # Use GPU
        # 'device': 'cuda',       # Use GPU
        'eval_metric': 'auc',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 5)
    }

    # 'n_estimators' is a scikit-learn-wrapper name. The native API ignores it
    # inside `params` and takes the round count via `num_boost_round`, so the
    # sampled value is moved there; otherwise it would be searched over
    # without having any effect on the score.
    max_rounds = param.pop('n_estimators')

    # Only the training matrix is needed: xgb.cv builds its own folds.
    dtrain = xgb.DMatrix(X_train, label=y_train)

    cv_results = xgb.cv(
        params=param,
        dtrain=dtrain,
        nfold=5,  # 5-fold cross-validation
        num_boost_round=max_rounds,  # upper bound; early stopping may end sooner
        early_stopping_rounds=20,
        metrics='auc',
        seed=42,
        verbose_eval=False
    )

    # With early stopping the returned frame is cut at the best round, so its
    # length is the number of rounds worth training with these parameters.
    trial.set_user_attr('best_num_boost_round', len(cv_results))

    # Extract the best score from cross-validation results
    return cv_results['test-auc-mean'].max()


In [None]:
# Higher is better: the objective returns a ROC AUC.
study = optuna.create_study(direction='maximize')

# Search budget: n_trials=100 and timeout=3600; adjust both as needed.
study.optimize(func=objective, n_trials=100, timeout=3600)

# Parameters of the best-scoring trial.
best_params = study.best_params
print("Best hyperparameters: ", best_params)

In [None]:
import optuna.visualization as vis

def _plot_and_save(make_plot, png_path):
    """Build one Optuna figure for ``study``, write it to ``png_path``, return it."""
    figure = make_plot(study)
    figure.write_image(png_path)
    return figure


# Optimization history, hyperparameter importance, parallel coordinates.
opt_history = _plot_and_save(vis.plot_optimization_history, "optuna_optimization_history.png")
param_importance = _plot_and_save(vis.plot_param_importances, "optuna_param_importance.png")
parallel_plot = _plot_and_save(vis.plot_parallel_coordinate, "optuna_parallel_coordinate.png")

In [None]:
# Persist the best hyperparameters and their score as a two-line text file.
with open('best_hyperparams.txt', 'w') as report:
    report.writelines([
        f"Best Hyperparameters: {best_params}\n",
        f"Best AUC Score: {study.best_value}\n",
    ])

In [21]:
# Layer the fixed (non-tuned) settings on top of the tuned values.
# NOTE(review): best_params still carries 'n_estimators' from the study; the
# native xgb.train API presumably ignores it, and the round count below is
# fixed by num_boost_round instead - confirm this is intended.
best_params.update({
    'verbosity': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
})

dtrain = xgb.DMatrix(data=X_train, label=y_train)

# Fit the final booster on the current X_train / y_train for 1000 rounds.
final_model = xgb.train(params=best_params, dtrain=dtrain, num_boost_round=1000)

In [None]:
# Write the trained booster to model.json in the working directory.
final_model.save_model(fname='model.json')

In [22]:
# Score the test features with the final booster.
dtest = xgb.DMatrix(X_test)
test_preds = final_model.predict(dtest)

# Prepare submission file: loan_id from test_1 next to the predicted score.
submission = pd.DataFrame({'loan_id': test_1['loan_id']})
submission['prob'] = test_preds

submission.to_csv('submission.csv', index=False)