In [6]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import optuna

# Random seed for reproducibility
SEED = 1123
def set_global_seed(seed):
    np.random.seed(seed)
    
set_global_seed(SEED)

def read_and_process_data():
    """Reads and processes the data."""
    train_df = pd.read_csv('data/train.csv')
    test_df = pd.read_csv('data/test.csv')
    for df in [train_df, test_df]:
        process_employee_data(df)
    return train_df, test_df

def process_employee_data(df):
    """Processes the employee columns to convert them into float type."""
    df.employee1 = df.employee1.astype('str').str.replace(",", "").astype('float')
    df.employee2 = df.employee2.astype('str').str.replace(",", "").astype('float')

def fill_missing_values(df):
    """Fills the missing values in the dataframe."""
    df.loc[df.inst_id == 430, ['instkind']] = 'dental_clinic'
    df.loc[df.inst_id == 430, ['bedCount']] = 0
    df.loc[df.inst_id == 413, ['bedCount']] = -999

    factor_columns = df.select_dtypes(include=['object']).columns
    numeric_columns = df.columns.difference(factor_columns)
    df[factor_columns] = df[factor_columns].fillna('Not_sure')
    df[numeric_columns] = df[numeric_columns].fillna(-999)

def label_encode(df, factor_columns):
    """Encodes the categorical columns using Label Encoder."""
    fac_le = LabelEncoder()
    df[factor_columns] = df.loc[:, factor_columns].apply(lambda x: fac_le.fit_transform(x))


def objective_for_rf(trial, X, y):
    """Objective function for tuning Random Forest using Optuna."""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 150),
        'max_depth': trial.suggest_int('max_depth', 2, 32, log=True),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 16),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 16),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])
    }
    model = RandomForestClassifier(**params, random_state=SEED)
    return np.mean(cross_val_score(model, X, y, cv=3, scoring='f1'))

def objective_for_gbm(trial, X, Y):
    """Objective function for tuning Gradient Boosting using Optuna."""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 150),
        'max_depth': trial.suggest_int('max_depth', 2, 32, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 16),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 16)
    }


    model = GradientBoostingClassifier(**params)
    return np.mean(cross_val_score(model, X, Y, cv=3, scoring='f1'))

def objective_for_xgb(trial, X, Y):
    """Objective function for tuning XGBoost using Optuna."""
    
    dtrain = xgb.DMatrix(X, label=Y)
    
    param = {
        'silent': 1,
        'objective': 'binary:logistic',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
    }

    if param['booster'] == 'gbtree' or param['booster'] == 'dart':
        param['max_depth'] = trial.suggest_int('max_depth', 1, 9)
        param['n_estimators'] = trial.suggest_int('n_estimators', 50, 300)
        param['learning_rate'] = trial.suggest_loguniform('learning_rate', 1e-8, 1.0)
        param['gamma'] = trial.suggest_loguniform('gamma', 1e-8, 1.0)
        param['grow_policy'] = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
        
    if param['booster'] == 'dart':
        param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
        param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
        param['rate_drop'] = trial.suggest_loguniform('rate_drop', 1e-8, 1.0)
        param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
    
    bst = xgb.train(param, dtrain)
    preds = bst.predict(dtrain)
    
    pred_labels = np.rint(preds)
    return f1_score(Y, pred_labels)

def find_best_threshold(probs, y_true):
    best_f1 = 0
    best_threshold = 0
    for threshold in np.linspace(0, 1, 200): 
        preds = (probs >= threshold).astype(int)
        f1 = f1_score(y_true, preds)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold

def main():
    # Load and process data
    train_data, test_data = read_and_process_data()
    
    # Fill missing values
    fill_missing_values(train_data)
    fill_missing_values(test_data)
    
    # Process employee columns
    process_employee_data(train_data)
    process_employee_data(test_data)

    # Define columns that need to be label encoded
    factor_columns = train_data.select_dtypes(include=['object']).columns
    label_encode(train_data, factor_columns)
    label_encode(test_data, factor_columns)

    # Extract features and target variable
    X = train_data.drop(columns=['OC'])
    y = train_data['OC']
    
    # Split data into training and validation sets
    train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=SEED)

    # Optuna optimization for RandomForest
    study_rf = optuna.create_study(direction='maximize')
    study_rf.optimize(lambda trial: objective_for_rf(trial, train_X, train_y), n_trials=10)

    # Optuna optimization for GradientBoosting
    study_gbm = optuna.create_study(direction='maximize')
    study_gbm.optimize(lambda trial: objective_for_gbm(trial, train_X, train_y), n_trials=10)

    # Optuna optimization for XGBoost
    study_xgb = optuna.create_study(direction='maximize')
    study_xgb.optimize(lambda trial: objective_for_xgb(trial, train_X, train_y), n_trials=10)

    # Train models with optimized hyperparameters
    RF_model = RandomForestClassifier(**study_rf.best_params, random_state=SEED).fit(train_X, train_y)
    GBM_model = GradientBoostingClassifier(**study_gbm.best_params).fit(train_X, train_y)
    XGB_model = xgb.XGBClassifier(**study_xgb.best_params).fit(train_X, train_y)

    # Ensemble predictions on validation set
    ensemble_val = pd.DataFrame({
        'RF': RF_model.predict_proba(val_X)[:, 1],
        'GBM': GBM_model.predict_proba(val_X)[:, 1],
        'XGB': XGB_model.predict_proba(val_X)[:, 1]
    })
    ensemble_val['ens'] = ensemble_val.mean(axis=1)
    best_threshold = find_best_threshold(ensemble_val['ens'], val_y)

    # Ensemble predictions on test set
    ensemble = pd.DataFrame({
        'inst_id': test_data['inst_id'],
        'RF': RF_model.predict_proba(test_data.drop(columns=['OC']))[:, 1],
        'GBM': GBM_model.predict_proba(test_data.drop(columns=['OC']))[:, 1],
        'XGB': XGB_model.predict_proba(test_data.drop(columns=['OC']))[:, 1]
    })
    ensemble['ens'] = ensemble.mean(axis=1)
    ensemble['OC'] = (ensemble['ens'] >= best_threshold).astype(int)

    # Save the ensemble predictions
    submission = ensemble[['inst_id', 'OC']]
    submission.to_csv('ensemble_submission.csv', index=False)
    
main()


[I 2023-08-24 10:19:09,499] A new study created in memory with name: no-name-4c80b8c1-c83b-404d-9b04-eb0a4913495d


[I 2023-08-24 10:19:10,097] Trial 0 finished with value: 0.9743589743589743 and parameters: {'n_estimators': 140, 'max_depth': 20, 'min_samples_split': 12, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.9743589743589743.
[I 2023-08-24 10:19:10,437] Trial 1 finished with value: 0.9743589743589743 and parameters: {'n_estimators': 80, 'max_depth': 25, 'min_samples_split': 6, 'min_samples_leaf': 15, 'max_features': 'log2'}. Best is trial 0 with value: 0.9743589743589743.
[I 2023-08-24 10:19:10,908] Trial 2 finished with value: 0.9743589743589743 and parameters: {'n_estimators': 114, 'max_depth': 2, 'min_samples_split': 13, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 0 with value: 0.9743589743589743.
[I 2023-08-24 10:19:11,329] Trial 3 finished with value: 0.9743589743589743 and parameters: {'n_estimators': 102, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 13, 'max_features': 'log2'}. Best is trial 0 with value: 0.9743589743589

Parameters: { "n_estimators", "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "n_estimators", "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "n_estimators", "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "n_estimators", "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "n_estimators", "silent" } are not used.

