In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc
import xgboost as xgb
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from autogluon.tabular import TabularDataset, TabularPredictor
import matplotlib.pylab as plt


## Data Processing

In [None]:
# Load the labelled training set and the unlabelled test set from the working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Hold out 20% of the labelled rows for validation; the split is seeded.
X_train, X_val, y_train, y_val = train_test_split(
    train_data.drop(columns='label'),
    train_data['label'],
    train_size=0.8,
    random_state=42,
)

# AutoGluon takes features and target in one frame, so re-attach the label
# column to a copy of the training features (X_train itself stays label-free).
ag_train = X_train.assign(label=y_train)

## LGBMClassifier

In [None]:
def LGBM_objective(trial):
    """Optuna objective: train one LightGBM booster and score it on the validation split.

    Args:
        trial: the Optuna trial used to sample the hyperparameters below.

    Returns:
        ROC AUC of the booster's predictions on (X_val, y_val).
    """
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 50, 250),
        'max_depth': trial.suggest_int('max_depth', 6, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.03),
        'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        # NOTE(review): LightGBM documents that bagging (subsample) only takes
        # effect when the bagging frequency is non-zero; none is set here, so
        # this search dimension may be inert -- confirm before relying on it.
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0),
        'random_state': 42,
        'early_stopping': 100,
        'verbose': -1
    }

    # Build the training Dataset once and use that same object as the
    # validation set's reference, instead of constructing a second,
    # throwaway training Dataset only to serve as the reference.
    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    lgb_model = lgb.train(
        param,
        train_set,
        valid_sets=[valid_set],
    )

    y_pred = lgb_model.predict(X_val)
    auc_score = roc_auc_score(y_val, y_pred)
    return auc_score

# Seeded TPE sampler for the LightGBM hyperparameter search.
sampler = TPESampler(seed=42)

# Higher validation AUC is better, so the study maximizes the objective.
lgbm_study = optuna.create_study(sampler=sampler, direction='maximize')
lgbm_study.optimize(LGBM_objective, n_trials=200)

In [None]:
# Best objective value found by the study, then the hyperparameters that produced it.
print(lgbm_study.best_value, lgbm_study.best_params, sep='\n')

In [None]:
# Rebuild a scikit-learn style LightGBM classifier from the best trial's
# hyperparameters plus the same fixed (non-tuned) settings.
lgbm_best_params = lgbm_study.best_params

optimal_lgbm = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    random_state=42,
    early_stopping=100,
    **lgbm_best_params,
)

# The validation split is passed as the evaluation set, scored with AUC.
optimal_lgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
)

In [None]:
# Show the fitted classifier's best_score_ attribute (presumably the AUC on the eval_set passed to fit -- confirm).
optimal_lgbm.best_score_

In [None]:
# Inspect the validation labels produced by the train/validation split.
y_val

In [None]:
# Column 1 of predict_proba holds the scores used for the ROC analysis.
lgbm_y_pred = optimal_lgbm.predict_proba(X_val)[:, 1]

# ROC points on the validation split and the area under that curve.
lgbm_fpr, lgbm_tpr, lgbm_thresholds = roc_curve(y_val, lgbm_y_pred)
lgbm_auroc = auc(lgbm_fpr, lgbm_tpr)
print(f"LightGBM AUROC: {lgbm_auroc}")

# Model curve, plus the dashed diagonal as the chance-level reference.
plt.plot(lgbm_fpr, lgbm_tpr)
plt.plot((0, 1), (0, 1), linestyle='--', lw=2, color='navy')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

# Assign the return value so the Text object's repr is not echoed by the cell.
_ = plt.title('LightGBM ROC Curve')


In [None]:
# Score the test set with the tuned LightGBM model, keeping column index 1 of
# predict_proba (presumably the positive class -- confirm against classes_).
lgbm_predictions = optimal_lgbm.predict_proba(test_data)[:, 1]
lgbm_predictions

## XGBoost

In [None]:
# Inspect the training features (label column already dropped by the split cell).
X_train

In [None]:
def XGBCobjective(trial):
    """Optuna objective: fit one XGBoost classifier and score it on the validation split.

    Args:
        trial: the Optuna trial used to sample the hyperparameters below.

    Returns:
        ROC AUC of the positive-class probabilities on (X_val, y_val).
    """
    xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'random_state': 42,
        'early_stopping_rounds': 200,
        'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.03),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 3),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 3),
        'gamma': trial.suggest_float('gamma', 0.01, 2),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
    }

    xgbc = xgb.XGBClassifier(**xgb_params, verbosity=0)

    # verbose=False stops fit() from printing the eval metric on every boosting
    # round; `verbosity=0` above only silences XGBoost's logger, not that
    # per-round evaluation log, which would otherwise flood the notebook
    # across 200 trials of up to 3000 rounds each.
    xgbc.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    y_pred = xgbc.predict_proba(X_val)[:, 1]
    auc_score = roc_auc_score(y_val, y_pred)
    return auc_score
    
# Silence XGBoost's own logger while keeping Optuna's per-trial INFO messages.
xgb.set_config(verbosity=0)
optuna.logging.set_verbosity(optuna.logging.INFO)

# Seeded TPE sampler; the study maximizes the validation AUC returned by the objective.
sampler = TPESampler(seed=42)
xgbc_study = optuna.create_study(sampler=sampler, direction='maximize')
xgbc_study.optimize(XGBCobjective, n_trials=200)

In [None]:
# Best objective value found by the study, then the hyperparameters that produced it.
print(xgbc_study.best_value, xgbc_study.best_params, sep='\n')

In [None]:
# Display the best trial's hyperparameters as the cell output.
xgbc_study.best_params

In [None]:
# Keep the best trial's hyperparameters under a short name for building the final model.
xgb_best_params = xgbc_study.best_params

In [None]:
# Final XGBoost classifier: tuned hyperparameters plus the fixed settings
# (objective, metric, seed, early-stopping patience).
optimal_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    early_stopping_rounds=200,
    random_state=42,
    **xgb_best_params,
)

In [None]:
# Fit the tuned model; the validation split is supplied as eval_set, which is
# what the estimator's early_stopping_rounds setting is evaluated against.
optimal_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)])

In [None]:
# Show the fitted model's best_score (presumably the best eval_set AUC seen during fit -- confirm).
optimal_xgb.best_score

### XGBoost ROC Curve

In [None]:
# Column 1 of predict_proba holds the scores used for the ROC analysis.
xgb_y_pred = optimal_xgb.predict_proba(X_val)[:, 1]

# ROC points on the validation split and the area under that curve.
xgb_fpr, xgb_tpr, xgb_thresholds = roc_curve(y_val, xgb_y_pred)
xgb_auroc = auc(xgb_fpr, xgb_tpr)
print(f"XGBoost AUROC: {xgb_auroc}")

# Model curve, plus the dashed diagonal as the chance-level reference.
plt.plot(xgb_fpr, xgb_tpr)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Axis label previously read 'False PositiveRate' (missing space).
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Assign the return value so the Text object's repr is not echoed by the cell.
_ = plt.title('XGBoost ROC Curve')

In [None]:
# Score the test set with the tuned XGBoost model, keeping column index 1 of
# predict_proba (presumably the positive class -- confirm against classes_).
xgb_predictions = optimal_xgb.predict_proba(test_data)[:, 1]
xgb_predictions

## AutoGluon Model

In [None]:
# Train AutoGluon on the training split (features plus the 'label' column),
# optimizing ROC AUC, and save the fitted predictor under autogluon_model/.
predictor = TabularPredictor(
    label='label',
    path='autogluon_model/',
    eval_metric='roc_auc'
).fit(ag_train, presets='best_quality', num_gpus=1, time_limit=10800)

# predict_proba() has no `tuning_data` argument, so passing one raised a
# TypeError. transform_features is left at its default because raw,
# untransformed frames are passed here. as_multiclass=False yields a single
# positive-class column; .to_numpy() makes it a NumPy array so the later
# `.reshape(-1, 1)` in the submission cell works.
autogluon_predictions = predictor.predict_proba(test_data, as_multiclass=False).to_numpy()

# Validation-set probabilities (positive class only) for evaluation.
ag_y_pred = predictor.predict_proba(X_val, as_multiclass=False)

In [None]:
# Reload the predictor from the same relative directory the fit cell saves to,
# rather than a machine-specific absolute path that only exists on one laptop.
predictor = TabularPredictor.load("autogluon_model/")

In [None]:
# Print AutoGluon's summary of the fit for the loaded predictor.
predictor.fit_summary()

In [None]:
# Positive-class probabilities on the validation split. X_val holds raw
# features, so AutoGluon's feature preprocessing must run: transform_features
# is left at its default instead of being switched off.
ag_y_pred = predictor.predict_proba(X_val, as_multiclass=False)

In [None]:
# predict_proba() returns pandas objects, so NumPy-style `[:, 1]` indexing on
# its result fails. Ask for the single positive-class column directly and
# convert it to a NumPy array so the later `.reshape(-1, 1)` works.
autogluon_predictions = predictor.predict_proba(test_data, as_multiclass=False).to_numpy()

### Submission File

In [None]:
# Build one "Id,Predicted" CSV per model from the test-set probabilities.

# Row ids 0..n-1 as a column vector; n comes from the test frame instead of a
# hard-coded 50000. (The original defined `idx` but then used `ids`.)
ids = np.arange(len(test_data)).reshape(-1, 1)

# np.asarray() also accepts a pandas Series; reshape turns each 1-D vector
# into a single column so it can be stacked next to the ids.
lgbm_predictions = np.asarray(lgbm_predictions).reshape(-1, 1)
xgb_predictions = np.asarray(xgb_predictions).reshape(-1, 1)
autogluon_predictions = np.asarray(autogluon_predictions).reshape(-1, 1)

lgbm_output = np.hstack((ids, lgbm_predictions))
xgb_output = np.hstack((ids, xgb_predictions))
autogluon_output = np.hstack((ids, autogluon_predictions))

# Each file receives its own model's two-column output. Previously the
# LightGBM and XGBoost outputs were written to each other's filenames, and the
# AutoGluon file was written from the bare prediction column without ids.
for fname, output in (
    ('lgbm_submission_optuna.csv', lgbm_output),
    ('xgboost_submission_optuna.csv', xgb_output),
    ('autogluon_submission.csv', autogluon_output),
):
    np.savetxt(
        fname=fname,
        X=output,
        header='Id,Predicted',
        delimiter=',',
        comments='',
        # Ids as plain integers; predictions keep savetxt's default float format.
        fmt=['%d', '%.18e'],
    )
