In [1]:
import numpy as np
import pandas as pd

import joblib
import optuna
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

SEED = 578
np.random.seed(SEED)

In [2]:
train_df = pd.read_csv('../data/preprocess_train.csv', index_col=0)
valid_df = pd.read_csv('../data/preprocess_valid.csv', index_col=0)
test_df = pd.read_csv('../data/preprocess_test.csv', index_col=0)

# optuna hyperparameter optimization

In [3]:
def optimize_hyperparameters(objective, n_trials):
    optuna.logging.set_verbosity(optuna.logging.CRITICAL)
    study = optuna.create_study(direction = "maximize")
    study.optimize(objective, n_trials=n_trials)
    print("Total Trials:", len(study.trials))
    print("Best Score:", study.best_value)
    print("Best Parameters:", study.best_params)
    return study

def plot_study(study):
    optuna.visualization._get_intermediate_plot(study)
    optuna.visualization._get_slice_plot(study)
    optuna.visualization.plot_optimization_history(study)
    optuna.visualization.plot_parallel_coordinate(study)
    optuna.visualization.plot_contour(study)
    optuna.visualization.plot_param_importances(study)

## LightGBM

In [4]:
# def lgbm_objective(trial):
#     hyperparams = {
#         'n_estimators' : trial.suggest_int('n_estimators',500,1000),
#         "max_depth":trial.suggest_int('max_depth',3,50),
#         "learning_rate" : trial.suggest_float('learning_rate',1e-4,0.25,log=True),
#         "min_child_weight" : trial.suggest_float('min_child_weight',0.5,4),
#         "min_child_samples" : trial.suggest_int('min_child_samples',1,250),
#         "subsample" : trial.suggest_float('subsample',0.2,1),
#         "subsample_freq" : trial.suggest_int('subsample_freq',0,5),
#         "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
#         'num_leaves' : trial.suggest_int('num_leaves',2,128),
#     }
    
#     lgbm_model = lgb.LGBMClassifier(**hyperparams, random_state=SEED)

#     # aucs = cross_val_score(lgbm_model, X, y, cv = 5, scoring='roc_auc')
#     # auc_mean = aucs.mean()

#     # print("AUCs:", aucs)
#     # print("AUC Mean:", auc_mean)

#     train_df = pd.read_csv('../data/playground_train.csv', index_col=0)
#     train_folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(train_df, train_df['smoking']))
#     train_fold_aucs = []

#     X = train_df.drop(['smoking'], axis=1)
#     y = train_df['smoking']

#     for train_fold, (train_index, val_index) in enumerate(train_folds):
#         print(f'Fold {train_fold} Training: ...\n')
#         X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]

#         lgbm_model.fit(X_train, y_train)

#         y_pred = lgbm_model.predict_proba(X_val)[:,1]
        
#         train_fold_auc = roc_auc_score(y_val, y_pred)
#         train_fold_aucs.append(train_fold_auc)

#     train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

#     print("AUCs:", train_fold_aucs)
#     print("AUC Mean:", train_fold_auc_mean)

#     return train_fold_auc_mean

In [5]:
lgbm_study = optimize_hyperparameters(lgbm_objective, 10)

Fold 0 Training: ...

[LightGBM] [Info] Number of positive: 55722, number of negative: 71682
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2150
[LightGBM] [Info] Number of data points in the train set: 127404, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.437365 -> initscore=-0.251865
[LightGBM] [Info] Start training from score -0.251865
Fold 1 Training: ...

[LightGBM] [Info] Number of positive: 55722, number of negative: 71683
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2136
[LightGBM] [Info] Number of data points in the train set: 127405, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.437361 -> initscore=-0.251879
[LightGBM] [Info] Start training from score -0.251879
Fold 2 Training: ...

[LightGBM] [Info] Number of positive: 55722, number of negative: 71683
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total

In [6]:
best_lgbm_hyperparams =  {
    'n_estimators': 589, 
    'max_depth': 37, 
    'learning_rate': 0.0337181156999899, 
    'min_child_weight': 0.8208416035206278, 
    'min_child_samples': 216, 
    'subsample': 0.3917309843360768, 
    'subsample_freq': 2, 
    'colsample_bytree': 0.8168392899314203, 
    'num_leaves': 82
}

## XgBoost

In [8]:
def xgb_objective(trial):
    hyperparams = {
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.01, 1.0, step = 0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0, step = 0.1),
        "max_depth": trial.suggest_int("max_depth", 1, 12),
        "n_estimators": trial.suggest_int("n_estimators", 256, 4096),
        "eta": trial.suggest_float("eta", 0.01, 0.5, step = 0.01),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "tree_method": "hist",
    }

    if hyperparams["booster"] == "dart":
        hyperparams["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        hyperparams["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        hyperparams["rate_drop"] = trial.suggest_loguniform("rate_drop", 1e-8, 1.0)
        hyperparams["skip_drop"] = trial.suggest_loguniform("skip_drop", 1e-8, 1.0)

    xgb_model = xgb.XGBClassifier(**hyperparams, random_state=SEED)
 
    train_df = pd.read_csv('../data/playground_train.csv', index_col=0)
    train_folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(train_df, train_df['smoking']))
    train_fold_aucs = []

    X = train_df.drop(['smoking'], axis=1)
    y = train_df['smoking']

    for train_fold, (train_index, val_index) in enumerate(train_folds):
        print(f'Fold {train_fold} Training: ...\n')
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        xgb_model.fit(X_train, y_train)
        
        y_pred = xgb_model.predict_proba(X_val)[:,1]
        
        train_fold_auc = roc_auc_score(y_val, y_pred)
        train_fold_aucs.append(train_fold_auc)

    train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

    print("AUCs:", train_fold_aucs)
    print("AUC Mean:", train_fold_auc_mean)

    return train_fold_auc_mean

In [9]:
xgb_study = optimize_hyperparameters(xgb_objective, 10)

  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
  "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),


Fold 0 Training: ...

Fold 1 Training: ...

Fold 2 Training: ...

Fold 3 Training: ...

Fold 4 Training: ...

AUCs: [0.7611599062589163, 0.7736326402543916, 0.767465782809023, 0.7572242545080902, 0.7701022069701862]
AUC Mean: 0.7659169581601214
Fold 0 Training: ...



  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
  "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),


Fold 1 Training: ...

Fold 2 Training: ...

Fold 3 Training: ...

Fold 4 Training: ...

AUCs: [0.8629763487411397, 0.8655292494674005, 0.8638838212183516, 0.8622809356354741, 0.8609119757596082]
AUC Mean: 0.8631164661643949
Fold 0 Training: ...



  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
  "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),


Fold 1 Training: ...

Fold 2 Training: ...

Fold 3 Training: ...

Fold 4 Training: ...

AUCs: [0.8579095742670222, 0.8581327326261061, 0.8585858887905442, 0.8557561576886481, 0.8557236107598825]
AUC Mean: 0.8572215928264406
Fold 0 Training: ...



  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
  "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),


Fold 1 Training: ...

Fold 2 Training: ...

Fold 3 Training: ...

Fold 4 Training: ...

AUCs: [0.8687105817642911, 0.8716033206069198, 0.8695710587641129, 0.8669118168104226, 0.8654436178436964]
AUC Mean: 0.8684480791578885
Fold 0 Training: ...



  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
  "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
  hyperparams["rate_drop"] = trial.suggest_loguniform("rate_drop", 1e-8, 1.0)
  hyperparams["skip_drop"] = trial.suggest_loguniform("skip_drop", 1e-8, 1.0)


Fold 1 Training: ...

Fold 2 Training: ...



KeyboardInterrupt: 

In [9]:
best_xgb_hyperparams =  {
      # 'n_estimators': 748, 
      # 'max_depth': 46, 
      # 'min_child_weight': 32.69774846647831, 
      # 'learning_rate': 0.0370498003558445, 
      # 'subsample': 0.8695796691153823, 
      # 'gamma': 0.597338372907374, 
      # 'colsample_bytree': 0.6865936901180945, 
      # 'colsample_bylevel': 0.551057156483749, 
      # 'colsample_bynode': 0.30581796867932687
}

## CatBoost

In [None]:
def cb_objective(trial):
    hyperparams = {
        'iterations': trial.suggest_int('iterations',500,750),
        'depth': trial.suggest_int('depth',3,10),
        'learning_rate': trial.suggest_float('learning_rate',1e-4,0.2,log=True),
        'random_strength': trial.suggest_int('random_strength',0,100),
        'bagging_temperature': trial.suggest_float('bagging_temperature',0,1),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg',3,30),
        'border_count': trial.suggest_int('border_count',32,255),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight',0.01,1.0),
    }
    
    catboost_model = cb.CatBoostClassifier(**hyperparams, random_seed=SEED)

    train_df = pd.read_csv('../data/playground_train.csv', index_col=0)
    train_folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(train_df, train_df['smoking']))
    train_fold_aucs = []

    X = train_df.drop(['smoking'], axis=1)
    y = train_df['smoking']
    
    for train_fold, (train_index, val_index) in enumerate(train_folds):
        print(f'Fold {train_fold} Training: ...\n')
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        catboost_model.fit(X_train, y_train)
        
        y_pred = catboost_model.predict_proba(X_val)[:, 1]
        
        train_fold_auc = roc_auc_score(y_val, y_pred)
        train_fold_aucs.append(train_fold_auc)

    train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

    print("AUCs:", train_fold_aucs)
    print("AUC Mean:", train_fold_auc_mean)
    
    return train_fold_auc_mean

In [None]:
cb_study = optimize_hyperparameters(cb_objective, 10)

In [12]:
best_cb_hyperparams =  {
      # 'iterations': 593,
      # 'depth': 4,
      # 'learning_rate': 0.12259905637676717,
      # 'random_strength': 62,
      # 'bagging_temperature': 0.2083824448364907,
      # 'l2_leaf_reg': 26,
      # 'border_count': 142,
      # 'scale_pos_weight': 0.7204495417194038
}

# train

In [13]:
X_train = train_df.drop(['smoking'], axis=1)
y_train = train_df['smoking']

X_valid = valid_df.drop(['smoking'], axis=1)
y_valid = valid_df['smoking']

X_test = test_df

In [14]:
models = [
    lgb.LGBMClassifier(**best_lgbm_hyperparams, random_state=SEED),
    xgb.XGBClassifier(**best_xgb_hyperparams, random_state=SEED),
    cb.CatBoostClassifier(**best_cb_hyperparams, random_state=SEED),
]

num_models = len(models)

In [15]:
num_folds = 5
train_folds = joblib.load(f'../fold/{num_folds}_train_stratifiedkfolds.jl')

In [16]:
preds_train = np.zeros((len(X_train), num_models))
preds_valid = np.zeros((len(X_valid), num_models))
preds_test = np.zeros((len(X_test), num_models))

for train_fold_idx, (t_idx, v_idx) in enumerate(train_folds):
    print(f'Fold {train_fold_idx} Training: ...\n')
    X_t, y_t = X_train.iloc[t_idx], y_train.iloc[t_idx]
    X_v, y_v = X_train.iloc[v_idx], y_train.iloc[v_idx]

    for model_idx, model in enumerate(models):
        print(f'Fold {train_fold_idx} Model {model_idx} Fitting: ...\n')
        model.fit(X_t, y_t)

        print(f'Fold {train_fold_idx} Model {model_idx} Predicting: ...\n')
        pred_train = model.predict_proba(X_v)[:, 1]
        pred_valid = model.predict_proba(X_valid)[:, 1]
        pred_test = model.predict_proba(X_test)[:, 1]

        preds_train[v_idx, model_idx] = pred_train
        preds_valid[:, model_idx] += pred_valid / num_folds
        preds_test[:, model_idx] += pred_test / num_folds

np.save('../prediction/preds_train.npy', preds_train)
np.save('../prediction/preds_valid.npy', preds_valid)
np.save('../prediction/preds_test.npy', preds_test)

Fold 0 Training: ...

Fold 0 Model 0 Fitting: ...

[LightGBM] [Info] Number of positive: 39703, number of negative: 51072
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2089
[LightGBM] [Info] Number of data points in the train set: 90775, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.437378 -> initscore=-0.251810
[LightGBM] [Info] Start training from score -0.251810
Fold 0 Model 0 Predicting: ...

Fold 0 Model 1 Fitting: ...

Fold 0 Model 1 Predicting: ...

Fold 0 Model 2 Fitting: ...

Learning rate set to 0.070629
0:	learn: 0.6577475	total: 64.6ms	remaining: 1m 4s
1:	learn: 0.6286833	total: 71.8ms	remaining: 35.8s
2:	learn: 0.6059112	total: 78.9ms	remaining: 26.2s
3:	learn: 0.5867026	total: 85.6ms	remaining: 21.3s
4:	learn: 0.5702969	total: 92.8ms	remaining: 18.5s
5:	learn: 0.5570622	total: 99.8ms	remaining: 16.5s
6:	learn: 0.5456449	total: 107ms	rema

In [17]:
for model_idx in range(num_models):
    print(f'Model {model_idx} train AUC: = {roc_auc_score(y_train, preds_train[:, model_idx]):.6f}\tvalid AUC:  = {roc_auc_score(y_valid, preds_valid[:, model_idx]):.6f}')

Model 0 train AUC: = 0.861926	valid AUC:  = 0.867738
Model 1 train AUC: = 0.860660	valid AUC:  = 0.871544
Model 2 train AUC: = 0.864935	valid AUC:  = 0.871305


In [18]:
# preds_train = np.load('../prediction/preds_train.npy')
# preds_valid = np.load('../prediction/preds_valid.npy')
# preds_test = np.load('../prediction/preds_test.npy')

## ensemble

In [19]:
VALID_RESULTS = {'Dataset': 'valid'}

### best model

In [20]:
best_valid_score = float('-inf')
best_model_idx = None

for model_idx in range(num_models):
    valid_score = roc_auc_score(y_valid, preds_valid[:, model_idx])
    if valid_score > best_valid_score:
        best_valid_score = valid_score
        best_model_idx = model_idx

print(f'Best Model Index: {best_model_idx}')
VALID_RESULTS['Best Model'] = roc_auc_score(y_valid, preds_valid[:, best_model_idx])

Best Model Index: 1


In [21]:
VALID_RESULTS['Best Model'] = roc_auc_score(y_valid, preds_valid[:, best_model_idx])

### average

In [22]:
VALID_RESULTS['Average'] = roc_auc_score(y_valid, preds_valid.mean(axis = 1))

### weighted average

In [23]:
model_weights = [roc_auc_score(y_valid, preds_valid[:, model_idx]) for model_idx in range(num_models)]
normalized_model_weights = model_weights / np.sum(model_weights)

weighted_average_preds_train = np.average(preds_train, weights=normalized_model_weights, axis=1)
weighted_average_preds_valid = np.average(preds_valid, weights=normalized_model_weights, axis=1)

VALID_RESULTS['Weighted Average'] = roc_auc_score(y_valid, weighted_average_preds_valid)

### stacking

In [24]:
stacking_model = models[1]

stacking_model.fit(preds_valid, y_valid)
stacking_preds_valid = stacking_model.predict_proba(preds_valid)[:, 1]

VALID_RESULTS['Stacking'] = roc_auc_score(y_valid, stacking_preds_valid)

## result

In [25]:
pd.DataFrame.from_records([VALID_RESULTS])

Unnamed: 0,Dataset,Best Model,Average,Weighted Average,Stacking
0,valid,0.871544,0.871287,0.87129,0.891665


# predict

## ensemble

### stacking

In [26]:
stacking_preds_test = stacking_model.predict_proba(preds_test)[:, 1]

# submit

In [27]:
submission_df = pd.read_csv('../submission/sample_submission.csv')
submission_df['smoking'] = stacking_preds_test
submission_df.to_csv(f'../submission/submission.csv', index = False)

# pseudo-labels

In [28]:
# def make_pseudo_set(cutoff):
#     test_df['pred'] = y_pred

#     pseudo_set_1 = test_df[test_df['pred'] > cutoff]
#     pseudo_set_1['smoking'] = 1
#     pseudo_set_1.drop(['pred'], axis=1, inplace=True)

#     pseudo_set_2 = test_df[test_df['pred'] < 1-cutoff]
#     pseudo_set_2['smoking'] = 0
#     pseudo_set_2.drop(['pred'], axis=1, inplace=True)

#     pseudo_df = pd.concat([pseudo_set_1,pseudo_set_2])

#     return pseudo_df

In [29]:
# pseudo_df = make_pseudo_set(0.95)
# pseudo_train_df = pd.concat([train_df, pseudo_df])

# X_pseudo_train = pseudo_train_df.drop(['smoking'], axis=1)
# y_pseudo_train = pseudo_train_df['smoking']
 
# pseudo_model = xgb.XGBClassifier(**best_xgb_hyperparams)
# pseudo_model.fit(X_pseudo_train, y_pseudo_train)

# submission_df = pd.read_csv('../submission/sample_submission.csv')
# submission_df['smoking'] =  pseudo_model.predict_proba(test_df.drop(['smoking'], axis=1))[:,1]
# submission_df.to_csv('../submission/pseudo_xgb_submission.csv', index=False)