In [1]:
import numpy as np
import pandas as pd

import joblib
import optuna
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

SEED = 578
np.random.seed(SEED)

In [2]:
train_df = pd.read_csv('../data/preprocess_train.csv', index_col=0)
valid_df = pd.read_csv('../data/preprocess_valid.csv', index_col=0)
test_df = pd.read_csv('../data/preprocess_test.csv', index_col=0)

# optuna hyperparameter optimization

In [3]:
# def optimize_hyperparameters(objective, n_trials):
#     optuna.logging.set_verbosity(optuna.logging.CRITICAL)
#     study = optuna.create_study(direction = "maximize")
#     study.optimize(objective, n_trials=n_trials)
#     print("Total Trials:", len(study.trials))
#     print("Best Score:", study.best_value)
#     print("Best Parameters:", study.best_params)
#     return study

# def plot_study(study):
#     optuna.visualization._get_intermediate_plot(study)
#     optuna.visualization._get_slice_plot(study)
#     optuna.visualization.plot_optimization_history(study)
#     optuna.visualization.plot_parallel_coordinate(study)
#     optuna.visualization.plot_contour(study)
#     optuna.visualization.plot_param_importances(study)

## LightGBM

In [4]:
# def lgbm_objective(trial):
#     hyperparams = {
#         'n_estimators' : trial.suggest_int('n_estimators',500,1000),
#         "max_depth":trial.suggest_int('max_depth',3,50),
#         "learning_rate" : trial.suggest_float('learning_rate',1e-4,0.25,log=True),
#         "min_child_weight" : trial.suggest_float('min_child_weight',0.5,4),
#         "min_child_samples" : trial.suggest_int('min_child_samples',1,250),
#         "subsample" : trial.suggest_float('subsample',0.2,1),
#         "subsample_freq" : trial.suggest_int('subsample_freq',0,5),
#         "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
#         'num_leaves' : trial.suggest_int('num_leaves',2,128),
#     }
    
#     lgbm_model = lgb.LGBMClassifier(**hyperparams, random_state=SEED)

#     # aucs = cross_val_score(lgbm_model, X, y, cv = 5, scoring='roc_auc')
#     # auc_mean = aucs.mean()

#     # print("AUCs:", aucs)
#     # print("AUC Mean:", auc_mean)

#     train_df = pd.read_csv('../data/playground_train.csv', index_col=0)
#     train_folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(train_df, train_df['smoking']))
#     train_fold_aucs = []

#     X = train_df.drop(['smoking'], axis=1)
#     y = train_df['smoking']

#     for train_fold, (train_index, val_index) in enumerate(train_folds):
#         print(f'Fold {train_fold} Training: ...\n')
#         X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]

#         lgbm_model.fit(X_train, y_train)

#         y_pred = lgbm_model.predict_proba(X_val)[:,1]
        
#         train_fold_auc = roc_auc_score(y_val, y_pred)
#         train_fold_aucs.append(train_fold_auc)

#     train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

#     print("AUCs:", train_fold_aucs)
#     print("AUC Mean:", train_fold_auc_mean)

#     return train_fold_auc_mean

# lgbm_study = optimize_hyperparameters(lgbm_objective, 10)

In [5]:
best_lgbm_hyperparams =  {
    # 'n_estimators': 589, 
    # 'max_depth': 37, 
    # 'learning_rate': 0.0337181156999899, 
    # 'min_child_weight': 0.8208416035206278, 
    # 'min_child_samples': 216, 
    # 'subsample': 0.3917309843360768, 
    # 'subsample_freq': 2, 
    # 'colsample_bytree': 0.8168392899314203, 
    # 'num_leaves': 82
}

## XgBoost

In [6]:
# def xgb_objective(trial):
#     hyperparams = {
#         "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
#         "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
#         "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
#         "subsample": trial.suggest_float("subsample", 0.01, 1.0, step = 0.1),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0, step = 0.1),
#         "max_depth": trial.suggest_int("max_depth", 1, 12),
#         "n_estimators": trial.suggest_int("n_estimators", 256, 4096),
#         "eta": trial.suggest_float("eta", 0.01, 0.5, step = 0.01),
#         "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
#         "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
#         "tree_method": "hist",
#     }

#     if hyperparams["booster"] == "dart":
#         hyperparams["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
#         hyperparams["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
#         hyperparams["rate_drop"] = trial.suggest_loguniform("rate_drop", 1e-8, 1.0)
#         hyperparams["skip_drop"] = trial.suggest_loguniform("skip_drop", 1e-8, 1.0)

#     xgb_model = xgb.XGBClassifier(**hyperparams, random_state=SEED)
 
#     train_df = pd.read_csv('../data/playground_train.csv', index_col=0)
#     train_folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(train_df, train_df['smoking']))
#     train_fold_aucs = []

#     X = train_df.drop(['smoking'], axis=1)
#     y = train_df['smoking']

#     for train_fold, (train_index, val_index) in enumerate(train_folds):
#         print(f'Fold {train_fold} Training: ...\n')
#         X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]

#         xgb_model.fit(X_train, y_train)
        
#         y_pred = xgb_model.predict_proba(X_val)[:,1]
        
#         train_fold_auc = roc_auc_score(y_val, y_pred)
#         train_fold_aucs.append(train_fold_auc)

#     train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

#     print("AUCs:", train_fold_aucs)
#     print("AUC Mean:", train_fold_auc_mean)

#     return train_fold_auc_mean

# xgb_study = optimize_hyperparameters(xgb_objective, 10)

In [7]:
best_xgb_hyperparams =  {
      # 'booster': 'gbtree', 
      # 'lambda': 2.7605232454083657e-07, 
      # 'alpha': 8.808575699430626e-07, 
      # 'subsample': 0.11, 
      # 'colsample_bytree': 0.6, 
      # 'max_depth': 4, 
      # 'n_estimators': 2152, 
      # 'eta': 0.060000000000000005, 
      # 'gamma': 1.4797859540923306e-08, 
      # 'grow_policy': 'depthwise'
}

## CatBoost

In [8]:
# def cb_objective(trial):
#     hyperparams = {
#         'iterations': trial.suggest_int('iterations',500,750),
#         'depth': trial.suggest_int('depth',3,10),
#         'learning_rate': trial.suggest_float('learning_rate',1e-4,0.2,log=True),
#         'random_strength': trial.suggest_int('random_strength',0,100),
#         'bagging_temperature': trial.suggest_float('bagging_temperature',0,1),
#         'l2_leaf_reg': trial.suggest_int('l2_leaf_reg',3,30),
#         'border_count': trial.suggest_int('border_count',32,255),
#         'scale_pos_weight': trial.suggest_float('scale_pos_weight',0.01,1.0),
#     }
    
#     catboost_model = cb.CatBoostClassifier(**hyperparams, random_seed=SEED)

#     train_df = pd.read_csv('../data/playground_train.csv', index_col=0)
#     train_folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(train_df, train_df['smoking']))
#     train_fold_aucs = []

#     X = train_df.drop(['smoking'], axis=1)
#     y = train_df['smoking']
    
#     for train_fold, (train_index, val_index) in enumerate(train_folds):
#         print(f'Fold {train_fold} Training: ...\n')
#         X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]

#         catboost_model.fit(X_train, y_train)
        
#         y_pred = catboost_model.predict_proba(X_val)[:, 1]
        
#         train_fold_auc = roc_auc_score(y_val, y_pred)
#         train_fold_aucs.append(train_fold_auc)

#     train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

#     print("AUCs:", train_fold_aucs)
#     print("AUC Mean:", train_fold_auc_mean)
    
#     return train_fold_auc_mean

# cb_study = optimize_hyperparameters(cb_objective, 10)

In [9]:
best_cb_hyperparams = {
    # 'iterations': 691, 
    # 'depth': 6, 
    # 'learning_rate': 0.06417498314463302, 
    # 'random_strength': 91, 
    # 'bagging_temperature': 0.604989772339542, 
    # 'l2_leaf_reg': 22, 
    # 'border_count': 239, 
    # 'scale_pos_weight': 0.6770816654162939
}

# train

In [10]:
X_train = train_df.drop(['smoking'], axis=1)
y_train = train_df['smoking']

X_valid = valid_df.drop(['smoking'], axis=1)
y_valid = valid_df['smoking']

X_test = test_df

In [11]:
models = [
    lgb.LGBMClassifier(**best_lgbm_hyperparams, random_state=SEED),
    xgb.XGBClassifier(**best_xgb_hyperparams, random_state=SEED),
    cb.CatBoostClassifier(**best_cb_hyperparams, random_state=SEED),
]

num_models = len(models)

In [12]:
num_folds = 20
train_folds = joblib.load(f'../fold/{num_folds}_train_stratifiedkfolds.jl')

In [13]:
preds_train = np.zeros((len(X_train), num_models))
preds_valid = np.zeros((len(X_valid), num_models))
preds_test = np.zeros((len(X_test), num_models))

for train_fold_idx, (t_idx, v_idx) in enumerate(train_folds):
    print(f'Fold {train_fold_idx} Training: ...\n')
    X_t, y_t = X_train.iloc[t_idx], y_train.iloc[t_idx]
    X_v, y_v = X_train.iloc[v_idx], y_train.iloc[v_idx]

    for model_idx, model in enumerate(models):
        print(f'Fold {train_fold_idx} Model {model_idx} Fitting: ...\n')
        model.fit(X_t, y_t)

        print(f'Fold {train_fold_idx} Model {model_idx} Predicting: ...\n')
        pred_train = model.predict_proba(X_v)[:, 1]
        pred_valid = model.predict_proba(X_valid)[:, 1]
        pred_test = model.predict_proba(X_test)[:, 1]

        preds_train[v_idx, model_idx] = pred_train
        preds_valid[:, model_idx] += pred_valid / num_folds
        preds_test[:, model_idx] += pred_test / num_folds

np.save('../prediction/preds_train.npy', preds_train)
np.save('../prediction/preds_valid.npy', preds_valid)
np.save('../prediction/preds_test.npy', preds_test)

for model_idx in range(num_models):
    print(f'model index = {model_idx}\ttrain AUC = {roc_auc_score(y_train, preds_train[:, model_idx]):.6f}\tvalid AUC = {roc_auc_score(y_valid, preds_valid[:, model_idx]):.6f}')

Fold 0 Training: ...

Fold 0 Model 0 Fitting: ...

[LightGBM] [Info] Number of positive: 46688, number of negative: 63163
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1832
[LightGBM] [Info] Number of data points in the train set: 109851, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.425012 -> initscore=-0.302232
[LightGBM] [Info] Start training from score -0.302232
Fold 0 Model 0 Predicting: ...

Fold 0 Model 1 Fitting: ...

Fold 0 Model 1 Predicting: ...

Fold 0 Model 2 Fitting: ...

Learning rate set to 0.076623
0:	learn: 0.6528959	total: 66.1ms	remaining: 1m 5s
1:	learn: 0.6224947	total: 75.6ms	remaining: 37.7s
2:	learn: 0.5966978	total: 84.5ms	remaining: 28.1s
3:	learn: 0.5777581	total: 94ms	remaining: 23.4s
4:	learn: 0.5598167	total: 102ms	remaining: 20.4s
5:	learn: 0.5466419	total: 112ms	remaining: 18.5s
6:	learn: 0.5359080	total: 121ms	remaini

In [14]:
preds_train = np.load('../prediction/preds_train.npy')
preds_valid = np.load('../prediction/preds_valid.npy')
preds_test = np.load('../prediction/preds_test.npy')

## ensemble

In [15]:
TRAIN_RESULTS = {'Dataset': 'train'}
VALID_RESULTS = {'Dataset': 'valid'}

### best model

In [16]:
best_valid_score = float('-inf')
best_model_idx = None

for model_idx in range(num_models):
    valid_score = roc_auc_score(y_valid, preds_valid[:, model_idx])
    if valid_score > best_valid_score:
        best_valid_score = valid_score
        best_model_idx = model_idx

print(f'Best Model Index: {best_model_idx}')
TRAIN_RESULTS['Best Model'] = roc_auc_score(y_train, preds_train[:, best_model_idx])
VALID_RESULTS['Best Model'] = roc_auc_score(y_valid, preds_valid[:, best_model_idx])

Best Model Index: 1


### average

In [17]:
TRAIN_RESULTS['Average'] = roc_auc_score(y_train, preds_train.mean(axis = 1))
VALID_RESULTS['Average'] = roc_auc_score(y_valid, preds_valid.mean(axis = 1))

### weighted average

In [18]:
model_weights = [roc_auc_score(y_valid, preds_valid[:, model_idx]) for model_idx in range(num_models)]
normalized_model_weights = model_weights / np.sum(model_weights)

weighted_average_preds_train = np.average(preds_train, weights=normalized_model_weights, axis=1)
weighted_average_preds_valid = np.average(preds_valid, weights=normalized_model_weights, axis=1)

TRAIN_RESULTS['Weighted Average'] = roc_auc_score(y_train, weighted_average_preds_train)
VALID_RESULTS['Weighted Average'] = roc_auc_score(y_valid, weighted_average_preds_valid)

### stacking

In [19]:
stacking_model_idx = 1
stacking_model = models[stacking_model_idx]

In [20]:
stacking_model.fit(preds_valid, y_valid)
stacking_preds_valid = stacking_model.predict_proba(preds_valid)[:, 1]
stacking_preds_test = stacking_model.predict_proba(preds_test)[:, 1]

TRAIN_RESULTS['Stacking_V1'] = 0.0
VALID_RESULTS['Stacking_V1'] = roc_auc_score(y_valid, stacking_preds_valid)

In [21]:
num_folds
stacking_folds = list(StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=SEED).split(train_df, train_df['smoking']))

X_stacking_train = np.hstack((X_train, preds_train))
X_stacking_valid = np.hstack((X_valid, preds_valid))
X_stacking_test = np.hstack((X_test, preds_test))

stacking_preds_stacking_train = np.zeros(len(X_stacking_train))
stacking_preds_stacking_valid = np.zeros(len(X_stacking_valid))
stacking_preds_stacking_test = np.zeros(len(X_stacking_test))

for stacking_train_fold_idx, (stacking_t_index, stacking_v_index) in enumerate(stacking_folds):
    print(f'Fold {stacking_train_fold_idx} Training: ...\n')
    X_stacking_t, y_stacking_t = X_stacking_train[stacking_t_index, :], y_train.iloc[stacking_t_index]
    X_stacking_v, y_stacking_v = X_stacking_train[stacking_v_index, :], y_train.iloc[stacking_v_index]

    print(f'Fold {stacking_train_fold_idx} Stacking Model Fitting: ...\n')
    stacking_model.fit(X_stacking_t, y_stacking_t)

    print(f'Fold {stacking_train_fold_idx} Stacking Model Predicting: ...\n')
    pred_stacking_train = stacking_model.predict_proba(X_stacking_v)[:, 1]
    pred_stacking_valid = stacking_model.predict_proba(X_stacking_valid)[:, 1]
    pred_stacking_test = stacking_model.predict_proba(X_stacking_test)[:, 1]

    stacking_preds_stacking_train[stacking_v_index] = pred_stacking_train
    stacking_preds_stacking_valid += pred_stacking_valid / num_folds
    stacking_preds_stacking_test += pred_stacking_test / num_folds

TRAIN_RESULTS['Stacking_V2'] = roc_auc_score(y_train, stacking_preds_stacking_train)
VALID_RESULTS['Stacking_V2'] = roc_auc_score(y_valid, stacking_preds_stacking_valid)

Fold 0 Training: ...

Fold 0 Stacking Model Fitting: ...

Fold 0 Stacking Model Predicting: ...

Fold 1 Training: ...

Fold 1 Stacking Model Fitting: ...

Fold 1 Stacking Model Predicting: ...

Fold 2 Training: ...

Fold 2 Stacking Model Fitting: ...

Fold 2 Stacking Model Predicting: ...

Fold 3 Training: ...

Fold 3 Stacking Model Fitting: ...

Fold 3 Stacking Model Predicting: ...

Fold 4 Training: ...

Fold 4 Stacking Model Fitting: ...

Fold 4 Stacking Model Predicting: ...

Fold 5 Training: ...

Fold 5 Stacking Model Fitting: ...

Fold 5 Stacking Model Predicting: ...

Fold 6 Training: ...

Fold 6 Stacking Model Fitting: ...

Fold 6 Stacking Model Predicting: ...

Fold 7 Training: ...

Fold 7 Stacking Model Fitting: ...

Fold 7 Stacking Model Predicting: ...

Fold 8 Training: ...

Fold 8 Stacking Model Fitting: ...

Fold 8 Stacking Model Predicting: ...

Fold 9 Training: ...

Fold 9 Stacking Model Fitting: ...

Fold 9 Stacking Model Predicting: ...

Fold 10 Training: ...

Fold 10

## result

In [22]:
pd.DataFrame.from_records([TRAIN_RESULTS, VALID_RESULTS])

Unnamed: 0,Dataset,Best Model,Average,Weighted Average,Stacking_V1,Stacking_V2
0,train,0.861048,0.865604,0.865604,0.0,0.859889
1,valid,0.866161,0.8657,0.865703,0.884639,0.86628


# submit

In [23]:
submission_df = pd.read_csv('../submission/sample_submission.csv')
submission_df['smoking'] = stacking_preds_stacking_test # stacking_preds_test 
submission_df.to_csv(f'../submission/submission.csv', index = False)

# pseudo-labels

In [24]:
# def make_pseudo_set(cutoff):
#     test_df['pred'] = y_pred

#     pseudo_set_1 = test_df[test_df['pred'] > cutoff]
#     pseudo_set_1['smoking'] = 1
#     pseudo_set_1.drop(['pred'], axis=1, inplace=True)

#     pseudo_set_2 = test_df[test_df['pred'] < 1-cutoff]
#     pseudo_set_2['smoking'] = 0
#     pseudo_set_2.drop(['pred'], axis=1, inplace=True)

#     pseudo_df = pd.concat([pseudo_set_1,pseudo_set_2])

#     return pseudo_df

In [25]:
# pseudo_df = make_pseudo_set(0.95)
# pseudo_train_df = pd.concat([train_df, pseudo_df])

# X_pseudo_train = pseudo_train_df.drop(['smoking'], axis=1)
# y_pseudo_train = pseudo_train_df['smoking']
 
# pseudo_model = xgb.XGBClassifier(**best_xgb_hyperparams)
# pseudo_model.fit(X_pseudo_train, y_pseudo_train)

# submission_df = pd.read_csv('../submission/sample_submission.csv')
# submission_df['smoking'] =  pseudo_model.predict_proba(test_df.drop(['smoking'], axis=1))[:,1]
# submission_df.to_csv('../submission/pseudo_xgb_submission.csv', index=False)