In [1]:
import numpy as np
import pandas as pd

import joblib
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

SEED = 578
np.random.seed(SEED)

In [2]:
train_df = pd.read_csv('../data/preprocess_train.csv', index_col=0)
valid_df = pd.read_csv('../data/preprocess_valid.csv', index_col=0)
test_df = pd.read_csv('../data/preprocess_test.csv', index_col=0)

# optuna hyperparameter optimization

In [3]:
# def optimize_hyperparameters(objective, n_trials):
#     optuna.logging.set_verbosity(optuna.logging.CRITICAL)
#     study = optuna.create_study(direction = "maximize")
#     study.optimize(objective, n_trials=n_trials)
#     print("Total Trials:", len(study.trials))
#     print("Best Score:", study.best_value)
#     print("Best Parameters:", study.best_params)
#     return study

# def plot_study(study):
#     optuna.visualization._get_intermediate_plot(study)
#     optuna.visualization._get_slice_plot(study)
#     optuna.visualization.plot_optimization_history(study)
#     optuna.visualization.plot_parallel_coordinate(study)
#     optuna.visualization.plot_contour(study)
#     optuna.visualization.plot_param_importances(study)

## LightGBM

In [4]:
# def lgbm_objective(trial):
#     hyperparams = {
#         'n_estimators' : trial.suggest_int('n_estimators',500,1000),
#         "max_depth":trial.suggest_int('max_depth',3,50),
#         "learning_rate" : trial.suggest_float('learning_rate',1e-4,0.25,log=True),
#         "min_child_weight" : trial.suggest_float('min_child_weight',0.5,4),
#         "min_child_samples" : trial.suggest_int('min_child_samples',1,250),
#         "subsample" : trial.suggest_float('subsample',0.2,1),
#         "subsample_freq" : trial.suggest_int('subsample_freq',0,5),
#         "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
#         'num_leaves' : trial.suggest_int('num_leaves',2,128),
#     }
    
#     X = train_df.drop(['smoking'], axis=1)
#     y = train_df['smoking']

#     lgbm_model = lgb.LGBMClassifier(**hyperparams, random_state=SEED)

#     # aucs = cross_val_score(lgbm_model, X, y, cv = 5, scoring='roc_auc')
#     # auc_mean = aucs.mean()

#     # print("AUCs:", aucs)
#     # print("AUC Mean:", auc_mean)

#     train_folds = joblib.load('../fold/10_train_stratifiedkfolds.jl')
#     train_fold_aucs = []

#     for train_fold, (train_index, val_index) in enumerate(train_folds):
#         print(f'Fold {train_fold} Training: ...\n')
#         X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]

#         lgbm_model.fit(X_train, y_train)

#         y_pred = lgbm_model.predict_proba(X_val)[:,1]
        
#         train_fold_auc = roc_auc_score(y_val, y_pred)
#         train_fold_aucs.append(train_fold_auc)

#     train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

#     print("AUCs:", train_fold_aucs)
#     print("AUC Mean:", train_fold_auc_mean)

#     return train_fold_auc_mean

In [5]:
# lgbm_study = optimize_hyperparameters(lgbm_objective, 10)

In [6]:
best_lgbm_hyperparams =  {
      'n_estimators': 555,
      'max_depth': 43,
      'learning_rate': 0.007268707576420426, 
      'min_child_weight': 3.12991455545269, 
      'min_child_samples': 159, 
      'subsample': 0.6259017376430374, 
      'subsample_freq': 2, 
      'colsample_bytree': 0.4519524559914368, 
      'num_leaves': 68
}

## XgBoost

In [7]:
# def xgb_objective(trial):
#     hyperparams = {
#         'n_estimators' : trial.suggest_int('n_estimators',500,750),
#         'max_depth':  trial.suggest_int('max_depth',3,50),
#         'min_child_weight': trial.suggest_float('min_child_weight',2,50),
#         "learning_rate" : trial.suggest_float('learning_rate',1e-4,0.2,log=True),
#         'subsample': trial.suggest_float('subsample',0.2,1),
#         'gamma': trial.suggest_float("gamma",1e-4,1.0),
#         "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
#         "colsample_bylevel" : trial.suggest_float('colsample_bylevel',0.2,1),
#         "colsample_bynode" : trial.suggest_float('colsample_bynode',0.2,1),
#     }

#     X = train_df.drop(['smoking'], axis=1)
#     y = train_df['smoking']

#     xgb_model = xgb.XGBClassifier(**hyperparams, random_state=SEED)
 
#     train_folds = joblib.load('../fold/10_train_stratifiedkfolds.jl')
#     train_fold_aucs = []

#     for train_fold, (train_index, val_index) in enumerate(train_folds):
#         print(f'Fold {train_fold} Training: ...\n')
#         X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]

#         xgb_model.fit(X_train, y_train)
        
#         y_pred = xgb_model.predict_proba(X_val)[:,1]
        
#         train_fold_auc = roc_auc_score(y_val, y_pred)
#         train_fold_aucs.append(train_fold_auc)

#     train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

#     print("AUCs:", train_fold_aucs)
#     print("AUC Mean:", train_fold_auc_mean)

#     return train_fold_auc_mean

In [8]:
# xgb_study = optimize_hyperparameters(xgb_objective, 10)

In [9]:
best_xgb_hyperparams =  {
      'n_estimators': 748, 
      'max_depth': 46, 
      'min_child_weight': 32.69774846647831, 
      'learning_rate': 0.0370498003558445, 
      'subsample': 0.8695796691153823, 
      'gamma': 0.597338372907374, 
      'colsample_bytree': 0.6865936901180945, 
      'colsample_bylevel': 0.551057156483749, 
      'colsample_bynode': 0.30581796867932687
}

## CatBoost

In [10]:
# def cb_objective(trial):
#     hyperparams = {
#         'iterations': trial.suggest_int('iterations',500,750),
#         'depth': trial.suggest_int('depth',3,10),
#         'learning_rate': trial.suggest_float('learning_rate',1e-4,0.2,log=True),
#         'random_strength': trial.suggest_int('random_strength',0,100),
#         'bagging_temperature': trial.suggest_float('bagging_temperature',0,1),
#         'l2_leaf_reg': trial.suggest_int('l2_leaf_reg',3,30),
#         'border_count': trial.suggest_int('border_count',32,255),
#         'scale_pos_weight': trial.suggest_float('scale_pos_weight',0.01,1.0),
#     }
    
#     X = train_df.drop(['smoking'], axis=1)
#     y = train_df['smoking']
    
#     catboost_model = cb.CatBoostClassifier(**hyperparams, random_seed=SEED)

#     train_folds = joblib.load('../fold/10_train_stratifiedkfolds.jl')
#     train_fold_aucs = []
    
#     for train_fold, (train_index, val_index) in enumerate(train_folds):
#         print(f'Fold {train_fold} Training: ...\n')
#         X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]

#         catboost_model.fit(X_train, y_train)
        
#         y_pred = catboost_model.predict_proba(X_val)[:, 1]
        
#         train_fold_auc = roc_auc_score(y_val, y_pred)
#         train_fold_aucs.append(train_fold_auc)

#     train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

#     print("AUCs:", train_fold_aucs)
#     print("AUC Mean:", train_fold_auc_mean)
    
#     return train_fold_auc_mean

In [11]:
# cb_study = optimize_hyperparameters(cb_objective, 10)

In [12]:
best_cb_hyperparams =  {
      'iterations': 593,
      'depth': 4,
      'learning_rate': 0.12259905637676717,
      'random_strength': 62,
      'bagging_temperature': 0.2083824448364907,
      'l2_leaf_reg': 26,
      'border_count': 142,
      'scale_pos_weight': 0.7204495417194038
}

# train

In [13]:
X_train = train_df.drop(['smoking'], axis=1)
y_train = train_df['smoking']

X_valid = valid_df.drop(['smoking'], axis=1)
y_valid = valid_df['smoking']

X_test = test_df

In [14]:
models = [
    lgb.LGBMClassifier(**best_lgbm_hyperparams, random_state=SEED),
    xgb.XGBClassifier(**best_xgb_hyperparams, random_state=SEED),
    cb.CatBoostClassifier(**best_cb_hyperparams, random_state=SEED),
]

num_models = len(models)

In [15]:
train_folds = joblib.load('../fold/10_train_stratifiedkfolds.jl')

num_folds = 10

In [16]:
# preds_train = np.zeros((len(X_train), num_models))
# preds_valid = np.zeros((len(X_valid), num_models))
# preds_test = np.zeros((len(X_test), num_models))

# for train_fold_idx, (t_idx, v_idx) in enumerate(train_folds):
#     print(f'Fold {train_fold_idx} Training: ...\n')
#     X_t, y_t = X_train.iloc[t_idx], y_train.iloc[t_idx]
#     X_v, y_v = X_train.iloc[v_idx], y_train.iloc[v_idx]

#     for model_idx, model in enumerate(models):
#         print(f'Model {model_idx} Fitting: ...\n')
#         model.fit(X_t, y_t)

#         print(f'Model {model_idx} Predicting: ...\n')
#         pred_train = model.predict_proba(X_v)[:, 1]
#         pred_valid = model.predict_proba(X_valid)[:, 1]
#         pred_test = model.predict_proba(X_test)[:, 1]

#         preds_train[v_idx, model_idx] = pred_train
#         preds_valid[:, model_idx] += pred_valid / num_folds
#         preds_test[:, model_idx] += pred_test / num_folds

# np.save('../prediction/preds_train.npy', preds_train)
# np.save('../prediction/preds_valid.npy', preds_valid)
# np.save('../prediction/preds_test.npy', preds_test)

In [18]:
# for model_idx in range(num_models):
#     print(f'Model {model_idx} train AUC: = {roc_auc_score(y_train, preds_train[:, model_idx])}\tvalid AUC:  = {roc_auc_score(y_valid, preds_valid[:, model_idx])}')

In [64]:
preds_train = np.load('../prediction/preds_train.npy')
preds_valid = np.load('../prediction/preds_valid.npy')
preds_test = np.load('../prediction/preds_test.npy')

## ensemble

In [65]:
VALID_RESULTS = {'Dataset': 'valid'}

### best model

In [66]:
best_valid_score = float('-inf')
best_model_idx = None

for model_idx in range(num_models):
    valid_score = roc_auc_score(y_valid, preds_valid[:, model_idx])
    if valid_score > best_valid_score:
        best_valid_score = valid_score
        best_model_idx = model_idx

print(f'Best Model Index: {best_model_idx}')
VALID_RESULTS['Best Model'] = roc_auc_score(y_valid, preds_valid[:, best_model_idx])

Best Model Index: 0


In [67]:
VALID_RESULTS['Best Model'] = roc_auc_score(y_valid, preds_valid[:, best_model_idx])

### average

In [68]:
VALID_RESULTS['Average'] = roc_auc_score(y_valid, preds_valid.mean(axis = 1))

### weighted average

In [69]:
model_weights = [roc_auc_score(y_valid, preds_valid[:, model_idx]) for model_idx in range(num_models)]
normalized_model_weights = model_weights / np.sum(model_weights)

weighted_average_preds_train = np.average(preds_train, weights=normalized_model_weights, axis=1)
weighted_average_preds_valid = np.average(preds_valid, weights=normalized_model_weights, axis=1)

VALID_RESULTS['Weighted Average'] = roc_auc_score(y_valid, weighted_average_preds_valid)

### stacking

In [70]:
stacking_model = lgb.LGBMClassifier(**best_lgbm_hyperparams, random_state=SEED)

stacking_model.fit(preds_valid, y_valid)
stacking_preds_valid = stacking_model.predict_proba(preds_valid)[:, 1]

VALID_RESULTS['Stacking'] = roc_auc_score(y_valid, stacking_preds_valid)

[LightGBM] [Info] Number of positive: 33588, number of negative: 45708
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001306 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 79296, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.423577 -> initscore=-0.308104
[LightGBM] [Info] Start training from score -0.308104


## result

In [71]:
pd.DataFrame.from_records([VALID_RESULTS])

Unnamed: 0,Dataset,Best Model,Average,Weighted Average,Stacking
0,valid,0.845448,0.843696,0.843708,0.846987


# predict

## ensemble

### best model

In [78]:
stacking_preds_test = preds_test[:, 0]

### stacking

In [72]:
stacking_preds_test = stacking_model.predict_proba(preds_test)[:, 1]

# submit

In [79]:
submission_df = pd.read_csv('../submission/sample_submission.csv')
submission_df['smoking'] = stacking_preds_test
submission_df.to_csv(f'../submission/submission.csv', index = False)

# pseudo-labels

In [None]:
def make_pseudo_set(cutoff):
    test_df['pred'] = y_pred

    pseudo_set_1 = test_df[test_df['pred'] > cutoff]
    pseudo_set_1['smoking'] = 1
    pseudo_set_1.drop(['pred'], axis=1, inplace=True)

    pseudo_set_2 = test_df[test_df['pred'] < 1-cutoff]
    pseudo_set_2['smoking'] = 0
    pseudo_set_2.drop(['pred'], axis=1, inplace=True)

    pseudo_df = pd.concat([pseudo_set_1,pseudo_set_2])

    return pseudo_df

In [None]:
pseudo_df = make_pseudo_set(0.95)
pseudo_train_df = pd.concat([train_df, pseudo_df])

X_pseudo_train = pseudo_train_df.drop(['smoking'], axis=1)
y_pseudo_train = pseudo_train_df['smoking']
 
pseudo_model = xgb.XGBClassifier(**best_xgb_hyperparams)
pseudo_model.fit(X_pseudo_train, y_pseudo_train)

submission_df = pd.read_csv('../submission/sample_submission.csv')
submission_df['smoking'] =  pseudo_model.predict_proba(test_df.drop(['smoking'], axis=1))[:,1]
submission_df.to_csv('../submission/pseudo_xgb_submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pseudo_set_1['smoking'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pseudo_set_1.drop(columns=['pred'], axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pseudo_set_2['smoking'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/sta