In [1]:
import numpy as np
import pandas as pd

import joblib
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

SEED = 578
np.random.seed(SEED)

In [2]:
train_df = pd.read_csv('../data/preprocess_train.csv', index_col=0)
valid_df = pd.read_csv('../data/preprocess_valid.csv', index_col=0)
test_df = pd.read_csv('../data/preprocess_test.csv', index_col=0)

# optuna hyperparameter optimization

In [6]:
def optimize_hyperparameters(objective, n_trials):
    optuna.logging.set_verbosity(optuna.logging.CRITICAL)
    study = optuna.create_study(direction = "maximize")
    study.optimize(objective, n_trials=n_trials)
    print("Total Trials:", len(study.trials))
    print("Best Score:", study.best_value)
    print("Best Parameters:", study.best_params)
    return study

def plot_study(study):
    optuna.visualization._get_intermediate_plot(study)
    optuna.visualization._get_slice_plot(study)
    optuna.visualization.plot_optimization_history(study)
    optuna.visualization.plot_parallel_coordinate(study)
    optuna.visualization.plot_contour(study)
    optuna.visualization.plot_param_importances(study)

## LightGBM

In [11]:
def lgbm_objective(trial):
    hyperparams = {
        'n_estimators' : trial.suggest_int('n_estimators',500,1000),
        "max_depth":trial.suggest_int('max_depth',3,50),
        "learning_rate" : trial.suggest_float('learning_rate',1e-4,0.25,log=True),
        "min_child_weight" : trial.suggest_float('min_child_weight',0.5,4),
        "min_child_samples" : trial.suggest_int('min_child_samples',1,250),
        "subsample" : trial.suggest_float('subsample',0.2,1),
        "subsample_freq" : trial.suggest_int('subsample_freq',0,5),
        "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
        'num_leaves' : trial.suggest_int('num_leaves',2,128),
    }
    
    X = train_df.drop(['smoking'], axis=1)
    y = train_df['smoking']

    lgbm_model = lgb.LGBMClassifier(**hyperparams, random_state=SEED)

    # aucs = cross_val_score(lgbm_model, X, y, cv = 5, scoring='roc_auc')
    # auc_mean = aucs.mean()

    # print("AUCs:", aucs)
    # print("AUC Mean:", auc_mean)

    train_folds = joblib.load('../fold/10_train_stratifiedkfolds.jl')
    train_fold_aucs = []

    for train_fold, (train_index, val_index) in enumerate(train_folds):
        print(f'Fold {train_fold} Training: ...\n')
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        lgbm_model.fit(X_train, y_train)

        y_pred = lgbm_model.predict_proba(X_val)[:,1]
        
        train_fold_auc = roc_auc_score(y_val, y_pred)
        train_fold_aucs.append(train_fold_auc)

    train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

    print("AUCs:", train_fold_aucs)
    print("AUC Mean:", train_fold_auc_mean)

    return train_fold_auc_mean

In [12]:
lgbm_study = optimize_hyperparameters(lgbm_objective, 10)

Fold 0 Training: ...

[LightGBM] [Info] Number of positive: 43528, number of negative: 57279
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1523
[LightGBM] [Info] Number of data points in the train set: 100807, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.431795 -> initscore=-0.274530
[LightGBM] [Info] Start training from score -0.274530
Fold 1 Training: ...

[LightGBM] [Info] Number of positive: 43528, number of negative: 57279
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004419 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1525
[LightGBM] [Info] Number of data points in the train set: 100807, 

In [3]:
best_lgbm_hyperparams =  {

}

## XgBoost

In [3]:
def xgb_objective(trial):
    hyperparams = {
        'n_estimators' : trial.suggest_int('n_estimators',500,750),
        'max_depth':  trial.suggest_int('max_depth',3,50),
        'min_child_weight': trial.suggest_float('min_child_weight' 2,50),
        "learning_rate" : trial.suggest_float('learning_rate',1e-4,0.2,log=True),
        'subsample': trial.suggest_float('subsample',0.2,1),
        'gamma': trial.suggest_float("gamma",1e-4,1.0),
        "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
        "colsample_bylevel" : trial.suggest_float('colsample_bylevel',0.2,1),
        "colsample_bynode" : trial.suggest_float('colsample_bynode',0.2,1),
    }

    X = train_df.drop(['smoking'], axis=1)
    y = train_df['smoking']

    xgb_model = xgb.XGBClassifier(**hyperparams, random_state=SEED)
 
    train_folds = joblib.load('../data/fold/10_train_stratifiedkfolds.jl')
    train_fold_aucs = []

    for train_fold, (train_index, val_index) in enumerate(train_folds):
        print(f'Fold {train_fold} Training: ...\n')
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        xgb_model.fit(X_train, y_train)
        
        y_pred = xgb_model.predict_proba(X_val)[:,1]
        
        train_fold_auc = roc_auc_score(y_val, y_pred)
        train_fold_aucs.append(train_fold_auc)

    train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

    print("AUCs:", train_fold_aucs)
    print("AUC Mean:", train_fold_auc_mean)

    return train_fold_auc_mean

In [4]:
xgb_study = optimize_hyperparameters(xgb_objective, 10)

  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
  "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),


Training Fold 1: ...



KeyboardInterrupt: 

In [4]:
best_xgb_hyperparams =  {
    
}

## CatBoost

In [None]:
def cb_objective(trial):
    hyperparams = {
        'iterations': trial.suggest_int('iterations',500,750),
        'depth': trial.suggest_int('depth',3,10),
        'learning_rate': trial.suggest_float('learning_rate',1e-4,0.2,log=True),
        'random_strength': trial.suggest_int('random_strength',0,100),
        'bagging_temperature': trial.suggest_float('bagging_temperature',0,1),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg',3,30),
        'border_count': trial.suggest_int('border_count',32,255),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight',0.01,1.0),
    }
    
    X = train_df.drop(['target_column'], axis=1)
    y = train_df['target_column']
    
    catboost_model = cb.CatBoostClassifier(**hyperparams, random_seed=SEED)

    train_folds = joblib.load('../data/fold/10_train_stratifiedkfolds.jl')
    train_fold_aucs = []
    
    for train_fold, (train_index, val_index) in enumerate(train_folds):
        print(f'Fold {train_fold} Training: ...\n')
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        catboost_model.fit(X_train, y_train)
        
        y_pred = catboost_model.predict_proba(X_val)[:, 1]
        
        train_fold_auc = roc_auc_score(y_val, y_pred)
        train_fold_aucs.append(train_fold_auc)

    train_fold_auc_mean = sum(train_fold_aucs) / len(train_fold_aucs)

    print("AUCs:", train_fold_aucs)
    print("AUC Mean:", train_fold_auc_mean)
    
    return train_fold_auc_mean

In [None]:
cb_study = optimize_hyperparameters(cb_objective, 10)

In [5]:
best_cb_hyperparams =  {

}

# train

In [6]:
X_train = train_df.drop(['smoking'], axis=1)
y_train = train_df['smoking']

X_valid = valid_df.drop(['smoking'], axis=1)
y_valid = valid_df['smoking']

X_test = test_df

In [7]:
models = [
    lgb.LGBMClassifier(**best_lgbm_hyperparams, random_state=SEED),
    xgb.XGBClassifier(**best_xgb_hyperparams, random_state=SEED),
    cb.CatBoostClassifier(**best_cb_hyperparams, random_state=SEED),
]

num_models = len(models)

In [8]:
train_folds = joblib.load('../fold/10_train_stratifiedkfolds.jl')
valid_folds = joblib.load('../fold/10_valid_stratifiedkfolds.jl')

num_folds = 10

In [10]:
preds_train = np.zeros((len(X_train), num_models))
preds_valid = np.zeros((len(X_valid), num_models))
preds_test = np.zeros((len(X_test), num_models))

for train_fold_idx, (t_idx, v_idx) in enumerate(train_folds):
    print(f'Fold {train_fold_idx} Training: ...\n')
    X_t, y_t = X_train.iloc[t_idx], y_train.iloc[t_idx]
    X_v, y_v = X_train.iloc[v_idx], y_train.iloc[v_idx]

    for model_idx, model in enumerate(models):
        print(f'Model {model_idx} Fitting: ...\n')
        model.fit(X_t, y_t)

        print(f'Model {model_idx} Predicting: ...\n')
        pred_train = model.predict_proba(X_v)[:, 1]
        pred_valid = model.predict_proba(X_valid)[:, 1]
        pred_test = model.predict_proba(X_test)[:, 1]

        preds_train[v_idx, model_idx] = pred_train
        preds_valid[:, model_idx] += pred_valid / num_folds
        preds_test[:, model_idx] += pred_test / num_folds

Fold 0 Training: ...

Model 0 Fitting: ...



[LightGBM] [Info] Number of positive: 43528, number of negative: 57279
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049434 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1523
[LightGBM] [Info] Number of data points in the train set: 100807, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.431795 -> initscore=-0.274530
[LightGBM] [Info] Start training from score -0.274530
Model 0 Predicting: ...

Model 1 Fitting: ...

Model 1 Predicting: ...

Model 2 Fitting: ...

Learning rate set to 0.073863
0:	learn: 0.6523191	total: 251ms	remaining: 4m 10s
1:	learn: 0.6191123	total: 301ms	remaining: 2m 30s
2:	learn: 0.5917813	total: 361ms	remaining: 1m 59s
3:	learn: 0.5702701	total: 417ms	remaining: 1m 43s
4:	learn: 0.5532078	total: 476ms	remaining: 1m 34s
5:	learn: 0.5390381	total: 526ms	remaining: 1m 27s
6:	learn: 0.5272255	total: 609ms	remaining: 1m 26s
7:	learn: 0.5171858	tota

In [11]:
for model_idx in range(num_models):
    print(f'Model {model_idx} train AUC: = {roc_auc_score(y_train, preds_train[:, model_idx])} \t valid AUC:  = {roc_auc_score(y_train, preds_valid[:, model_idx])}')

ValueError: Found input variables with inconsistent numbers of samples: [112008, 74673]

In [None]:
TRAIN_RESULTS = {'Dataset': 'train'}
VALID_RESULTS = {'Dataset': 'valid'}

## best model

In [None]:
best_model_score = float('-inf')
best_model_idx = None

for model_idx in range(num_models):
    model_score = roc_auc_score(y_train, preds_train[:, model_idx])
    if model_score > best_score:
        best_score = model_score
        best_model_idx = model_idx

print(f'Best Train Model: {best_model_idx}')
print(f'Best Train Score: {best_score}')

## average

In [None]:
average_score = roc_auc_score(y_train, train_preds.mean(axis = 1))

print(f'Average Score: {average_score}')

## weighted average

In [None]:
best_score = float('-inf')
worst_score = float('inf')

for model_idx in range(num_models):
    globals()[f'model_{model_idx}_score'] = roc_auc_score(y_train, train_preds[:, model_idx])
    if globals()[f'model_{model_idx}_score'] > best_score:
        best_score = globals()[f'model_{model_idx}score']
    if globals()[f'model_{model_idx}_score'] < worst_score:
        worst_score = globals()[f'model_{model_idx}score']

total_weights = 0
for model_idx in range(num_models):
    globals()[f'model_{model_idx}_weight'] = 1 - ((best_score-globals()[f'model_{model_idx}_score'])/(best_score-(worst_score-0.01)))
    total_weights += globals()[f'model_{model_idx}_weight']

weighted_average_train_preds = train_preds.copy()
for model_idx in range(num_models):
    weighted_average_train_preds[:, model_idx] = weighted_average_train_preds[:, model_idx] * globals()[f'model_{model_idx}_weight']

weighted_average_score = np.sum(weighted_average_train_preds, axis=1) / total_weights

print(f'Weighted Average Score: {weighted_average_score}')

## stacking

In [None]:
X_train_new = np.hstack((X_train, OOF_PREDS))
X_valid_new = np.hstack((X_valid, HOLD_PREDS))
X_test_new = np.hstack((X_test, TEST_PREDS))
X_train_new.shape, X_valid_new.shape, X_test_new.shape

N_FOLDS = 5
N_MODELS = 1

skf = StratifiedKFold(n_splits=N_FOLDS, random_state = 42, shuffle = True)

OOF_PREDS_v2 = np.zeros((len(X_train_new), N_MODELS))
HOLD_PREDS_v2 = np.zeros((len(X_valid_new), N_MODELS))
TEST_PREDS_v2 = np.zeros((len(X_test_new), N_MODELS))

for fold, (tr_index, val_index) in enumerate(skf.split(y_train, y_train)):
    print('Start fold {}'.format(fold))
    
    # Select train and valid
    X_tr, X_v = X_train_new[tr_index, :], X_train_new[val_index, :]
    y_tr, y_v = y_train[tr_index], y_train[val_index]
    
    # Models to use
    models = [
        RandomForestClassifier(n_estimators=100, n_jobs=-1)
    ]
    
    # Training-prediction cycle 
    for i, model in enumerate(models):
        print('\tModel {} training'.format(i))
        model.fit(X_tr, y_tr)
        
        print('\tModel {} prediction'.format(i))
        val_pred = model.predict_proba(X_v)[:, 1]
        hold_pred = model.predict_proba(X_valid_new)[:, 1]
        test_pred = model.predict_proba(X_test_new)[:, 1]
        
        print('\tModel {} scores:'.format(i))
        print('\t\tValid score = {:.5f}'.format(roc_auc_score(y_v, val_pred)))
        print('\t\tHoldout score = {:.5f}'.format(roc_auc_score(y_valid, hold_pred)))
        
        OOF_PREDS_v2[val_index, i] = val_pred
        HOLD_PREDS_v2[:, i] += hold_pred / N_FOLDS
        TEST_PREDS_v2[:, i] += test_pred / N_FOLDS
        
    print('='*30)
        
for i in range(N_MODELS):
    print('Model {} scores:'.format(i))
    print('\tOOF score = {:.5f}'.format(roc_auc_score(y_train, OOF_PREDS_v2[:, i])))
    print('\tHoldout score = {:.5f}'.format(roc_auc_score(y_valid, HOLD_PREDS_v2[:, i])))

OOF_RESULTS['Stacking'] = roc_auc_score(y_train, OOF_PREDS_v2[:, i])
HOLD_RESULTS['Stacking'] = roc_auc_score(y_valid, HOLD_PREDS_v2[:, i])

## result

In [None]:
pd.DataFrame(
    columns=['best model', 'average', 'weighted average', 'stacking'],
    data=[best_model_score, average_score, weighted_average_score, stacking_score]
)

# predict

# submit

In [44]:
submission_df = pd.read_csv('../submission/sample_submission.csv')
submission_df['smoking'] = y_pred
submission_df.to_csv(f'../submission/{final_model}_submission.csv', index = False)

# pseudo-labels

In [45]:
def make_pseudo_set(cutoff):
    test_df['pred'] = y_pred

    pseudo_set_1 = test_df[test_df['pred'] > cutoff]
    pseudo_set_1['smoking'] = 1
    pseudo_set_1.drop(['pred'], axis=1, inplace=True)

    pseudo_set_2 = test_df[test_df['pred'] < 1-cutoff]
    pseudo_set_2['smoking'] = 0
    pseudo_set_2.drop(['pred'], axis=1, inplace=True)

    pseudo_df = pd.concat([pseudo_set_1,pseudo_set_2])

    return pseudo_df

In [50]:
pseudo_df = make_pseudo_set(0.95)
pseudo_train_df = pd.concat([train_df, pseudo_df])

X_pseudo_train = pseudo_train_df.drop(['smoking'], axis=1)
y_pseudo_train = pseudo_train_df['smoking']
 
pseudo_model = xgb.XGBClassifier(**best_xgb_hyperparams)
pseudo_model.fit(X_pseudo_train, y_pseudo_train)

submission_df = pd.read_csv('../submission/sample_submission.csv')
submission_df['smoking'] =  pseudo_model.predict_proba(test_df.drop(['smoking'], axis=1))[:,1]
submission_df.to_csv('../submission/pseudo_xgb_submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pseudo_set_1['smoking'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pseudo_set_1.drop(columns=['pred'], axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pseudo_set_2['smoking'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/sta