In [3]:
import numpy as np
import pandas as pd

import joblib
import optuna
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

SEED = 578

In [41]:
train_df = pd.read_csv('../data/train.csv', index_col=0)
test_df = pd.read_csv('../data/test.csv', index_col=0)

# optuna hyperparameter optimization

In [None]:
def optimize_hyperparameters(objective, n_trials):
    optuna.logging.set_verbosity(optuna.logging.CRITICAL)
    study = optuna.create_study(direction = "maximize")
    study.optimize(objective, n_trials=n_trials)
    print("Best Score:", study.best_value)
    print("Best Parameters:", study.best_params)
    return study

def plot_study(study):
    optuna.visualization.plot_optimization_history(study)
    optuna.visualization.plot_parallel_coordinate(study)
    optuna.visualization.plot_contour(study)
    optuna.visualization.plot_param_importances(study)

## LGBM

In [None]:
def lgbm_objective(trial):
    hyperparams = {
        'n_estimators' : trial.suggest_int('n_estimators',500,1000),
        "max_depth":trial.suggest_int('max_depth',3,50),
        "learning_rate" : trial.suggest_float('learning_rate',1e-4, 0.25, log=True),
        "min_child_weight" : trial.suggest_float('min_child_weight', 0.5,4),
        "min_child_samples" : trial.suggest_int('min_child_samples',1,250),
        "subsample" : trial.suggest_float('subsample', 0.2, 1),
        "subsample_freq" : trial.suggest_int('subsample_freq',0,5),
        "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
        'num_leaves' : trial.suggest_int('num_leaves', 2, 128),
    }
    
    X = train_df.drop(['smoking'], axis=1)
    y = train_df['smoking']

    lgbm_model = lgb.LGBMClassifier(**hyperparams, random_state=SEED, device="gpu")

    aucs = cross_val_score(lgbm_model, X, y, cv = 5, scoring='roc_auc')
    auc_mean = aucs.mean()

    print("AUCs:", aucs)
    print("AUC Mean:", auc_mean)

    return auc_mean

In [None]:
lgbm_study = optimize_hyperparameters(lgbm_objective, 10)

In [None]:
best_lgbm_hyperparams =  {
    'n_estimators'          : 2048,
    'max_depth'             : 9,
    'learning_rate'         : 0.05,
    'booster'               : 'gbtree',
    'subsample'             : 0.75,
    'colsample_bytree'      : 0.30,
    'reg_lambda'            : 1.00,
    'reg_alpha'             : 1.00,
    'gamma'                 : 1.00,
    'objective'             : 'binary:logistic',
    'tree_method'           : 'hist',
    'eval_metric'           : 'auc',
    'n_jobs'                : -1
}

## XGB

In [3]:
def xgb_objective(trial):
    hyperparams = {
        'n_estimators' : trial.suggest_int('n_estimators',500,750),
        'max_depth':  trial.suggest_int('max_depth',3,50),
        'min_child_weight': trial.suggest_float('min_child_weight', 2,50),
        "learning_rate" : trial.suggest_float('learning_rate',1e-4, 0.2,log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 1),
        'gamma': trial.suggest_float("gamma", 1e-4, 1.0),
        "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
        "colsample_bylevel" : trial.suggest_float('colsample_bylevel',0.2,1),
        "colsample_bynode" : trial.suggest_float('colsample_bynode',0.2,1),
    }

    X = train_df.drop(['smoking'], axis=1)
    y = train_df['smoking']

    xgb_model = xgb.XGBClassifier(**hyperparams, random_state=SEED, device="gpu")

    aucs = cross_val_score(xgb_model, X, y, cv = 5, scoring='roc_auc')
    auc_mean = aucs.mean()

    print("AUCs:", aucs)
    print("AUC Mean:", auc_mean)
 
    # folds = joblib.load('../data/fold/5_stratifiedkfolds.jl')
    # fold_aucs = []
    # fold = 1

    # for train_index, val_index in folds:
    #     print(f'Training Fold {fold}: ...\n')
    #     X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    #     y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    #     xgb_model.fit(X_train,
    #               y_train,
    #               eval_set = [(X_val, y_val)],
    #               verbose = 0)
        
    #     # best_iteration = model.get_booster().best_ntree_limit

    #     y_pred = model.predict_proba(X_val)[:,1] # ntree_limit=best_iteration
        
    #     fold_auc = roc_auc_score(y_val, y_pred)
    #     fold_aucs.append(fold_auc)
    #     fold += 1 

    # fold_auc_mean = sum(fold_aucs) / len(fold_aucs)

    # print("Hyperparameters:", hyperparams)
    # print("AUCs:", fold_aucs)
    # print("AUC Mean:", fold_auc_mean)

    return auc_mean

In [4]:
xgb_study = optimize_hyperparameters(xgb_objective, 10)

  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
  "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),


Training Fold 1: ...



KeyboardInterrupt: 

In [42]:
best_xgb_hyperparams =  {
    'n_estimators'          : 2048,
    'max_depth'             : 9,
    'learning_rate'         : 0.05,
    'booster'               : 'gbtree',
    'subsample'             : 0.75,
    'colsample_bytree'      : 0.30,
    'reg_lambda'            : 1.00,
    'reg_alpha'             : 1.00,
    'gamma'                 : 1.00,
    'random_state'          : SEED,
    'objective'             : 'binary:logistic',
    'tree_method'           : 'hist',
    'eval_metric'           : 'auc',
    'n_jobs'                : -1
}

# train

In [None]:
X_train = train_df.drop(['smoking'], axis=1)
y_train = train_df['smoking']

## LGBM

In [None]:
lgbm_model = xgb.XGBClassifier(**best_lgbm_hyperparams)
lgbm_model.fit(X_train, y_train)

## XGB

In [43]:
xgb_model = xgb.XGBClassifier(**best_xgb_hyperparams)
xgb_model.fit(X_train, y_train)

# predict

In [None]:
final_model = xgb_model

In [None]:
X_test = test_df
y_pred = final_model.predict_proba(X_test)[:,1]

# submit

In [44]:
submission_df = pd.read_csv('../submission/sample_submission.csv')
submission_df['smoking'] = y_pred
submission_df.to_csv(f'../submission/{final_model}_submission.csv', index = False)

# pseudo-labels

In [45]:
def make_pseudo_set(cutoff):
    test_df['pred'] = y_pred

    pseudo_set_1 = test_df[test_df['pred'] > cutoff]
    pseudo_set_1['smoking'] = 1
    pseudo_set_1.drop(['pred'], axis=1, inplace=True)

    pseudo_set_2 = test_df[test_df['pred'] < 1-cutoff]
    pseudo_set_2['smoking'] = 0
    pseudo_set_2.drop(['pred'], axis=1, inplace=True)

    pseudo_df = pd.concat([pseudo_set_1,pseudo_set_2])

    return pseudo_df

In [50]:
pseudo_df = make_pseudo_set(0.95)
pseudo_train_df = pd.concat([train_df, pseudo_df])

X_pseudo_train = pseudo_train_df.drop(['smoking'], axis=1)
y_pseudo_train = pseudo_train_df['smoking']
 
pseudo_model = xgb.XGBClassifier(**best_xgb_hyperparams)
pseudo_model.fit(X_pseudo_train, y_pseudo_train)

submission_df = pd.read_csv('../submission/sample_submission.csv')
submission_df['smoking'] =  pseudo_model.predict_proba(test_df.drop(['smoking'], axis=1))[:,1]
submission_df.to_csv('../submission/pseudo_xgb_submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pseudo_set_1['smoking'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pseudo_set_1.drop(columns=['pred'], axis = 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pseudo_set_2['smoking'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/sta