# LightGBM Model

In [None]:
import altair as alt
import os
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, accuracy_score
from sklearn.model_selection import train_test_split, RepeatedKFold

In [None]:
# Seed reused by the cross-validation splitter and the LightGBM tuner below.
RANDOM_SEED = 17

# The project root is taken to be the parent of the current working directory.
_working_dir = os.path.realpath('.')
project_base = os.path.dirname(_working_dir)
print(f'Project base path: {project_base}')

In [None]:
# helper function
def make_kaggle_preds(fitted_model, data, output_file_name):
    """Write predictions for ``data`` to a Kaggle submission CSV.

    The file is saved as
    ``<project_base>/data/kaggle_preds/<output_file_name>`` with two columns:
    ``id`` (0-based row position) and ``label`` (the prediction cast to
    ``int``). The output directory is created if it does not exist yet.

    Args:
        fitted_model: Object exposing ``predict(data)``.
        data: Features passed to ``fitted_model.predict`` unchanged.
        output_file_name: File name (not a full path) of the CSV to create.
    """
    preds = fitted_model.predict(data)
    kaggle_preds_df = pd.DataFrame({'id': range(len(preds)), 'label': preds})
    kaggle_preds_df['label'] = kaggle_preds_df['label'].astype(int)

    # Create the output folder on demand so a fresh checkout does not fail
    # with an OSError on the very first submission.
    output_dir = os.path.join(project_base, 'data', 'kaggle_preds')
    os.makedirs(output_dir, exist_ok=True)
    kaggle_preds_df.to_csv(os.path.join(output_dir, output_file_name), index=False)

### Load Data

In [None]:
def _read_cleaned(file_name):
    """Read one CSV from ``<project_base>/data/cleaned_data``."""
    return pd.read_csv(os.path.join(project_base, 'data', 'cleaned_data', file_name))


# Feature tables: train split, test split, and the set scored for Kaggle.
X_train_processed = _read_cleaned('processed_train_data.csv')
X_test_processed = _read_cleaned('processed_test_data.csv')
X_kaggle_processed = _read_cleaned('processed_kaggle_data.csv')

# Targets for the train and test splits (loaded as DataFrames).
y_train = _read_cleaned('processed_train_y.csv')
y_test = _read_cleaned('processed_test_y.csv')

### LightGBM without Tuning

In [None]:
import lightgbm as lgb

In [None]:
# Alternative hand-picked configuration, kept for reference:
# lgb_clf = LGBMClassifier(class_weight="balanced", n_jobs=-1, max_depth=10, min_child_samples=40, n_estimators=50)
# Baseline: LightGBM classifier with the library's default hyperparameters.
lgb_clf = lgb.LGBMClassifier()
# `y_train` is a DataFrame, so `.values` is a 2-D column vector; flatten it to
# the 1-D label array the estimator expects, which avoids the column-vector
# conversion warning. Assumes the label CSV holds a single column - TODO confirm.
lgb_clf.fit(X_train_processed, y_train.values.ravel())

In [None]:
# Predicted class labels of the untuned baseline for the test-split features.
lgb_test_preds = lgb_clf.predict(X_test_processed)

In [None]:
# Fraction of test rows whose predicted label equals the true label.
test_accuracy = accuracy_score(y_test, lgb_test_preds)
print(f'Accuracy Score:\n\n{test_accuracy}')

In [None]:
# Score the Kaggle feature set with the baseline model and write the
# submission to <project_base>/data/kaggle_preds/lgb_preds.csv.
make_kaggle_preds(fitted_model=lgb_clf, data=X_kaggle_processed, output_file_name='lgb_preds.csv')

### LightGBM with Hyperparameter Tuning

In [None]:


import optuna.integration.lightgbm as lgb
import optuna

In [None]:
# 5-fold cross-validation repeated 5 times, seeded so the splits are repeatable.
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=RANDOM_SEED)

# Fixed LightGBM parameters handed to the tuner.
params = {
    "objective": "binary",
    "metric": "binary_error",
    # "verbosity": -1,
    "boosting_type": "gbdt",
    "seed": RANDOM_SEED,
}

# `binary_error` is an error rate, hence the study minimises it.
study_tuner = optuna.create_study(direction='minimize')
lgb_data_train = lgb.Dataset(X_train_processed, label=y_train)

# Suppress information-only output - otherwise optuna is quite verbose,
# which can be nice, but takes up a lot of space.
optuna.logging.set_verbosity(optuna.logging.WARNING)
num_rounds = 100

# Per-round learning-rate schedule: the first 2% of the rounds use 0.005, the
# remainder 0.001. `reset_parameter` needs exactly one value per boosting
# round, so the length of the second phase is derived by subtraction instead
# of a second int() truncation (which drops a round for e.g. num_rounds=110).
warmup_rounds = int(num_rounds * 0.02)
learning_rates = [0.005] * warmup_rounds + [0.001] * (num_rounds - warmup_rounds)

tuner = lgb.LightGBMTunerCV(
    params,
    lgb_data_train,
    categorical_feature=[],
    study=study_tuner,
    # verbose_eval=False,
    # NOTE(review): early_stopping_rounds equals num_boost_round, so early
    # stopping can never trigger within 100 rounds - confirm this is intended.
    early_stopping_rounds=100,
    time_budget=19800,  # seconds = 5.5 hours; probably won't take that long
    seed=RANDOM_SEED,
    folds=rkf,
    num_boost_round=num_rounds,
    callbacks=[lgb.reset_parameter(learning_rate=learning_rates)],
)

tuner.run()

In [None]:
# Hyperparameters of the best configuration found by the tuner.
print(tuner.best_params)
# Best score is the classification error (the `binary_error` metric).
best_cv_error = tuner.best_score
print(best_cv_error)
# The same result expressed as accuracy.
print(1.0 - best_cv_error)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_predict, cross_val_score

import optuna
from optuna.integration import LightGBMPruningCallback

In [None]:
def objective(trial, X_train, y_train, X_test, y_test):
    """Optuna objective: mean cross-validated log loss of an LGBMClassifier.

    Samples one hyperparameter configuration from ``trial``, scores it with
    5-fold ``cross_val_score`` on the training data and returns the mean log
    loss. Lower is better, so the study must use ``direction="minimize"``.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate values to the pruner.
        X_train: Training features, split into folds by ``cross_val_score``.
        y_train: Training labels aligned with ``X_train``.
        X_test: Features of the evaluation set monitored during fitting
            (early stopping and pruning).
        y_test: Labels aligned with ``X_test``.

    Returns:
        Mean log loss over the cross-validation folds (non-negative float).

    Raises:
        optuna.TrialPruned: Propagated from the pruning callback when the
            pruner decides to stop the trial.
    """
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        # Single-choice categoricals pin a value while still recording it
        # among the trial's parameters.
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }

    lgbm_model = lgb.LGBMClassifier(objective="binary", **param_grid)
    # if GPU is available
    # lgbm_model = lgb.LGBMClassifier(objective="binary", **param_grid, device='gpu')

    # NOTE(review): the evaluation set is the test split, so test data steers
    # early stopping and pruning in every fold; consider a validation split
    # carved out of the training data instead.
    # NOTE(review): `early_stopping_rounds` as a fit() keyword is believed to
    # be unavailable in LightGBM >= 4.0 (use the `lgb.early_stopping`
    # callback there) - confirm against the installed version.
    fit_params = {
        'eval_set': [(X_test, y_test)],
        'eval_metric': 'binary_logloss',
        # 'verbose_eval': -1,
        'early_stopping_rounds': 100,
        'callbacks': [LightGBMPruningCallback(trial, "binary_logloss")],
    }

    # error_score='raise': by default cross_val_score converts any exception
    # raised while fitting into a NaN score, which would swallow the
    # TrialPruned signal from the pruning callback and yield a NaN objective.
    scores = cross_val_score(
        lgbm_model,
        X_train,
        y_train,
        cv=5,
        scoring='neg_log_loss',
        fit_params=fit_params,
        error_score='raise',
    )

    # 'neg_log_loss' scores are negated losses (higher is better). Flip the
    # sign so the returned value is a plain log loss that a minimising study
    # (and the pruner watching "binary_logloss") optimises in the right direction.
    return -np.mean(scores)

In [None]:
def func(trial):
    """Bind the notebook's train/test data to ``objective`` for Optuna."""
    return objective(trial, X_train_processed, y_train, X_test_processed, y_test)


# The direction must agree with the sign convention of the value that
# `objective` returns.
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
study.optimize(func, n_trials=2)

In [None]:
# The study optimises a cross-validated log-loss objective (scoring is
# 'neg_log_loss'), not an RMSE, so label the value accordingly.
print(f"\tBest value (log-loss objective): {study.best_value:.5f}")
print("\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")