In [6]:
import random

import lightgbm as lgb
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
matplotlib.rcParams['font.family']='IPAGothic'

In [7]:
# Assemble the modelling dataframe from the pre-computed pickles.

base_path = "../../data/dataframes/"

# Source table; its "screen_time" and "year" columns are joined into df below.
data = pd.read_pickle(base_path + 'data.pkl')

# Feature tables, one pickle per feature group
# (presumably one-hot encoded, going by the file names).
nomination_onehot = pd.read_pickle(base_path + 'nomination_onehot.pkl')
selected_performers_onehot = pd.read_pickle(base_path + 'selected_performers_onehot.pkl')
selected_directors_onehot = pd.read_pickle(base_path + 'selected_directors_onehot.pkl')
selected_studio_onehot = pd.read_pickle(base_path + 'selected_studio_onehot.pkl')
selected_scriptwriter_onehot = pd.read_pickle(base_path + 'selected_scriptwriter_onehot.pkl')

# Column-wise join of every feature group; "year" is carried along because
# the folds are later selected by year.
feature_frames = [
    nomination_onehot,
    selected_performers_onehot,
    selected_directors_onehot,
    selected_studio_onehot,
    selected_scriptwriter_onehot,
    data["screen_time"],
    data["year"],
]
df = pd.concat(feature_frames, axis=1)

In [None]:
# for name in df.drop("year", axis=1).columns

In [None]:
def standarize_x(x_train, x_test):
    """Standardize features using statistics learned from the training split only.

    Args:
        x_train: Training feature matrix; the scaler is fitted on it.
        x_test: Held-out feature matrix; it is transformed with the scaler
            fitted on ``x_train``, so no test-set statistics are used.

    Returns:
        A ``(std_x_train, std_x_test)`` tuple with the scaled matrices.
    """
    # Use the explicitly imported class: a bare ``import sklearn`` does not
    # guarantee that the ``sklearn.preprocessing`` submodule has been loaded.
    scaler = StandardScaler()
    std_x_train = scaler.fit_transform(x_train)
    std_x_test = scaler.transform(x_test)
    return std_x_train, std_x_test


def custom_train_test_split(start_year=1978, end_year=2020):
    """Yield leave-one-year-out train/test folds built from ``df`` and ``data``.

    For every year in ``range(start_year, end_year)`` the rows of that year
    form the test fold and all remaining rows form the training fold.
    Features come from the module-level ``df`` (without its "year" column);
    labels come from the "prize" column of the module-level ``data``.

    Args:
        start_year: First held-out year (inclusive).
        end_year: End of the held-out range (exclusive).

    Yields:
        ``(std_train_x, std_test_x, train_y, test_y)`` tuples, where the two
        feature arrays have been scaled by ``standarize_x``.
    """
    # The feature table without "year" is identical for every fold: build it once.
    features = df.drop("year", axis=1)
    labels = data["prize"]

    for year in range(start_year, end_year):
        # Each frame is masked by its own "year" column.
        # NOTE(review): this assumes the rows of df and data are aligned — confirm.
        x_is_test = df["year"] == year
        y_is_test = data["year"] == year

        # A year without any rows would produce an empty test fold, which can
        # neither be scaled nor scored; skip it instead of failing mid-search.
        if not x_is_test.any():
            continue

        train_x = features[~x_is_test].values
        test_x = features[x_is_test].values
        train_y = labels[~y_is_test].values
        test_y = labels[y_is_test].values

        std_train_x, std_test_x = standarize_x(train_x, test_x)
        yield (std_train_x, std_test_x, train_y, test_y)

In [None]:
def objective(trial):
    """Optuna objective: pooled leave-one-year-out ROC AUC of a LightGBM regressor.

    Samples one hyper-parameter set from ``trial``, refits the model on every
    fold produced by ``custom_train_test_split`` and scores the held-out
    predictions of all folds together.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        The ROC AUC (positive label ``1``) over the concatenated held-out
        predictions of all folds.
    """
    # Parameter tuning using optuna: the search space.
    param = {
        'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        #'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        #'num_leaves': trial.suggest_int('num_leaves', 2, 1000),
        #'num_threads': trial.suggest_int('num_threads',5, 10),
        #'min_sum_hessian_in_leaf': trial.suggest_int('min_sum_hessian_in_leaf', 1, 10),
        #'reg_alpha': trial.suggest_uniform('reg_alpha', 0., 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0., 1.0),
        #'class_weight': {str(class_name): 'balanced' for class_name in df.drop("year", axis=1).columns}
    }
    # Parameters that only exist for a specific boosting type.
    if param['boosting_type'] == 'dart':
        param['drop_rate'] = trial.suggest_loguniform('drop_rate', 1e-8, 1.0)
        param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
    if param['boosting_type'] == 'goss':
        # other_rate is bounded so that top_rate + other_rate never exceeds 1.
        # NOTE(review): some LightGBM versions refuse to combine GOSS with
        # bagging (bagging_freq > 0 and subsample < 1) — confirm against the
        # installed version.
        param['top_rate'] = trial.suggest_uniform('top_rate', 0.0, 1.0)
        param['other_rate'] = trial.suggest_uniform('other_rate', 0.0, 1.0 - param['top_rate'])

    # Result recorded from an earlier run (with a different search space):
    #   Best trial:
    #   Value: 0.7416173570019723
    #   Params:
    #   bagging_freq: 5
    #   min_data_in_leaf: 17
    #   max_depth: 8
    #   learning_rate: 0.06221834301779217
    #   num_leaves: 17
    #   num_threads: 9
    #   min_sum_hessian_in_leaf: 4

    # ``**param`` passes the sampled values as keyword arguments. A single
    # ``*`` would unpack the dict *keys* as positional arguments, so none of
    # the sampled hyper-parameters would actually reach the model.
    lightgbm_tuna = lgb.LGBMRegressor(
        **param,
        random_state=0,
        verbosity=1,
        bagging_seed=0,
        boost_from_average='true',
        metric='auc',
    )

    # Collect the per-fold outputs and join them once at the end.
    fold_predictions = []
    fold_labels = []
    for train_x, test_x, train_y, test_y in custom_train_test_split():
        lightgbm_tuna.fit(train_x, train_y)
        fold_predictions.append(lightgbm_tuna.predict(test_x))
        fold_labels.append(test_y)

    pred_y_all = np.concatenate(fold_predictions)
    y_true_all = np.concatenate(fold_labels)

    # Score all held-out predictions together rather than averaging per-fold AUCs.
    fpr, tpr, _ = metrics.roc_curve(y_true_all, pred_y_all, pos_label=1)
    return auc(fpr, tpr)

In [None]:
def main(n_trials=100, seed=None):
    """Run the Optuna search over ``objective`` and print the best trial.

    Args:
        n_trials: Number of trials to run.
        seed: Optional seed for the TPE sampler. ``None`` (the default) leaves
            the sampler unseeded, so repeated runs may explore different
            parameters; pass an integer for a reproducible search.
    """
    # TPE is Optuna's default single-objective sampler; it is constructed
    # explicitly only so that a seed can be supplied.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

main()