In [3]:
import random

import pandas as pd
import numpy as np
import lightgbm as lgb
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix
import matplotlib
import matplotlib.pyplot as plt
from sklearn import metrics
import optuna
matplotlib.rcParams['font.family']='IPAGothic'

In [2]:
# Assemble the modelling dataframe from the pickled frames on disk.

base_path = "../../data/dataframes/"

# Read every pickle in a single pass; the names on the left are bound in the
# same order as the file names listed on the right.
(
    data,
    nomination_onehot,
    selected_performers_onehot,
    selected_directors_onehot,
    selected_studio_onehot,
    selected_scriptwriter_onehot,
) = (
    pd.read_pickle(base_path + pickle_name)
    for pickle_name in (
        'data.pkl',
        'nomination_onehot.pkl',
        'selected_performers_onehot.pkl',
        'selected_directors_onehot.pkl',
        'selected_studio_onehot.pkl',
        'selected_scriptwriter_onehot.pkl',
    )
)

# Column-wise concatenation: the *_onehot frames first, followed by the
# "screen_time" and "year" columns taken from ``data``.
df = pd.concat(
    [
        nomination_onehot,
        selected_performers_onehot,
        selected_directors_onehot,
        selected_studio_onehot,
        selected_scriptwriter_onehot,
        *(data[column] for column in ("screen_time", "year")),
    ],
    axis=1,
)

In [21]:
def standarize_x(x_train, x_test):
    """Standardize features using statistics learned from the training set only.

    A ``StandardScaler`` is fitted on ``x_train`` and that same fitted scaler
    is then applied to ``x_test``, so the test rows never contribute to the
    scaling statistics.

    Args:
        x_train: Training feature matrix, passed to ``fit_transform``.
        x_test: Test feature matrix, passed to ``transform``.

    Returns:
        A ``(std_x_train, std_x_test)`` tuple with the scaled matrices.
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.preprocessing`` is available as an attribute;
    # it only works when some other import happens to have loaded it.
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    std_x_train = scaler.fit_transform(x_train)
    std_x_test = scaler.transform(x_test)
    return std_x_train, std_x_test


def custom_train_test_split(years=range(1978, 2020)):
    """Yield leave-one-year-out train/test folds built from ``df`` and ``data``.

    For each held-out year, the rows of the module-level ``df`` whose ``year``
    equals that year become the test features and all remaining rows become
    the training features (the ``year`` column itself is dropped from both).
    Targets come from the ``prize`` column of the module-level ``data``,
    selected with the same year condition. Features are standardized with
    ``standarize_x``, i.e. the scaler is fitted on the fold's training rows.

    Args:
        years: Iterable of years to hold out, one fold per year. Defaults to
            1978 through 2019 inclusive.

    Yields:
        ``(std_train_x, std_test_x, train_y, test_y)`` for every held-out year
        that has at least one test row in ``df``.
    """
    # Dropping the year column does not depend on the fold, so do it once.
    features = df.drop("year", axis=1)

    for year in years:
        is_test_row = df["year"] == year
        if not is_test_row.any():
            # Nothing to evaluate for this year, and scikit-learn's scaler
            # rejects an empty test matrix, so skip the fold.
            continue

        train_x = features[~is_test_row].values
        test_x = features[is_test_row].values

        # Targets are selected from ``data`` with its own year column, exactly
        # as the features are selected from ``df``.
        is_test_target = data["year"] == year
        train_y = data[~is_test_target]["prize"].values
        test_y = data[is_test_target]["prize"].values

        std_train_x, std_test_x = standarize_x(train_x, test_x)
        yield (std_train_x, std_test_x, train_y, test_y)

In [26]:
def objective(trial):
    """Optuna objective: mean leave-one-year-out ROC AUC of a LightGBM regressor.

    Samples LightGBM hyperparameters from ``trial``, then for every fold
    produced by ``custom_train_test_split`` fits an ``LGBMRegressor`` on the
    training part and scores its predictions on the held-out part with ROC
    AUC (label ``1`` is the positive class).

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The AUC averaged over all folds that could be scored.

    Raises:
        ValueError: If no fold could be scored.
    """
    # Hyperparameter tuning with Optuna. Each value must be a plain scalar:
    # a trailing comma after the call would silently wrap it in a 1-tuple.
    bagging_freq = trial.suggest_int('bagging_freq', 1, 10)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 2, 100)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    # NOTE(review): newer Optuna releases prefer
    # ``suggest_float(..., log=True)``; kept as-is for compatibility.
    learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
    num_leaves = trial.suggest_int('num_leaves', 2, 70)
    # NOTE(review): num_threads controls parallelism rather than model
    # quality -- confirm it is meant to be part of the search space.
    num_threads = trial.suggest_int('num_threads', 1, 10)
    min_sum_hessian_in_leaf = trial.suggest_int('min_sum_hessian_in_leaf', 1, 10)

    lightgbm_tuna = lgb.LGBMRegressor(
        random_state=0,
        verbosity=1,
        bagging_seed=0,
        boost_from_average='true',
        boost='gbdt',
        metric='auc',
        bagging_freq=bagging_freq,
        min_data_in_leaf=min_data_in_leaf,
        max_depth=max_depth,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        num_threads=num_threads,
        min_sum_hessian_in_leaf=min_sum_hessian_in_leaf,
    )

    total_auc = 0.0
    auc_add_count = 0

    for (train_x, test_x, train_y, test_y) in custom_train_test_split():
        if np.unique(test_y).size < 2:
            # ROC AUC is undefined when the held-out fold contains a single
            # class; scoring it would yield NaN and poison the average.
            continue
        lightgbm_tuna.fit(train_x, train_y)
        pred_y = lightgbm_tuna.predict(test_x)
        fpr, tpr, _ = roc_curve(test_y, pred_y, pos_label=1)
        total_auc += auc(fpr, tpr)
        auc_add_count += 1

    if auc_add_count == 0:
        raise ValueError('No fold could be scored: every test fold was empty '
                         'or contained a single class.')

    return total_auc / auc_add_count

In [None]:
def main(n_trials=100, seed=None):
    """Run the Optuna hyperparameter search and print the best trial.

    Creates a study that maximizes ``objective``, runs it for ``n_trials``
    trials, and prints the number of finished trials followed by the value and
    parameters of the best one.

    Args:
        n_trials: Number of trials to run. Defaults to 100.
        seed: Optional seed for a TPE sampler. When ``None`` (the default) no
            sampler is passed and Optuna picks its own default, so repeated
            runs may explore different hyperparameters.
    """
    sampler = None if seed is None else optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    best = study.best_trial

    print('  Value: {}'.format(best.value))

    print('  Params: ')
    for name, chosen in best.params.items():
        print('    {}: {}'.format(name, chosen))

# Launch the search; Optuna logs one line per finished trial.
main()

[I 2019-08-20 12:08:14,708] Finished trial#0 resulted in value: 0.4577380952380953. Current best value is 0.4577380952380953 with parameters: {'bagging_freq': 5, 'min_data_in_leaf': 75, 'max_depth': 6, 'learning_rate': 0.003390775101241283, 'num_leaves': 31, 'num_threads': 3, 'min_sum_hessian_in_leaf': 4}.
[I 2019-08-20 12:08:20,413] Finished trial#1 resulted in value: 0.42083333333333334. Current best value is 0.4577380952380953 with parameters: {'bagging_freq': 5, 'min_data_in_leaf': 75, 'max_depth': 6, 'learning_rate': 0.003390775101241283, 'num_leaves': 31, 'num_threads': 3, 'min_sum_hessian_in_leaf': 4}.
[I 2019-08-20 12:08:22,023] Finished trial#2 resulted in value: 0.7053571428571429. Current best value is 0.7053571428571429 with parameters: {'bagging_freq': 2, 'min_data_in_leaf': 10, 'max_depth': 14, 'learning_rate': 0.005305853988926544, 'num_leaves': 2, 'num_threads': 1, 'min_sum_hessian_in_leaf': 8}.
[I 2019-08-20 12:08:23,504] Finished trial#3 resulted in value: 0.447619047