# Random Forest

In [13]:
import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import matplotlib
matplotlib.rcParams['font.family']='IPAGothic'
import matplotlib.pyplot as plt
import optuna

# Build the model input frame from the pickled tables under base_path.
# NOTE: pd.read_pickle unpickles arbitrary objects; only point it at
# files produced by this project.
base_path = "../data/dataframes/"

data = pd.read_pickle(f"{base_path}data.pkl")
nomination_onehot = pd.read_pickle(f"{base_path}nomination_onehot.pkl")
selected_performers_onehot = pd.read_pickle(
    f"{base_path}selected_performers_onehot.pkl"
)
selected_directors_onehot = pd.read_pickle(
    f"{base_path}selected_directors_onehot.pkl"
)
selected_studio_onehot = pd.read_pickle(
    f"{base_path}selected_studio_onehot.pkl"
)
selected_scriptwriter_onehot = pd.read_pickle(
    f"{base_path}selected_scriptwriter_onehot.pkl"
)

# Column order of the final frame follows the order of this list.
feature_frames = [
    nomination_onehot,
    selected_performers_onehot,
    selected_directors_onehot,
    selected_studio_onehot,
    selected_scriptwriter_onehot,
    data["screen_time"],
    data["year"],
]

# Column-wise concatenation; "year" is kept so that rows can later be
# split by year before it is dropped from the feature matrix.
df = pd.concat(feature_frames, axis=1)

def standarize_x(x_train, x_test):
    """Standardize two feature matrices using statistics from the first one.

    The scaler is fitted on ``x_train`` only and then applied unchanged to
    ``x_test``, so no information from the test rows enters the scaling.

    Args:
        x_train: Feature matrix used to fit the scaler (and transformed).
        x_test: Feature matrix transformed with the already-fitted scaler.

    Returns:
        A tuple ``(std_x_train, std_x_test)`` with the transformed matrices.
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.preprocessing`` is available as an attribute;
    # it previously resolved only because another sklearn import loaded it.
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    std_x_train = scaler.fit_transform(x_train)
    std_x_test = scaler.transform(x_test)
    return std_x_train, std_x_test


def custom_train_test_split(years=None):
    """Yield one standardized train/test split per held-out year.

    For every year in ``years`` the rows whose ``year`` equals that value
    form the test set and all remaining rows form the training set. The
    ``year`` column is removed from the features, and the features are
    standardized with ``standarize_x`` (fitted on the training rows).

    Reads the module-level ``df`` (features plus ``year``) and ``data``
    (source of the ``year`` and ``prize`` columns).

    Args:
        years: Iterable of year values to hold out, one split per value.
            Defaults to ``range(1978, 2020)``, i.e. 1978 through 2019.

    Yields:
        Tuples ``(std_train_x, std_test_x, train_y, test_y)`` where the
        ``*_x`` items are standardized feature matrices and the ``*_y``
        items are the matching ``prize`` values.
    """
    if years is None:
        years = range(1978, 2020)

    # Dropping the column does not depend on the held-out year, so do it once.
    features = df.drop("year", axis=1)

    for year in years:
        # Features are selected via df's year column and targets via data's,
        # mirroring where each of them is stored.
        train_x = features[df["year"] != year].values
        test_x = features[df["year"] == year].values
        train_y = data.loc[data["year"] != year, "prize"].values
        test_y = data.loc[data["year"] == year, "prize"].values
        std_train_x, std_test_x = standarize_x(train_x, test_x)
        yield (std_train_x, std_test_x, train_y, test_y)

In [15]:
def objective(trial):
    """Optuna objective: score one random-forest configuration by ROC AUC.

    Hyper-parameters are sampled from ``trial``; the forest is refitted on
    every split produced by ``custom_train_test_split`` and its predictions
    on the held-out rows are pooled. The pooled predictions are then scored
    against the pooled true values with a single ROC curve.

    Args:
        trial: Optuna trial used to sample ``max_depth``,
            ``min_samples_split``, ``min_samples_leaf``,
            ``min_weight_fraction_leaf`` and ``max_features``.

    Returns:
        Area under the ROC curve of the pooled held-out predictions, with
        label ``1`` treated as the positive class.
    """
    # Only non-default settings are passed. The arguments that merely
    # restated scikit-learn defaults (including ``criterion='mse'`` and
    # ``min_impurity_split=None``, which newer releases reject) are omitted
    # so the same configuration runs on old and new versions alike.
    model = sklearn.ensemble.RandomForestRegressor(
        n_estimators=500,
        max_depth=trial.suggest_int("max_depth", 2, 100),
        # An integer min_samples_split must be at least 2; sampling 1 makes
        # scikit-learn raise and aborts the trial.
        min_samples_split=trial.suggest_int("min_samples_split", 2, 50),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 50),
        min_weight_fraction_leaf=trial.suggest_uniform(
            "min_weight_fraction_leaf", 0.0, 0.5
        ),
        # NOTE(review): a fraction of exactly 0.0 is not a valid value for
        # max_features; hitting it is vanishingly unlikely, but consider a
        # small positive lower bound.
        max_features=trial.suggest_uniform("max_features", 0.0, 0.99999999999),
    )

    # Collect per-fold results and join them once at the end instead of
    # re-allocating the pooled arrays on every fold.
    fold_predictions = []
    fold_targets = []
    for (train_x, test_x, train_y, test_y) in custom_train_test_split():
        model.fit(train_x, train_y)
        fold_predictions.append(model.predict(test_x))
        fold_targets.append(test_y)

    pred_y_all = np.concatenate(fold_predictions)
    y_true_all = np.concatenate(fold_targets)

    fpr, tpr, _thresholds = metrics.roc_curve(
        y_true_all, pred_y_all, pos_label=1
    )
    return metrics.auc(fpr, tpr)

In [None]:
def main():
    """Run a 100-trial Optuna study maximizing ``objective`` and print the result.

    Prints the number of finished trials, then the value and the sampled
    parameters of the best trial.
    """
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    print(f'Number of finished trials: {len(study.trials)}')

    best = study.best_trial
    print('Best trial:')
    print(f'  Value: {best.value}')

    print('  Params: ')
    for name, setting in best.params.items():
        print(f'    {name}: {setting}')

# Start the hyper-parameter search when this cell is executed.
main()

[I 2019-08-20 16:28:00,730] Finished trial#0 resulted in value: 0.719075795998873. Current best value is 0.719075795998873 with parameters: {'max_depth': 6, 'min_samples_split': 28, 'min_samples_leaf': 13, 'min_weight_fraction_leaf': 0.06446203604734574, 'max_features': 0.49842317239665146}.
[I 2019-08-20 16:28:39,596] Finished trial#1 resulted in value: 0.40074668920822765. Current best value is 0.719075795998873 with parameters: {'max_depth': 6, 'min_samples_split': 28, 'min_samples_leaf': 13, 'min_weight_fraction_leaf': 0.06446203604734574, 'max_features': 0.49842317239665146}.
[I 2019-08-20 16:29:16,920] Finished trial#2 resulted in value: 0.4399126514511129. Current best value is 0.719075795998873 with parameters: {'max_depth': 6, 'min_samples_split': 28, 'min_samples_leaf': 13, 'min_weight_fraction_leaf': 0.06446203604734574, 'max_features': 0.49842317239665146}.
[I 2019-08-20 16:30:04,382] Finished trial#3 resulted in value: 0.7287968441814596. Current best value is 0.7287968441