In [1]:
import os, sys

import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

import statsmodels.api as sm
import seaborn as sns
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance


from six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import os

# from estimator_all_features import _merge_external_data
# from estimator import _merge_external_data

from keras.wrappers.scikit_learn import KerasRegressor

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.models import Sequential
from keras.metrics import RootMeanSquaredError

from xgboost import XGBRegressor, XGBRFRegressor
import xgboost as xgb


In [2]:
import xgboost as xgb
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials


class HPOpt(object):
    """Small hyperopt driver that tunes a regressor on a fixed train/test split.

    The four arrays passed to the constructor are stored unchanged. Every
    objective method fits on ``x_train``/``y_train`` and is scored on
    ``x_test``/``y_test``.

    Each search-space sample (``para``) given to an objective method is a
    mapping with three keys:

    * ``'reg_params'`` -- keyword arguments for the regressor constructor,
    * ``'fit_params'`` -- extra keyword arguments forwarded to ``fit``,
    * ``'loss_func'``  -- callable ``(y_true, y_pred) -> loss`` to minimise.

    NOTE(review): ``lgb_reg`` and ``ctb_reg`` resolve the module-level names
    ``lgb`` and ``ctb`` at call time. No import binding them is visible in
    this notebook (presumably ``lightgbm`` / ``catboost``), so they must be
    imported before those objectives are used -- confirm.
    """

    def __init__(self, x_train, x_test, y_train, y_test):
        """Store the training and evaluation data used by every trial."""
        self.x_train = x_train
        self.x_test  = x_test
        self.y_train = y_train
        self.y_test  = y_test

    def process(self, fn_name, space, trials, algo, max_evals):
        """Run ``fmin`` with the objective method called ``fn_name``.

        Parameters
        ----------
        fn_name : str
            Name of an objective method on this object (e.g. ``'xgb_reg'``).
            An unknown name raises ``AttributeError`` before the search starts.
        space, trials, algo, max_evals
            Forwarded unchanged to ``hyperopt.fmin``.

        Returns
        -------
        tuple
            ``(result, trials)``. On success ``result`` is whatever ``fmin``
            returned. If the search raises an ``Exception``, ``result`` is
            ``{'status': STATUS_FAIL, 'exception': <message>}`` instead.
        """
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            # Keep the same 2-tuple shape as the success path. Returning the
            # bare dict made `results, trials = obj.process(...)` unpack the
            # dict's two keys instead of reporting the failure.
            return {'status': STATUS_FAIL,
                    'exception': str(e)}, trials
        return result, trials

    def xgbrf_reg(self, para):
        """Objective: build an ``xgb.XGBRFRegressor`` from ``para['reg_params']`` and score it."""
        reg = xgb.XGBRFRegressor(**para['reg_params'])
        return self.train_reg(reg, para)

    def xgb_reg(self, para):
        """Objective: build an ``xgb.XGBRegressor`` from ``para['reg_params']`` and score it."""
        reg = xgb.XGBRegressor(**para['reg_params'])
        return self.train_reg(reg, para)

    def lgb_reg(self, para):
        """Objective: build an ``lgb.LGBMRegressor`` from ``para['reg_params']`` and score it."""
        reg = lgb.LGBMRegressor(**para['reg_params'])
        return self.train_reg(reg, para)

    def ctb_reg(self, para):
        """Objective: build a ``ctb.CatBoostRegressor`` from ``para['reg_params']`` and score it."""
        reg = ctb.CatBoostRegressor(**para['reg_params'])
        return self.train_reg(reg, para)

    def train_reg(self, reg, para):
        """Fit ``reg`` on the training data and return the hyperopt result dict.

        Both the training and the evaluation split are passed as ``eval_set``
        (in that order), together with ``para['fit_params']``. The loss is
        ``para['loss_func'](y_test, predictions_on_x_test)``.
        """
        reg.fit(self.x_train, self.y_train,
                eval_set=[(self.x_train, self.y_train), (self.x_test, self.y_test)],
                **para['fit_params'])
        pred = reg.predict(self.x_test)
        loss = para['loss_func'](self.y_test, pred)
        return {'loss': loss, 'status': STATUS_OK}

def power_2(x):
    """Return ``x`` raised to the power 2 using ``np.power`` (element-wise for arrays)."""
    squared = np.power(x, 2)
    return squared

def inv_fct(x):
    """Return the reciprocal ``1 / x`` of the argument."""
    reciprocal = 1 / x
    return reciprocal

In [3]:
# Feature tables from the fasttext directory; `review_id` becomes the row index.
X_train = pd.read_csv(
    '../embedded_data/fasttext/X_train_fasttext.csv',
    index_col=['review_id'],
)
X_test = pd.read_csv(
    '../embedded_data/fasttext/X_test_fasttext.csv',
    index_col=['review_id'],
)

# Targets are stored as NumPy arrays alongside the features.
y_train = np.load('../embedded_data/fasttext/y_train_fasttext.npy')
y_test = np.load('../embedded_data/fasttext/y_test_fasttext.npy')

# For every entry equal to 1, keep its index along axis 1.
# Presumably the targets are one-hot rows, so this yields one class id per
# sample -- TODO confirm each row contains exactly one 1.
y_train = np.nonzero(y_train == 1)[1]
y_test = np.nonzero(y_test == 1)[1]

In [5]:
from hyperopt import hp

# XGB parameters
# Only `n_estimators` is searched at the moment; the other dimensions are
# kept below, commented out, for later re-activation.
xgb_reg_params = {
#     'learning_rate':    hp.choice('learning_rate',    np.arange(0.01, 0.7, 0.01)),
#     'max_depth':        hp.choice('max_depth',        np.arange(5, 16, 1, dtype=int)),
#     'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
#     'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.5, 1.0, 0.1)),
#     'subsample':        hp.uniform('subsample', 0.6, 1),
    'n_estimators':     hp.choice('n_estimators', np.arange(1000, 8000)),
}

# Extra keyword arguments forwarded to `fit()` on every trial.
# NOTE(review): newer XGBoost releases expect `eval_metric` and
# `early_stopping_rounds` on the estimator constructor rather than on
# `fit()` -- confirm against the installed version.
xgb_fit_params = {
    'eval_metric': 'rmse',
    'early_stopping_rounds': 10,
    'verbose': True
}

# Search-space sample layout expected by the HPOpt objective methods.
xgb_para = dict()
xgb_para['reg_params'] = xgb_reg_params
xgb_para['fit_params'] = xgb_fit_params
xgb_para['loss_func'] = lambda y, pred: np.sqrt(mean_squared_error(y, pred))  # RMSE


obj = HPOpt(X_train, X_test, y_train, y_test)
# Dispatch to the XGBoost objective: the space above holds XGBoost parameters,
# and `lgb_reg` needs an `lgb` name that this notebook never imports.
xgb_opt = obj.process(fn_name='xgb_reg', space=xgb_para, trials=Trials(), algo=tpe.suggest, max_evals=100)
results, trials = xgb_opt

# For `hp.choice` dimensions fmin reports the index of the selected option,
# so the printed `n_estimators` is an offset into np.arange(1000, 8000).
print(results)

[0]	validation_0-rmse:1.63734	validation_1-rmse:1.65911

[1]	validation_0-rmse:1.36972	validation_1-rmse:1.41097

[2]	validation_0-rmse:1.20373	validation_1-rmse:1.26951

[3]	validation_0-rmse:1.09980	validation_1-rmse:1.18725

[4]	validation_0-rmse:1.03375	validation_1-rmse:1.14436

[5]	validation_0-rmse:0.99056	validation_1-rmse:1.11761

[6]	validation_0-rmse:0.95536	validation_1-rmse:1.10555

[7]	validation_0-rmse:0.93181	validation_1-rmse:1.09575

[8]	validation_0-rmse:0.90831	validation_1-rmse:1.09122

[9]	validation_0-rmse:0.89093	validation_1-rmse:1.08625

[10]	validation_0-rmse:0.87325	validation_1-rmse:1.08481

[11]	validation_0-rmse:0.85849	validation_1-rmse:1.08118

[12]	validation_0-rmse:0.84479	validation_1-rmse:1.07802

[13]	validation_0-rmse:0.83056	validation_1-rmse:1.07745

[14]	validation_0-rmse:0.81794	validation_1-rmse:1.07667

[15]	validation_0-rmse:0.81185	validation_1-rmse:1.07808

[16]	validation_0-rmse:0.80129	validation_1-rmse:1.07778

[17]	validation_0-rmse:0




[0]	validation_0-rmse:1.63734	validation_1-rmse:1.65911                          

[1]	validation_0-rmse:1.36972	validation_1-rmse:1.41097                          

[2]	validation_0-rmse:1.20373	validation_1-rmse:1.26951                          

[3]	validation_0-rmse:1.09980	validation_1-rmse:1.18725                          

[4]	validation_0-rmse:1.03375	validation_1-rmse:1.14436                          

[5]	validation_0-rmse:0.99056	validation_1-rmse:1.11761                          

[6]	validation_0-rmse:0.95536	validation_1-rmse:1.10555                          

[7]	validation_0-rmse:0.93181	validation_1-rmse:1.09575                          

[8]	validation_0-rmse:0.90831	validation_1-rmse:1.09122                          

[9]	validation_0-rmse:0.89093	validation_1-rmse:1.08625                          

[10]	validation_0-rmse:0.87325	validation_1-rmse:1.08481                         

[11]	validation_0-rmse:0.85849	validation_1-rmse:1.08118                         

[12]




[0]	validation_0-rmse:1.63734	validation_1-rmse:1.65911                          

[1]	validation_0-rmse:1.36972	validation_1-rmse:1.41097                          

[2]	validation_0-rmse:1.20373	validation_1-rmse:1.26951                          

[3]	validation_0-rmse:1.09980	validation_1-rmse:1.18725                          

[4]	validation_0-rmse:1.03375	validation_1-rmse:1.14436                          

[5]	validation_0-rmse:0.99056	validation_1-rmse:1.11761                          

[6]	validation_0-rmse:0.95536	validation_1-rmse:1.10555                          

[7]	validation_0-rmse:0.93181	validation_1-rmse:1.09575                          

[8]	validation_0-rmse:0.90831	validation_1-rmse:1.09122                          

[9]	validation_0-rmse:0.89093	validation_1-rmse:1.08625                          

[10]	validation_0-rmse:0.87325	validation_1-rmse:1.08481                         

[11]	validation_0-rmse:0.85849	validation_1-rmse:1.08118                         

[12]




[0]	validation_0-rmse:1.63734	validation_1-rmse:1.65911                          

[1]	validation_0-rmse:1.36972	validation_1-rmse:1.41097                          

[2]	validation_0-rmse:1.20373	validation_1-rmse:1.26951                          

[3]	validation_0-rmse:1.09980	validation_1-rmse:1.18725                          

[4]	validation_0-rmse:1.03375	validation_1-rmse:1.14436                          

[5]	validation_0-rmse:0.99056	validation_1-rmse:1.11761                          

[6]	validation_0-rmse:0.95536	validation_1-rmse:1.10555                          

[7]	validation_0-rmse:0.93181	validation_1-rmse:1.09575                          

  3%|▎         | 3/100 [01:50<59:23, 36.73s/trial, best loss: 1.0744430389231314]


KeyboardInterrupt: 

In [None]:
# Fit one XGBoost regressor with fixed hyper-parameters and report its RMSE
# on the held-out split.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.9, learning_rate=0.01, 
                          n_estimators=5000, max_depth=8, min_child_weight=3, subsample=0.8666220148202693, booster='gbtree')

# Use the frames loaded earlier in this notebook. `X_train_data` and
# `X_test_data` are not defined in any visible cell, so the original cell
# raised NameError on a fresh kernel.
xg_reg.fit(X_train, y_train)

preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

# Large square canvas so the per-feature importance bars stay legible.
plt.rcParams['figure.figsize'] = [15, 15]
xgb.plot_importance(xg_reg)
plt.show()