In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import lightgbm as lgbm
import xgboost
import multiprocessing

from sklearn.neighbors import KNeighborsRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [11]:
# Number of CPUs reported for this machine; referenced later when
# configuring how many workers the tuned model may use.
num_cores = multiprocessing.cpu_count()
# Bare expression so the notebook displays the value.
num_cores

4

In [3]:
# Load the dataset from a path relative to the notebook's working directory.
# At this point the frame still contains the "Target" column alongside the
# features (see the preview below).
X_train = pd.read_csv("./Dataset/CompressedData.csv")
# Preview the first rows.
X_train.head()

Unnamed: 0,Count,Open,High,Low,Close,Volume,VWAP,new_date,Asset_ID,Target
0,-0.420178,-0.231602,-0.231818,-0.231463,-0.2316,-0.163735,-0.231576,2018-01-01,0,0.000148
1,-0.406696,-0.231558,-0.231774,-0.23142,-0.231556,-0.163475,-0.231532,2018-01-02,0,0.000393
2,-0.41285,-0.231527,-0.231744,-0.231389,-0.231525,-0.163609,-0.231501,2018-01-03,0,0.000549
3,-0.411005,-0.231471,-0.231686,-0.231333,-0.231469,-0.163602,-0.231445,2018-01-04,0,6e-06
4,-0.389464,-0.230984,-0.231194,-0.230852,-0.230981,-0.162833,-0.230958,2018-01-05,0,0.005618


In [4]:
%%time

from sklearn.model_selection import cross_validate

def cross_validate_manual(X, y, model, cv=5):
    """Return the mean cross-validated negative MSE of ``model``.

    Args:
        X: Feature matrix passed to ``cross_validate``.
        y: Target values aligned with ``X``.
        model: Estimator to evaluate.
        cv: Cross-validation strategy forwarded to ``cross_validate``
            (number of folds or a splitter object). Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        The mean over all folds of the ``neg_mean_squared_error`` test
        score. Because the score is a negated error, callers take the
        absolute value to obtain the MSE.
    """
    fold_scores = cross_validate(
        estimator=model,
        X=X,
        y=y,
        # Kept as a list so the result key stays "test_neg_mean_squared_error".
        scoring=["neg_mean_squared_error"],
        cv=cv,
    )

    return fold_scores["test_neg_mean_squared_error"].mean()

CPU times: user 12 µs, sys: 2 µs, total: 14 µs
Wall time: 17.6 µs


In [5]:
# Split the loaded frame into the regression target (y_train) and the
# feature matrix (X_train).
#
# The guard keeps the cell idempotent: once "Target" has been removed from
# X_train, re-running the cell is a no-op instead of raising a KeyError.
# The drop returns a new frame rather than mutating in place.
if "Target" in X_train.columns:
    y_train = X_train["Target"]
    X_train = X_train.drop(
        columns=["Target", "new_date", "High", "Low", "Close", "Volume"]
    )
X_train.head()

Unnamed: 0,Count,Open,VWAP,Asset_ID
0,-0.420178,-0.231602,-0.231576,0
1,-0.406696,-0.231558,-0.231532,0
2,-0.41285,-0.231527,-0.231501,0
3,-0.411005,-0.231471,-0.231445,0
4,-0.389464,-0.230984,-0.230958,0


In [6]:
# Baseline: LightGBM regressor with its default hyperparameters.
model_lgbm = LGBMRegressor()
# Mean negative MSE over the cross-validation folds.
mean_lgbm = cross_validate_manual(X_train, y_train, model_lgbm)
# np.abs turns the negated score back into a positive error.
# NOTE: 10e6 is 1e7 (not 1e6), so the printed value is the MSE scaled by 1e7.
print("LGBM: ", np.abs(10e6*mean_lgbm))

LGBM:  2.7157851030033764


In [7]:
# Display the hyperparameters of the baseline regressor for reference
# before tuning.
model_lgbm.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': 'warn',
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [13]:
import optuna.integration.lightgbm as lgb
def objective(trial):
    """Optuna objective: cross-validated error of one LightGBM configuration.

    Samples a hyperparameter set from ``trial``, builds an ``LGBMRegressor``
    from it and scores it with ``cross_validate_manual`` on the module-level
    ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Absolute value of the mean cross-validated negative MSE, multiplied
        by ``10e6`` (i.e. 1e7) -- the same scaling used for the baseline
        score, so the numbers are directly comparable. Lower is better.
    """
    # Keys are the scikit-learn constructor names of LGBMRegressor; the
    # strings passed to trial.suggest_* keep the LightGBM-native names so
    # the parameter names recorded by the study are unchanged.
    params = {
        # "num_jobs" is not a LightGBM parameter; the worker count is
        # "n_jobs". Keep at least one worker on single-core machines.
        "n_jobs": max(1, num_cores - 1),
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int(
            "min_data_in_leaf", 200, 10000, step=100
        ),
        "reg_alpha": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "reg_lambda": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_split_gain": trial.suggest_float("min_gain_to_split", 0, 15),
        # Upper bound is 0.9 so the range is an exact multiple of the step
        # (with 0.95 Optuna warns and truncates to 0.9 anyway).
        "subsample": trial.suggest_float("bagging_fraction", 0.2, 0.9, step=0.1),
        # Row subsampling is only applied when the bagging frequency is > 0.
        "subsample_freq": 1,
    }

    # Pass the sampled parameters to the model; without this every trial
    # would evaluate the same default regressor.
    candidate = LGBMRegressor(**params)
    mean_neg_mse = cross_validate_manual(X_train, y_train, candidate)

    return np.abs(10e6 * mean_neg_mse)