In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import lightgbm as lgbm
import xgboost

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge

from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso

from lightgbm import LGBMRegressor

from xgboost import XGBRegressor

In [2]:
# Read the prepared training set and discard the "Unnamed: 0" column
# (presumably an index column written by an earlier to_csv -- TODO confirm).
X_train = pd.read_csv("./Dataset/train_final.csv").drop(columns="Unnamed: 0")
X_train.head()

Unnamed: 0,Count,Open,Volume,Asset_ID,Target,Range_Close_Open,Range_High_Low
0,-0.420178,-0.231602,-0.163735,0,0.000148,0.009311,-0.106561
1,-0.406696,-0.231558,-0.163475,0,0.000393,0.009823,-0.106239
2,-0.41285,-0.231527,-0.163609,0,0.000549,0.009775,-0.106443
3,-0.411005,-0.231471,-0.163602,0,6e-06,0.010801,-0.105861
4,-0.389464,-0.230984,-0.162833,0,0.005618,0.011829,-0.102587


In [3]:
# Split the target off the feature frame: pop() returns the "Target" column
# and removes it from X_train in the same step.
y_train = X_train.pop("Target")

In [4]:
%%time

from sklearn.model_selection import cross_validate

def cross_validate_manual(X, y, model):
    """Score ``model`` with 5-fold cross-validation and average the result.

    Parameters
    ----------
    X : feature data, forwarded unchanged to ``cross_validate``.
    y : target values, forwarded unchanged to ``cross_validate``.
    model : estimator to evaluate.

    Returns
    -------
    The mean of the per-fold ``neg_mean_squared_error`` test scores. The
    scorer reports the negated MSE, so values closer to zero are better.
    """
    metric = "neg_mean_squared_error"
    fold_results = cross_validate(model, X, y, scoring=[metric], cv=5)
    per_fold_scores = fold_results["test_" + metric]
    return per_fold_scores.mean()

CPU times: user 6 µs, sys: 5 µs, total: 11 µs
Wall time: 12.6 µs


In [5]:
# Preview the feature frame after the target column has been split off.
X_train.head(n=5)

Unnamed: 0,Count,Open,Volume,Asset_ID,Range_Close_Open,Range_High_Low
0,-0.420178,-0.231602,-0.163735,0,0.009311,-0.106561
1,-0.406696,-0.231558,-0.163475,0,0.009823,-0.106239
2,-0.41285,-0.231527,-0.163609,0,0.009775,-0.106443
3,-0.411005,-0.231471,-0.163602,0,0.010801,-0.105861
4,-0.389464,-0.230984,-0.162833,0,0.011829,-0.102587


In [6]:
def train_basic_models(X, y):
    """Cross-validate a fixed set of baseline regressors and print their errors.

    Each candidate is scored with ``cross_validate_manual`` (mean negated MSE
    over 5 folds). The printed figure is the absolute value of that mean
    multiplied by 1e6, so smaller numbers indicate a better model.

    Parameters
    ----------
    X : feature data, passed through to ``cross_validate_manual``.
    y : target values, passed through to ``cross_validate_manual``.

    Returns
    -------
    None. One ``"<label>:  <scaled error>"`` line is printed per model, in the
    order the candidates are listed below.
    """
    # Display factor for the tiny raw MSE values. This is the same number as
    # the literal 10e5 (10e5 == 1e6), written unambiguously.
    mse_scale = 1e6

    candidates = [
        ("Linear Regression", LinearRegression(n_jobs=-1)),
        ("KNN", KNeighborsRegressor(n_neighbors=10)),
        # Seeded like the Random Forest so the reported score is repeatable:
        # an unseeded tree may break ties between equally good splits
        # differently from one run to the next.
        ("Decision Tree", DecisionTreeRegressor(random_state=42)),
        ("Random Forest", RandomForestRegressor(random_state=42, n_jobs=-1)),
        ("XGB", XGBRegressor()),
        ("LGBM", LGBMRegressor()),
        ("LassoCV", LassoCV(n_jobs=-1, cv=5)),
        ("RidgeCV", RidgeCV(cv=5)),
    ]

    # Print inside the loop so that finished models are reported immediately,
    # even if a later (slower) candidate fails or is interrupted.
    for label, model in candidates:
        mean_score = cross_validate_manual(X, y, model)
        print(f"{label}: ", np.abs(mse_scale * mean_score))

In [7]:
# Cross-validate every baseline model and print its scaled error.
train_basic_models(X=X_train, y=y_train)

Linear Regression:  0.27847453553821033
KNN:  0.2750872586068413
Decision Tree:  0.6172909686943512
Random Forest:  0.242716706232181
XGB:  0.23056290071649205
LGBM:  0.20484413908640328
LassoCV:  0.23906921550956728
RidgeCV:  0.27293100941055715


## Interpretations
* Ranked by the cross-validated mean squared error printed above (scaled by 10^6; lower is better), the three best models are the LGBM, XGB and LassoCV regressors, in that order.
* We will therefore optimize these three models.