In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

In [2]:
# Read the compressed dataset from the notebook-relative Dataset folder
# and preview the first rows.
X_train = pd.read_csv(filepath_or_buffer = "./Dataset/CompressedData.csv")
X_train.head(n = 5)

Unnamed: 0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,new_date
0,0,12.935323,8.271426,8.281705,8.25924,8.269135,464.922075,8.269348,0.000148,2018-01-01
1,0,20.910552,8.526834,8.540125,8.510583,8.525199,882.242788,8.524557,0.000393,2018-01-02
2,0,17.171209,8.706437,8.718355,8.692207,8.70474,662.049371,8.704213,0.000549,2018-01-03
3,0,20.150885,9.032841,9.052548,9.014046,9.03246,742.490331,9.032244,6e-06,2018-01-04
4,0,31.315202,11.87438,11.924612,11.813422,11.875316,1928.156932,11.870353,0.005618,2018-01-05


In [3]:
# Keep the date column aside (it is re-attached after imputation) and
# remove it from the frame so only the remaining columns are imputed.
new_date = X_train.loc[:, 'new_date']
X_train.drop(columns = "new_date", inplace = True)

In [4]:
# Impute missing values with the per-column median.
# numeric_only=True keeps this working even if a non-numeric column is present.
median = X_train.median(numeric_only = True)
print("Median for column \n" + str(median))

# fillna with a Series matches each median to its column by *label*.
# The previous loop used `median[count]`, i.e. an integer position on a
# label-indexed Series, which is deprecated in recent pandas and would pair
# medians with the wrong columns if `median` ever had fewer entries than
# X_train has columns.
X_train = X_train.fillna(median)

Median for column 
Asset_ID       7.000000
Count         70.543244
Open          17.023201
High          17.043891
Low           17.007079
Close         17.023913
Volume      1840.961048
VWAP          17.015385
Target        -0.000031
dtype: float64


In [5]:
# Count the remaining missing values per column after imputation.
X_train.isna().sum()

Asset_ID    0
Count       0
Open        0
High        0
Low         0
Close       0
Volume      0
VWAP        0
Target      0
dtype: int64

In [6]:
# Re-attach the date column that was set aside before imputation,
# then preview the result.
X_train = pd.concat(objs = [X_train, new_date], axis = "columns")
X_train.head(n = 5)

Unnamed: 0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,new_date
0,0,12.935323,8.271426,8.281705,8.25924,8.269135,464.922075,8.269348,0.000148,2018-01-01
1,0,20.910552,8.526834,8.540125,8.510583,8.525199,882.242788,8.524557,0.000393,2018-01-02
2,0,17.171209,8.706437,8.718355,8.692207,8.70474,662.049371,8.704213,0.000549,2018-01-03
3,0,20.150885,9.032841,9.052548,9.014046,9.03246,742.490331,9.032244,6e-06,2018-01-04
4,0,31.315202,11.87438,11.924612,11.813422,11.875316,1928.156932,11.870353,0.005618,2018-01-05


In [7]:
import lightgbm as lgbm
import xgboost

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge

from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso



from lightgbm import LGBMRegressor


from xgboost import XGBRegressor



In [8]:
%%time

from sklearn.model_selection import cross_validate

def cross_validate_manual(X, y, model, cv = 5, scoring = "neg_mean_squared_error"):
    """Cross-validate `model` on (X, y) and return the mean test score.

    Parameters
    ----------
    X : feature matrix passed straight to sklearn's `cross_validate`.
    y : target values passed straight to sklearn's `cross_validate`.
    model : estimator to evaluate.
    cv : fold specification forwarded to `cross_validate` (default 5, the
        value that used to be hard-coded). A splitter object can be passed
        instead of an int.
        NOTE(review): the rows look date-ordered; confirm whether a
        time-aware splitter would be more appropriate than the default.
    scoring : name of a single sklearn scorer (default
        "neg_mean_squared_error", the value that used to be hard-coded).

    Returns
    -------
    Mean of the per-fold test scores for `scoring`. With the default scorer
    this is the mean *negative* MSE, so values closer to zero are better.
    """
    # Passing the scorer inside a list makes cross_validate key its results
    # as "test_<scorer name>", which is what is read back below.
    results = cross_validate(
        estimator = model,
        X = X,
        y = y,
        scoring = [scoring],
        cv = cv,
    )

    return results["test_" + scoring].mean()

CPU times: user 4 µs, sys: 4 µs, total: 8 µs
Wall time: 10.5 µs


In [9]:
def train_basic_models(X, y):
    """Cross-validate a set of baseline regressors and print one score per model.

    Each model is evaluated with `cross_validate_manual(X, y, model)` and the
    printed figure is log(|mean CV score|). With the default scorer of
    `cross_validate_manual` that is log(MSE), so more negative is better.

    Parameters
    ----------
    X : feature matrix forwarded to `cross_validate_manual`.
    y : target values forwarded to `cross_validate_manual`.

    Returns
    -------
    None. Results are only printed, one line per model.
    """
    # Dict insertion order fixes the evaluation and print order.
    baseline_models = {
        "Linear Regression": LinearRegression(n_jobs = -1),
        "KNN": KNeighborsRegressor(n_neighbors = 10),
        # Seeded like the random forest so the reported score is
        # reproducible across runs (the tree breaks split ties randomly).
        "Decision Tree": DecisionTreeRegressor(random_state = 42),
        "Random Forest": RandomForestRegressor(random_state = 42, n_jobs = -1),
        "XGB": XGBRegressor(),
        "LGBM": LGBMRegressor(),
    }

    for name, model in baseline_models.items():
        mean_score = cross_validate_manual(X, y, model)
        # abs() removes the sign of the (negative) score before taking the log.
        print(name + ": ", np.log(np.abs(mean_score)))

In [10]:
# Split off the regression target, then strip both the target and the
# date column so X_train holds only the model features.
y_train = X_train.loc[:, 'Target']
X_train.drop(columns = ["Target", 'new_date'], inplace = True)

In [11]:
# Preview the feature matrix that will be fed to the models.
X_train.head(n = 5)

Unnamed: 0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP
0,0,12.935323,8.271426,8.281705,8.25924,8.269135,464.922075,8.269348
1,0,20.910552,8.526834,8.540125,8.510583,8.525199,882.242788,8.524557
2,0,17.171209,8.706437,8.718355,8.692207,8.70474,662.049371,8.704213
3,0,20.150885,9.032841,9.052548,9.014046,9.03246,742.490331,9.032244
4,0,31.315202,11.87438,11.924612,11.813422,11.875316,1928.156932,11.870353


In [12]:
# Evaluate every baseline model on the prepared features and target.
train_basic_models(X = X_train, y = y_train)

Linear Regression:  -14.899936218461665
KNN:  -15.197901153650024
Decision Tree:  -13.089956037737556
Random Forest:  -14.593374354339215
XGB:  -15.195465742257694
LGBM:  -15.040991139600022


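KNN and XGBoost have almost the same cross-validated MSE (the values printed above are log(MSE): about -15.198 for KNN vs. -15.195 for XGBoost). KNN is marginally lower, but since XGBoost has more parameters that we can tune, we plan to take that model forward.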