In [3]:
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, KFold, cross_val_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
# Load the dataset, discard the 'datetime' and 'Unnamed: 0' columns
# (the latter is presumably a saved index -- confirm), and store
# 'difference' as 32-bit integers.
df = (
    pd.read_csv("../data/datefrom1st.csv")
    .drop(columns=['datetime', 'Unnamed: 0'])
    .astype({'difference': 'int32'})
)

# 'result' is the prediction target; every remaining column is a feature.
y = df['result']
X = df.drop(columns='result')

# Hold out 33% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [5]:
# Pairwise Pearson correlation between the columns of df (target included).
corr = df.corr(method='pearson')

In [6]:

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

In [7]:
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are converted to float NumPy arrays before any arithmetic, so
    plain Python sequences are accepted and pandas objects are compared by
    position rather than aligned on their index labels.

    Parameters
    ----------
    y_true : array-like
        Observed values. A zero entry makes its percentage error undefined,
        in which case the result is ``inf`` or ``nan``.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``.
    """
    # np.asarray drops any pandas index, so a shuffled y_test and a freshly
    # built prediction Series are matched element by element.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100



In [9]:
# Background on the regression error metrics used below:
# https://partrita.github.io/posts/regression-error/
from sklearn.model_selection import cross_validate

# Candidate regressors, keyed by the step label used inside each pipeline.
# Commented-out entries are disabled candidates.
candidates = [
    # ('LR', LinearRegression()),
    ('SVR', SVR()),
    # ('LASSO', Lasso()),
    # ('EN', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('GBM', GradientBoostingRegressor()),
    ('XG', XGBRegressor()),
]

# Each candidate becomes StandardScaler -> estimator, registered under the
# name "Scaled<label>".
pipelines = [
    ('Scaled' + label, Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in candidates
]

# NOTE(review): `results` is never populated in this cell; it is left as-is in
# case a later cell reads it.
results = [[]]
names = []
scoring = ['neg_root_mean_squared_error', 'r2', 'neg_mean_absolute_error']

# One splitter shared by every model. With an integer random_state the folds
# are identical on every call, so building it once changes nothing.
kfold = KFold(shuffle=True, n_splits=10, random_state=21)

for name, model in pipelines:
    # A single cross_validate call fits each fold once and scores that fit
    # with every metric, instead of refitting the model once per metric.
    cv_out = cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
    # Per-fold score arrays, in the same order as `scoring`.
    cv_results = [cv_out['test_' + metric] for metric in scoring]
    names.append(name)

    print("\n=={}==".format(name))
    for metric, scores in zip(scoring, cv_results):
        print("[+]{}".format(metric))
        # mean (standard deviation) across the folds
        print("%f (%f)" % (scores.mean(), scores.std()))
   




==ScaledLR==
[+]neg_root_mean_squared_error
-2.101628 (0.086105)
[+]r2
0.906249 (0.008209)
[+]explained_variance
0.906276 (0.008203)
[+]neg_mean_absolute_error
-1.196618 (0.025096)

==ScaledSVR==
[+]neg_root_mean_squared_error
-1.784116 (0.103419)
[+]r2
0.932305 (0.008226)
[+]explained_variance
0.932325 (0.008230)
[+]neg_mean_absolute_error
-0.759324 (0.023347)

==ScaledLASSO==
[+]neg_root_mean_squared_error
-2.402552 (0.059015)
[+]r2
0.877648 (0.006429)
[+]explained_variance
0.877669 (0.006430)
[+]neg_mean_absolute_error
-1.600232 (0.021680)

==ScaledEN==
[+]neg_root_mean_squared_error
-3.038942 (0.047732)
[+]r2
0.804380 (0.004970)
[+]explained_variance
0.804426 (0.004973)
[+]neg_mean_absolute_error
-2.216858 (0.026504)

==ScaledKNN==
[+]neg_root_mean_squared_error
-1.774588 (0.083280)
[+]r2
0.933133 (0.006503)
[+]explained_variance
0.933177 (0.006526)
[+]neg_mean_absolute_error
-0.819212 (0.026579)

==ScaledCART==
[+]neg_root_mean_squared_error
-2.075226 (0.098994)
[+]r2
0.907474 (0