# Baseline model for single step

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
d_data = pd.read_csv("../data/clean/df_41017.csv")

In [6]:
swh_arr = d_data.swh.values

In [7]:
swh_arr.shape

(32904,)

In [10]:
swh_arr.shape[0] - 6

32898

In [26]:
dataset = np.zeros((32899, 6))

In [27]:
for idx, start in enumerate(range(0, len(swh_arr) - 5)):
    row = swh_arr[start:start+6]
    dataset[idx, :] = row

In [46]:
dataset_nona = dataset[~np.isnan(dataset).any(axis=1)]

### Modeling

In [84]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, make_scorer

In [64]:
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=True)

In [65]:
rmse_score = make_scorer(rmse, greater_is_better=False)

In [57]:
X = dataset_nona[:, :5]
y = dataset_nona[:, 5]

In [58]:
X

array([[2.15, 2.18, 2.32, 2.47, 2.31],
       [2.18, 2.32, 2.47, 2.31, 2.37],
       [2.32, 2.47, 2.31, 2.37, 2.25],
       ...,
       [2.  , 1.84, 2.04, 2.01, 1.99],
       [1.84, 2.04, 2.01, 1.99, 2.11],
       [2.04, 2.01, 1.99, 2.11, 2.15]])

In [59]:
y

array([2.37, 2.25, 2.25, ..., 2.11, 2.15, 2.  ])

Train Test Split

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [79]:
model = SVR()

In [80]:
model.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [81]:
y_pred = model.predict(X_test)

In [82]:
mean_squared_error(y_test, y_pred, squared=True)

0.012503255051962562

KFold

In [85]:
folds = KFold(n_splits=5).split(np.arange(0, len(X)))

In [87]:
scores = np.zeros(5)

In [88]:
for idx, (train, test) in enumerate(folds):
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    
    model = SVR()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=True)
    scores[idx] = rmse

In [89]:
scores.mean(), scores.std()

(0.01294563862181603, 0.0008040770585380799)

Inference with latest data split

In [90]:
model = SVR()

In [91]:
model.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [92]:
y_pred = model.predict(X_test)

In [99]:
y_true_pred = np.concatenate((y_test.reshape(-1, 1), y_pred.reshape(-1,1)), axis=1)

In [101]:
pd.DataFrame(data=y_true_pred, columns=['true', 'pred'])

Unnamed: 0,true,pred
0,2.18,2.192955
1,2.49,2.171414
2,2.17,2.316106
3,2.28,2.253712
4,2.25,2.276997
...,...,...
6096,2.01,1.983925
6097,1.99,1.997016
6098,2.11,1.991488
6099,2.15,2.051199
