# SVM Forecasting Model

In [1]:
import tseriesRoutines as routines
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import sqlite3
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

In [2]:
# RESULT REPRODUCIBILITY
# Seed NumPy's global random number generator so that NumPy-generated
# random numbers start from a well-defined initial state on every run.
np.random.seed(42)

In [3]:
def genData(mongoid, conn, cursor, impute=True, freq='daily'):
    '''
    Build the per-product time-series dataframe used for modelling.

    mongoid: str. MongoDB id of the product to load.
    conn: sqlite3 connection.
    cursor: sqlite3 cursor.
    impute: forwarded unchanged to ``selectReview3`` (default True).
    freq: forwarded unchanged to ``routines.tsSalesRateSentiment``
        (default 'daily').

    Returns whatever ``routines.tsSalesRateSentiment`` produces for the
    selected product.

    Example:
        product = genData('5aa2ad7735d6d34b0032a795', conn, c, impute=True,
                          freq='daily')
    '''
    # Re-seed NumPy's global RNG on every call so each load starts from
    # the same random state.
    np.random.seed(42)
    reviews = routines.sqlToDf(conn, cursor).selectReview3(mongoid, impute=impute)
    return routines.tsSalesRateSentiment(reviews, freq=freq)

In [4]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    '''
    Re-frame a time series as a supervised-learning table of shifted columns.

    Adapted from
    https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/

    data: pandas DataFrame, or anything ``pd.DataFrame`` accepts (list,
        list of rows, 2-D array, Series). One column per variable, one row
        per time step.
    n_in: int. Number of lagged copies (t-n_in ... t-1) placed first.
    n_out: int. Number of forward copies (t, t+1, ... t+n_out-1) placed after.
    dropnan: bool. Drop the rows that contain NaN introduced by shifting.

    Returns a DataFrame whose columns are named 'var<j>(t-<i>)', 'var<j>(t)'
    and 'var<j>(t+<i>)', where j is the 1-based position of the variable.

    Raises ValueError when both n_in and n_out produce no columns.
    '''
    # Normalise the input first: plain lists and ndarrays have no .shift(),
    # and the variable count must come from the actual 2-D shape.
    df = pd.DataFrame(data)
    n_vars = df.shape[1]
    cols, names = [], []

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]

    if not cols:
        raise ValueError('n_in and n_out must produce at least one column')

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [5]:
#Adj R square
def adj_r2_score(model, X_test, y_test,):
    '''
    Adjusted R^2 of a fitted model on a test set.

    model: fitted estimator exposing ``predict``.
    X_test: 2-D array-like of shape (n_samples, n_features).
    y_test: array-like of length n_samples.

    Returns 1 - (n - 1) / (n - p - 1) * (1 - R^2), with n the number of
    test samples and p the number of columns of X_test.

    Raises ValueError when n - p - 1 is not positive, because the adjustment
    is undefined there.
    '''
    y_pred = model.predict(X_test)
    n_samples = len(y_test)
    # Count predictors from the data itself: estimators such as SVR do not
    # expose an `n_features_` attribute, so reading it from the model fails.
    n_features = np.shape(X_test)[1]
    dof = n_samples - n_features - 1
    if dof <= 0:
        raise ValueError(
            'adjusted R^2 needs more test samples than features + 1 '
            '(got {0} samples, {1} features)'.format(n_samples, n_features))
    adj = 1 - float(n_samples - 1) / dof * \
            (1 - r2_score(y_test, y_pred))
    return adj

# Evaluate a fitted model on held-out data
def evaluate(model, X_test, y_test):
    '''
    Print the mean squared error and R^2 of ``model`` on (X_test, y_test).

    model: fitted estimator exposing ``predict``.
    X_test, y_test: held-out features and targets.

    Returns None; the report goes to stdout.
    '''
    predictions = model.predict(X_test)
    # Adjusted R^2 (adj_r2_score) is not part of this report.
    report = (
        ('MSE: {0}', mean_squared_error(y_test, predictions)),
        ('R^2: {0}', r2_score(y_test, predictions)),
    )
    print('Model Validation')
    for template, value in report:
        print(template.format(value))

In [6]:
def splitDataNN(df, n_in=1, n_out=1, scale=True, percent=0.2):
    '''
    Turn a product dataframe into a chronological train/test split.

    df: pandas dataframe. 3 columns (sales, rating, ovsentiment) with date as index
    n_in: number of lagged steps, forwarded to series_to_supervised.
    n_out: number of forward steps, forwarded to series_to_supervised.
    scale: bool. When True, standardize every column (features and target)
        with a StandardScaler.
    percent: fraction of rows kept as the test set (the last rows, because
        the split is not shuffled).

    Returns (X_train, y_train, X_test, y_test, dftrain, scaler). ``scaler`` is
    the fitted StandardScaler, or None when scale is False.

    X_train, y_train, X_test, y_test, dftrain, scaler = splitDataNN(product,
        n_in=1, n_out=1, scale=True, percent=0.2)
    '''
    dftrain = series_to_supervised(df, n_in=n_in, n_out=n_out)
    # specific to this case: with 3 variables and n_in=1, n_out=1 the columns
    # at positions 4 and 5 are var2(t) and var3(t); dropping them leaves
    # var1(t) as the last column, i.e. the target.
    # NOTE(review): these positions are hard-coded and only match that layout.
    dftrain = dftrain.drop(dftrain.columns[[4, 5]], axis=1)
    values = dftrain.values

    # Always bind `scaler` so the return below works when scale is False.
    scaler = None
    if scale:
        # NOTE(review): the scaler is fitted on all rows before the split, so
        # test-set statistics influence the training data - confirm intent.
        scaler = StandardScaler()
        values = scaler.fit_transform(values)

    # features are every column but the last; the target is the last column
    X, y = values[:, :-1], values[:, -1]
    # train and test set; shuffle=False keeps time order, so no random_state
    # is needed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=percent,
            shuffle=False)
    return X_train, y_train, X_test, y_test, dftrain, scaler

In [7]:
# make connection to sqlite db
# (this connection and cursor are the ones passed to genData() for each product)
conn = sqlite3.connect('product.db')
c = conn.cursor()

# enable foreign keys
# (SQLite applies this pragma per connection, so it is issued right after connecting)
c.execute("PRAGMA foreign_keys = ON")
conn.commit()

<br>Options:
>     2 products in the product.db database with more than 900 reviews:
>         5aa2ad7735d6d34b0032a795
>         5aa39533ae1f941be7165ecd
>     cluster 3
>         5a93e8768cbad97881597597
>         or 
>         5a95d7ae35d6d33d3fea56ff
>     cluster 1
>         5aa2c35e35d6d34b0032a796
>     cluster 2 
>         5a92474635d6d32207bcd343
<br>

## <font color=blue> 1. Mongodb ID: 5aa2ad7735d6d34b0032a795 </font>

In [8]:
# Load this product's 'daily' series, then build the unshuffled split that
# holds out 20% of the rows as the test set. These names are reassigned in
# every product section.
product = genData('5aa2ad7735d6d34b0032a795', conn, c, impute=True, freq='daily')
X_train, y_train, X_test, y_test, dftrain, scaler = splitDataNN(product, percent=0.2)

In [9]:
X_train

array([[-0.50777636,  0.07451271,  0.04161726],
       [ 0.13376664,  0.74302926,  1.25092118],
       [-1.36316702, -1.48535923, -1.16768666],
       ...,
       [ 1.20300496,  0.74302926, -0.36148405],
       [ 0.24069047, -1.48535923, -1.16768666],
       [-1.04239552,  0.74302926,  0.04161726]])

In [9]:
product.shape

(479, 3)

In [10]:
product['Sales'].sum()

41548

In [11]:
product.head()

Unnamed: 0_level_0,Sales,rating,ovsentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-11-06,82,3.5,0.5
2016-11-07,88,5.0,1.0
2016-11-08,74,0.0,0.0
2016-11-09,72,0.0,0.0
2016-11-10,98,4.666667,0.333333


In [12]:
X_train.shape

(382, 3)

In [13]:
from sklearn.svm import SVR
# kernel='poly' is only a starting value: every entry of the grid-search
# parameter grid sets 'kernel' itself ('linear' or 'rbf').
regressor = SVR(kernel='poly')

In [14]:
# Grid Search
# Candidates: a linear kernel over C, and an RBF kernel over C x gamma.
# (GridSearchCV is already imported in the first cell.)
parameters = [
    {'C': [1, 5, 10, 100], 'kernel': ['linear']},
    {'C': [1, 5, 10, 100], 'kernel': ['rbf'],
     'gamma': [0.5, 0.1, 0.01, 0.001, 0.0001]},
]
# Three metrics are recorded per candidate; refit='r2' makes R^2 the metric
# that selects and refits the final estimator.
# NOTE(review): an integer cv means ordinary k-fold on time-ordered rows -
# consider TimeSeriesSplit; confirm intent.
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring=['neg_mean_squared_error', 'r2', 'explained_variance'],
    cv=5,
    n_jobs=-1,
    refit='r2',
    verbose=2,
)

In [15]:
# Run the search on the training split only.
grid_search = grid_search.fit(X_train, y_train)
# With refit='r2', best_score_ is the best mean cross-validated R^2 - not a
# classification accuracy, despite the variable name.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
# The winning configuration, refit on the whole of X_train.
best_estimator = grid_search.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=10, kernel=linear .............................................
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] C=10, kernel=linear .............................................
[CV] C=10, kern

[CV] ..................... C=5, gamma=0.001, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] C=5, gamma=0.001, kernel=rbf ....................................
[CV] ................... C=10, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] ....................... C=1, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ....................... C=5, gamma=0.5, kernel=rbf, total=   0.0s
[CV] C=5, gamma=0.5, kernel=rbf ......................................
[CV] ..................... C=5, gamma=0.001, kernel=rbf, total=   0.0s
[CV] C=5, gamma=0.001, kernel=rbf ....................................
[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] ................... C=10, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] C

[CV] .................... C=10, gamma=0.001, kernel=rbf, total=   0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ..................... C=1, gamma=0.001, kernel=rbf, total=   0.0s
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] .................... C=10, gamma=0.001, kernel=rbf, total=   0.0s
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.0s


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.5s finished


In [16]:
best_parameters

{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}

In [17]:
# k-fold cross validation
# 10-fold CV of the selected estimator on the training split. The scorer is
# negated MSE, so every entry of `accuracies` is <= 0 (closer to 0 is better).
# (cross_val_score is already imported in the first cell.)
accuracies = cross_val_score(
    estimator=best_estimator,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)
accuracies.mean(), accuracies.std()

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] .

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


(-1.0016023554498308, 0.2962978718314309)

In [18]:
evaluate(best_estimator, X_test, y_test)

Model Validation
MSE: 1.0925107112024879
R^2: -0.05184196599132096


In [19]:
y_pred = best_estimator.predict(X_test)
# RMSE on the held-out split. splitDataNN ran with its default scale=True,
# which standardizes the target column too, so this is in standardized
# units rather than raw sales.
np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test.reshape(-1,1)))

1.045232371868805

In [20]:
# MAE of the same held-out predictions (also in standardized units).
mean_absolute_error(y_pred=y_pred, y_true=y_test.reshape(-1,1))

0.8421098318241441

In [21]:
# Persist the refit best estimator for this product.
# NOTE(review): only the model is written here; the StandardScaler returned
# by splitDataNN is not saved in this cell, although the model was trained on
# scaled inputs - confirm the scaler is stored elsewhere.
joblib.dump(best_estimator, './training/forecast_svr_5aa2ad7735d6d34b0032a795.pkl')

['./training/forecast_svr_5aa2ad7735d6d34b0032a795.pkl']

## <font color=blue> 2. Mongodb ID: 5aa39533ae1f941be7165ecd </font>

In [22]:
# Load this product's 'daily' series, then build the unshuffled split that
# holds out 20% of the rows as the test set. These names are reassigned in
# every product section.
product = genData('5aa39533ae1f941be7165ecd', conn, c, impute=True, freq='daily')
X_train, y_train, X_test, y_test, dftrain, scaler = splitDataNN(product, percent=0.2)

In [23]:
# kernel='poly' is only a starting value: every entry of the grid-search
# parameter grid sets 'kernel' itself ('linear' or 'rbf').
regressor = SVR(kernel='poly')

In [24]:
# Grid Search
# Candidates: a linear kernel over C, and an RBF kernel over C x gamma.
# (GridSearchCV is already imported in the first cell.)
parameters = [
    {'C': [1, 5, 10, 100], 'kernel': ['linear']},
    {'C': [1, 5, 10, 100], 'kernel': ['rbf'],
     'gamma': [0.5, 0.1, 0.01, 0.001, 0.0001]},
]
# Three metrics are recorded per candidate; refit='r2' makes R^2 the metric
# that selects and refits the final estimator.
# NOTE(review): an integer cv means ordinary k-fold on time-ordered rows -
# consider TimeSeriesSplit; confirm intent.
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring=['neg_mean_squared_error', 'r2', 'explained_variance'],
    cv=5,
    n_jobs=-1,
    refit='r2',
    verbose=2,
)

In [25]:
# Run the search on the training split only.
grid_search = grid_search.fit(X_train, y_train)
# With refit='r2', best_score_ is the best mean cross-validated R^2 - not a
# classification accuracy, despite the variable name.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
# The winning configuration, refit on the whole of X_train.
best_estimator = grid_search.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=10, kernel=linear .............................................
[CV] C=5, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] C=1, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] C=10, kern

[CV] .................... C=5, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] C=10, gamma=0.5, kernel=rbf .....................................
[CV] ................... C=100, gamma=0.001, kernel=rbf, total=   0.0s
[CV] ...................... C=1, gamma=0.01, kernel=rbf, total=   0.0s
[CV] ..................... C=100, gamma=0.5, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.001, kernel=rbf ..................................
[CV] C=100, gamma=0.5, kernel=rbf ....................................
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] .................... C=10, gamma=0.001, kernel=rbf, total=   0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ...................... C=5, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=5, gamma=0.01, kernel=rbf .....................................
[CV] .................... C=1, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] .

[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ...................... C=10, gamma=0.1, kernel=rbf, total=   0.0s
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.0s


[Parallel(n_jobs=-1)]: Done  36 out of 120 | elapsed:    0.5s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.6s finished


In [26]:
best_parameters

{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}

In [27]:
# k-fold cross validation
# 10-fold CV of the selected estimator on the training split. The scorer is
# negated MSE, so every entry of `accuracies` is <= 0 (closer to 0 is better).
# (cross_val_score is already imported in the first cell.)
accuracies = cross_val_score(
    estimator=best_estimator,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)
accuracies.mean(), accuracies.std()

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


(-0.959302769540539, 0.1025659508332724)

In [28]:
evaluate(best_estimator, X_test, y_test)

Model Validation
MSE: 1.1932164563966103
R^2: 0.0008743733807523579


In [29]:
y_pred = best_estimator.predict(X_test)
# RMSE on the held-out split. splitDataNN ran with its default scale=True,
# which standardizes the target column too, so this is in standardized
# units rather than raw sales.
np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test.reshape(-1,1)))

1.0923444769836164

In [30]:
# MAE of the same held-out predictions (also in standardized units).
mean_absolute_error(y_pred=y_pred, y_true=y_test.reshape(-1,1))

0.8501663999645314

In [31]:
# Persist the refit best estimator for this product.
# NOTE(review): only the model is written here; the StandardScaler returned
# by splitDataNN is not saved in this cell, although the model was trained on
# scaled inputs - confirm the scaler is stored elsewhere.
joblib.dump(best_estimator, './training/forecast_svr_5aa39533ae1f941be7165ecd.pkl')

['./training/forecast_svr_5aa39533ae1f941be7165ecd.pkl']

## <font color=blue> 3. Mongodb ID: 5a93e8768cbad97881597597 </font>

In [32]:
# Load this product's 'daily' series, then build the unshuffled split that
# holds out 20% of the rows as the test set. These names are reassigned in
# every product section.
product = genData('5a93e8768cbad97881597597', conn, c, impute=True, freq='daily')
X_train, y_train, X_test, y_test, dftrain, scaler = splitDataNN(product, percent=0.2)

In [33]:
# kernel='poly' is only a starting value: both grid entries below set
# 'kernel' themselves ('linear' or 'rbf').
regressor = SVR(kernel='poly')

# Grid Search
# Candidates: a linear kernel over C, and an RBF kernel over C x gamma.
# (GridSearchCV is already imported in the first cell.)
parameters = [
    {'C': [1, 5, 10, 100], 'kernel': ['linear']},
    {'C': [1, 5, 10, 100], 'kernel': ['rbf'],
     'gamma': [0.5, 0.1, 0.01, 0.001, 0.0001]},
]
# Three metrics are recorded per candidate; refit='r2' makes R^2 the metric
# that selects and refits the final estimator.
# NOTE(review): an integer cv means ordinary k-fold on time-ordered rows -
# consider TimeSeriesSplit; confirm intent.
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring=['neg_mean_squared_error', 'r2', 'explained_variance'],
    cv=5,
    n_jobs=-1,
    refit='r2',
    verbose=2,
)

In [34]:
# Run the search on the training split only.
grid_search = grid_search.fit(X_train, y_train)
# With refit='r2', best_score_ is the best mean cross-validated R^2 - not a
# classification accuracy, despite the variable name.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
# The winning configuration, refit on the whole of X_train.
best_estimator = grid_search.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=10, kernel=linear .............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] ..........

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.3s


[CV] ..................... C=5, gamma=0.001, kernel=rbf, total=   0.0s
[CV] ...................... C=1, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=5, gamma=0.001, kernel=rbf ....................................
[CV] ...................... C=10, gamma=0.5, kernel=rbf, total=   0.1s
[CV] C=10, gamma=0.5, kernel=rbf .....................................
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ..................... C=10, gamma=0.01, kernel=rbf, total=   0.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ...................... C=5, gamma=0.01, kernel=rbf, total=   0.0s
[CV] .................... C=5, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] C=5, gamma=0.01, kernel=rbf .....................................
[CV] ....................... C=5, gamma=0.5, kernel=rbf, total=   0.1s
[CV] ...................... C=10, gamma=0.1, kernel=rbf, total=   0.1s
[CV] ................... C=10, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] C

[Parallel(n_jobs=-1)]: Done  36 out of 120 | elapsed:    0.8s remaining:    1.9s


[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ....................... C=5, gamma=0.1, kernel=rbf, total=   0.1s
[CV] ..................... C=10, gamma=0.01, kernel=rbf, total=   0.0s
[CV] ..................... C=100, gamma=0.5, kernel=rbf, total=   0.1s
[CV] .................. C=100, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.1s
[CV] C=100, gamma=0.001, kernel=rbf ..................................
[CV] ...................... C=10, gamma=0.1, kernel=rbf, total=   0.1s
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.1s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] ..................... C=100, gamma=0.5, kernel=rbf, total=   0.1s
[CV] C=100, gamma=0.5, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    1.5s finished


In [35]:
best_parameters

{'C': 5, 'gamma': 0.0001, 'kernel': 'rbf'}

In [36]:
# k-fold cross validation
# 10-fold CV of the selected estimator on the training split. The scorer is
# negated MSE, so every entry of `accuracies` is <= 0 (closer to 0 is better).
# (cross_val_score is already imported in the first cell.)
accuracies = cross_val_score(
    estimator=best_estimator,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)
accuracies.mean(), accuracies.std()

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] .

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


(-1.002993266255709, 0.25252971306029276)

In [37]:
evaluate(best_estimator, X_test, y_test)

Model Validation
MSE: 1.1131170264648211
R^2: -0.04062831072066131


In [38]:
y_pred = best_estimator.predict(X_test)
# RMSE on the held-out split. splitDataNN ran with its default scale=True,
# which standardizes the target column too, so this is in standardized
# units rather than raw sales.
np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test.reshape(-1,1)))

1.0550436135368155

In [39]:
# MAE of the same held-out predictions (also in standardized units).
mean_absolute_error(y_pred=y_pred, y_true=y_test.reshape(-1,1))

0.842470383022783

In [40]:
# Persist the refit best estimator for this product.
# NOTE(review): only the model is written here; the StandardScaler returned
# by splitDataNN is not saved in this cell, although the model was trained on
# scaled inputs - confirm the scaler is stored elsewhere.
joblib.dump(best_estimator, './training/forecast_svr_5a93e8768cbad97881597597.pkl')

['./training/forecast_svr_5a93e8768cbad97881597597.pkl']

## <font color=blue> 4. Mongodb ID: 5a95d7ae35d6d33d3fea56ff </font>

In [41]:
# Load this product's 'daily' series, then build the unshuffled split that
# holds out 20% of the rows as the test set. These names are reassigned in
# every product section.
product = genData('5a95d7ae35d6d33d3fea56ff', conn, c, impute=True, freq='daily')
X_train, y_train, X_test, y_test, dftrain, scaler = splitDataNN(product, percent=0.2)

In [42]:
# kernel='poly' is only a starting value: both grid entries below set
# 'kernel' themselves ('linear' or 'rbf').
regressor = SVR(kernel='poly')

# Grid Search
# Candidates: a linear kernel over C, and an RBF kernel over C x gamma.
# (GridSearchCV is already imported in the first cell.)
parameters = [
    {'C': [1, 5, 10, 100], 'kernel': ['linear']},
    {'C': [1, 5, 10, 100], 'kernel': ['rbf'],
     'gamma': [0.5, 0.1, 0.01, 0.001, 0.0001]},
]
# Three metrics are recorded per candidate; refit='r2' makes R^2 the metric
# that selects and refits the final estimator.
# NOTE(review): an integer cv means ordinary k-fold on time-ordered rows -
# consider TimeSeriesSplit; confirm intent.
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring=['neg_mean_squared_error', 'r2', 'explained_variance'],
    cv=5,
    n_jobs=-1,
    refit='r2',
    verbose=2,
)

In [43]:
# Run the search on the training split only.
grid_search = grid_search.fit(X_train, y_train)
# With refit='r2', best_score_ is the best mean cross-validated R^2 - not a
# classification accuracy, despite the variable name.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
# The winning configuration, refit on the whole of X_train.
best_estimator = grid_search.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=10, kernel=linear .............................................
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] .............................. C=10, kernel=linear, total=   0.0s
[CV] C=10, kern

[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ....................... C=1, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] ................... C=100, gamma=0.001, kernel=rbf, total=   0.0s
[CV] ....................... C=5, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=5, gamma=0.1, kernel=rbf ......................................
[CV] C=100, gamma=0.001, kernel=rbf ..................................
[CV] .................... C=1, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....................... C=1, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] .................... C=5, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] .................... C=10, gamma=0.001, kernel=rbf, total=   0.0s
[CV] C=5, gamma=0.0001, kernel=rbf ...................................
[CV] C

[CV] ..................... C=100, gamma=0.5, kernel=rbf, total=   0.1s
[CV] .................. C=100, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................


[Parallel(n_jobs=-1)]: Done  36 out of 120 | elapsed:    0.6s remaining:    1.3s


[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.0s


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.8s finished


In [44]:
best_parameters

{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}

In [45]:
# k-fold cross validation
# 10-fold CV of the selected estimator on the training split. The scorer is
# negated MSE, so every entry of `accuracies` is <= 0 (closer to 0 is better).
# (cross_val_score is already imported in the first cell.)
accuracies = cross_val_score(
    estimator=best_estimator,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)
accuracies.mean(), accuracies.std()

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] .

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


(-1.0317259763003246, 0.17769098162666547)

In [46]:
evaluate(best_estimator, X_test, y_test)

Model Validation
MSE: 0.8759423462726603
R^2: -0.035538879740279805


In [47]:
y_pred = best_estimator.predict(X_test)
# RMSE on the held-out split. splitDataNN ran with its default scale=True,
# which standardizes the target column too, so this is in standardized
# units rather than raw sales.
np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test.reshape(-1,1)))

0.9359179164182403

In [48]:
# MAE of the same held-out predictions (also in standardized units).
mean_absolute_error(y_pred=y_pred, y_true=y_test.reshape(-1,1))

0.7514466197605094

In [49]:
# Persist the refit best estimator for this product.
# NOTE(review): only the model is written here; the StandardScaler returned
# by splitDataNN is not saved in this cell, although the model was trained on
# scaled inputs - confirm the scaler is stored elsewhere.
joblib.dump(best_estimator, './training/forecast_svr_5a95d7ae35d6d33d3fea56ff.pkl')

['./training/forecast_svr_5a95d7ae35d6d33d3fea56ff.pkl']

## <font color=blue> 5. Mongodb ID: 5aa2c35e35d6d34b0032a796 </font>

In [50]:
# Load this product's 'daily' series, then build the unshuffled split that
# holds out 20% of the rows as the test set. These names are reassigned in
# every product section.
product = genData('5aa2c35e35d6d34b0032a796', conn, c, impute=True, freq='daily')
X_train, y_train, X_test, y_test, dftrain, scaler = splitDataNN(product, percent=0.2)

In [51]:
# kernel='poly' is only a starting value: both grid entries below set
# 'kernel' themselves ('linear' or 'rbf').
regressor = SVR(kernel='poly')

# Grid Search
# Candidates: a linear kernel over C, and an RBF kernel over C x gamma.
# (GridSearchCV is already imported in the first cell.)
parameters = [
    {'C': [1, 5, 10, 100], 'kernel': ['linear']},
    {'C': [1, 5, 10, 100], 'kernel': ['rbf'],
     'gamma': [0.5, 0.1, 0.01, 0.001, 0.0001]},
]
# Three metrics are recorded per candidate; refit='r2' makes R^2 the metric
# that selects and refits the final estimator.
# NOTE(review): an integer cv means ordinary k-fold on time-ordered rows -
# consider TimeSeriesSplit; confirm intent.
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring=['neg_mean_squared_error', 'r2', 'explained_variance'],
    cv=5,
    n_jobs=-1,
    refit='r2',
    verbose=2,
)

In [52]:
# Run the search on the training split only.
grid_search = grid_search.fit(X_train, y_train)
# With refit='r2', best_score_ is the best mean cross-validated R^2 - not a
# classification accuracy, despite the variable name.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
# The winning configuration, refit on the whole of X_train.
best_estimator = grid_search.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] ....................... C=1, gamma=0.5, kernel=rbf, total=   0.0s
[CV] C=10, kernel=linear .............................................
[CV] C=5, kerne

[CV] C=5, gamma=0.01, kernel=rbf .....................................
[CV] C=10, gamma=0.5, kernel=rbf .....................................
[CV] ...................... C=1, gamma=0.01, kernel=rbf, total=   0.0s
[CV] ................... C=100, gamma=0.001, kernel=rbf, total=   0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] C=100, gamma=0.001, kernel=rbf ..................................
[CV] ...................... C=10, gamma=0.5, kernel=rbf, total=   0.0s
[CV] .................... C=10, gamma=0.001, kernel=rbf, total=   0.0s
[CV] ..................... C=100, gamma=0.5, kernel=rbf, total=   0.0s
[CV] C=10, gamma=0.5, kernel=rbf .....................................
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ...................... C=1, gamma=0.01, kernel=rbf, total=   0.0s
[CV] ................... C=100, gamma=0.001, kernel=rbf, total=   0.0s
[CV] C

[CV] ..................... C=5, gamma=0.001, kernel=rbf, total=   0.0s
[CV] ....................... C=5, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=5, gamma=0.0001, kernel=rbf ...................................
[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=5, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] .................... C=100, gamma=0.01, kernel=rbf, total=   0.0s


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.2s finished


In [53]:
best_parameters

{'C': 100, 'kernel': 'linear'}

In [54]:
# k-fold cross validation
# 10-fold CV of the selected estimator on the training split. The scorer is
# negated MSE, so every entry of `accuracies` is <= 0 (closer to 0 is better).
# (cross_val_score is already imported in the first cell.)
accuracies = cross_val_score(
    estimator=best_estimator,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)
accuracies.mean(), accuracies.std()

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV]  

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


(-0.8372595422238931, 1.01318188164846)

In [55]:
# Score the tuned estimator on the test split with the notebook's `evaluate`
# helper (defined elsewhere in the notebook).
evaluate(best_estimator, X_test, y_test)

Model Validation
MSE: 1.6452804904425775
R^2: -0.00579370059787343


In [56]:
# Predict on the test split and show the RMSE, computed as the square root
# of mean_squared_error; y_test is reshaped to a single column first.
y_pred = best_estimator.predict(X_test)
np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test.reshape(-1,1)))

1.2826848757362728

In [57]:
# Mean absolute error on the test split, reusing y_pred from the previous cell.
mean_absolute_error(y_pred=y_pred, y_true=y_test.reshape(-1,1))

0.29270053782035244

In [58]:
# Persist the tuned estimator for this product id with joblib.
joblib.dump(best_estimator, './training/forecast_svr_5aa2c35e35d6d34b0032a796.pkl')

['./training/forecast_svr_5aa2c35e35d6d34b0032a796.pkl']

## <font color=blue> 6. Mongodb ID: 5a92474635d6d32207bcd343 </font>

In [59]:
# Build the daily time series for this product id with genData, then split it
# with splitDataNN (defined elsewhere in the notebook).
# NOTE(review): percent=0.2 is presumably the held-out fraction -- confirm in splitDataNN.
product = genData('5a92474635d6d32207bcd343', conn, c, impute=True, freq='daily')
X_train, y_train, X_test, y_test, dftrain, scaler = splitDataNN(product, percent=0.2)

In [60]:
# Base estimator for the search.
# NOTE(review): kernel='poly' is never evaluated, because every entry of the
# parameter grid below sets 'kernel' explicitly ('linear' or 'rbf').
regressor = SVR(kernel='poly')

# Grid search over two sub-grids: linear kernel (4 values of C) and RBF kernel
# (4 values of C x 5 values of gamma), i.e. 24 candidates in total.
# NOTE(review): GridSearchCV is already imported in the notebook's first cell,
# so this re-import is redundant.
from sklearn.model_selection import GridSearchCV
parameters = [{'C':[1, 5, 10, 100], 'kernel':['linear']},
        {'C':[1, 5, 10, 100], 'kernel':['rbf'], 'gamma':[0.5, 0.1, 0.01, 0.001, 0.0001]}
        ]
# Three metrics are recorded for every candidate; refit='r2' selects the
# candidate with the best mean cross-validated R^2 and refits it, which is what
# best_estimator_ / best_params_ / best_score_ then refer to.
grid_search = GridSearchCV(estimator=regressor,
        param_grid=parameters,
        scoring=['neg_mean_squared_error', 'r2', 'explained_variance'],
        cv=5,
        n_jobs=-1,
        refit='r2',
        verbose=2)

In [61]:
# Run the grid search on the training data and keep the winning score,
# parameters and refitted model.
# NOTE: because the search was built with refit='r2', best_score_ is a mean
# cross-validated R^2, not an accuracy, despite the variable name.
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
best_estimator = grid_search.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] C=5, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] C=10, kernel=linear .............................................
[CV] ..........

[CV] C=5, gamma=0.5, kernel=rbf ......................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.0s
[CV] ...................... C=1, gamma=0.01, kernel=rbf, total=   0.0s
[CV] .................. C=100, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ..................... C=10, gamma=0.01, kernel=rbf, total=   0.0s
[CV] .................... C=5, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] C=5, gamma=0.0001, kernel=rbf ...................................
[CV] ....................... C=5, gamma=0.5, kernel=rbf, total=   0.0s
[CV] C=5, gamma=0.5, kernel=rbf ......................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] C=100, gamma=0.001, kernel=rbf ..................................
[CV] ..................... C=100, gamma=0.5, kernel=rbf, total=   0.0s
[CV] ...................... C=5, gamma=0.01, kernel=rbf, total=   0.0s
[CV] ................... C=100, gamma=0.001, kernel=rbf, total=   0.0s
[CV] ...................... C=10, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=5, gamma=0.01, kernel=rbf .....................................
[CV] ...................... C=5, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=5, gamma=0.01, kernel=rbf .....................................
[CV] ...................... C=5, gamma=0.01, kernel=rbf, total=   0.0s


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.2s finished


In [62]:
# Display the hyper-parameter combination selected by the grid search.
best_parameters

{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}

In [63]:
# 10-fold cross-validation of the tuned estimator on the training data.
# NOTE: with scoring='neg_mean_squared_error' the returned values are negated
# MSEs (closer to 0 is better), not accuracies, despite the variable name.
# NOTE(review): cross_val_score is already imported in the notebook's first
# cell, so this re-import is redundant.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=best_estimator, X=X_train, y=y_train, cv=10, n_jobs=-1,
        scoring='neg_mean_squared_error', verbose=2)
# Mean and standard deviation of the per-fold scores.
accuracies.mean(), accuracies.std()

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV]  ................................................................
[CV]  ................................................................
[CV] .

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


(-1.0938157732555889, 0.3955759623341329)

In [64]:
# Score the tuned estimator on the test split with the notebook's `evaluate`
# helper (defined elsewhere in the notebook).
evaluate(best_estimator, X_test, y_test)

Model Validation
MSE: 0.9417633417226808
R^2: -0.050192073302380624


In [65]:
# Predict on the test split and show the RMSE, computed as the square root
# of mean_squared_error; y_test is reshaped to a single column first.
y_pred = best_estimator.predict(X_test)
np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_test.reshape(-1,1)))

0.9704449194687357

In [66]:
# Mean absolute error on the test split, reusing y_pred from the previous cell.
mean_absolute_error(y_pred=y_pred, y_true=y_test.reshape(-1,1))

0.3931393727624891

In [67]:
# Persist the tuned estimator for this product id with joblib.
joblib.dump(best_estimator, './training/forecast_svr_5a92474635d6d32207bcd343.pkl')

['./training/forecast_svr_5a92474635d6d32207bcd343.pkl']

## <font color=blue> 7. Mongodb ID: 5a9347b98cbad97074cb1890 </font>

In [8]:
# Build the daily time series for this product id with genData, then split it
# with splitDataNN (defined elsewhere in the notebook).
# NOTE(review): percent=0.2 is presumably the held-out fraction -- confirm in splitDataNN.
product = genData('5a9347b98cbad97074cb1890', conn, c, impute=True, freq='daily')
X_train, y_train, X_test, y_test, dftrain, scaler = splitDataNN(product, percent=0.2)

In [9]:
# Base estimator for the grid search in the next cell.
# NOTE(review): kernel='poly' is never evaluated, because every entry of that
# parameter grid sets 'kernel' explicitly ('linear' or 'rbf').
from sklearn.svm import SVR
regressor = SVR(kernel='poly')

In [10]:
# Grid search over two sub-grids: linear kernel (4 values of C) and RBF kernel
# (4 values of C x 5 values of gamma), i.e. 24 candidates in total.
# NOTE(review): GridSearchCV is already imported in the notebook's first cell,
# so this re-import is redundant.
from sklearn.model_selection import GridSearchCV
parameters = [{'C':[1, 5, 10, 100], 'kernel':['linear']},
        {'C':[1, 5, 10, 100], 'kernel':['rbf'], 'gamma':[0.5, 0.1, 0.01, 0.001, 0.0001]}
        ]
# Three metrics are recorded for every candidate; refit='r2' selects the
# candidate with the best mean cross-validated R^2 and refits it, which is what
# best_estimator_ / best_params_ / best_score_ then refer to.
grid_search = GridSearchCV(estimator=regressor,
        param_grid=parameters,
        scoring=['neg_mean_squared_error', 'r2', 'explained_variance'],
        cv=5,
        n_jobs=-1,
        refit='r2',
        verbose=2)

In [11]:
# Run the grid search on the training data and keep the winning score,
# parameters and refitted model.
# NOTE: because the search was built with refit='r2', best_score_ is a mean
# cross-validated R^2, not an accuracy, despite the variable name.
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
best_estimator = grid_search.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] C=1, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] C=1, kernel=linear ..............................................
[CV] C=5, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] C=5, kernel=linear ..............................................
[CV] ............................... C=1, kernel=linear, total=   0.0s
[CV] C=5, kernel=linear ..............................................
[CV] C=5, kerne

[CV] C=5, gamma=0.01, kernel=rbf .....................................
[CV] ...................... C=10, gamma=0.5, kernel=rbf, total=   0.0s
[CV] ..................... C=10, gamma=0.01, kernel=rbf, total=   0.0s
[CV] ...................... C=5, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=10, gamma=0.5, kernel=rbf .....................................
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] C=5, gamma=0.01, kernel=rbf .....................................
[CV] ..................... C=10, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ...................... C=10, gamma=0.5, kernel=rbf, total=   0.0s
[CV] ..................... C=10, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=10, gamma=0.5, kernel=rbf .....................................
[CV] ...................... C=5, gamma=0.01, kernel=rbf, total=   0.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] .

[CV] C=100, gamma=0.001, kernel=rbf ..................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ................... C=100, gamma=0.001, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] .................. C=100, gamma=0.0001, kernel=rbf, total=   0.0s
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=   0.0s


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.9s finished


In [12]:
# Display the hyper-parameter combination selected by the grid search.
best_parameters

{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}

In [13]:
# 10-fold cross-validation of the tuned estimator on the training data.
# NOTE: with scoring='neg_mean_squared_error' the returned values are negated
# MSEs (closer to 0 is better), not accuracies, despite the variable name.
# NOTE(review): cross_val_score is already imported in the notebook's first
# cell, so this re-import is redundant.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=best_estimator, X=X_train, y=y_train, cv=10, n_jobs=-1,
        scoring='neg_mean_squared_error', verbose=2)
# Mean and standard deviation of the per-fold scores.
accuracies.mean(), accuracies.std()

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV] ................................................. , total=   0.0s
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV] .

[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


(-1.1072330048963406, 0.38579898405716984)

In [14]:
# Score the tuned estimator with the notebook's `evaluate` helper.
# NOTE(review): this scores the training split (X_train, y_train), whereas the
# matching cells for the previous products pass X_test, y_test -- confirm
# which split is intended here.
evaluate(best_estimator, X_train, y_train)

Model Validation
MSE: 1.105404120154213
R^2: -0.1129939815146419


In [15]:
# Predict on the training split and show the RMSE (square root of
# mean_squared_error); y_train is reshaped to a single column first.
# NOTE(review): the matching cells for the previous products use the test split.
y_pred = best_estimator.predict(X_train)
np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_train.reshape(-1,1)))

1.0513820048651266

In [16]:
# Mean absolute error on the training split, reusing y_pred from the previous cell.
mean_absolute_error(y_pred=y_pred, y_true=y_train.reshape(-1,1))

0.5016701006616904

In [17]:
# Persist the tuned estimator for this product id with joblib.
joblib.dump(best_estimator, './training/forecast_svr_5a9347b98cbad97074cb1890.pkl')

['./training/forecast_svr_5a9347b98cbad97074cb1890.pkl']

## Performance on the train set

In [8]:
from sklearn.svm import SVR

In [9]:
def evalForecast(model, X, y, inverse=False, scaler=None):
    '''
    Evaluate a time series forecasting model and print its RMSE and MAE.
    model: fitted estimator exposing predict(X).
    X: 2-D array of input features.
    y: array of true target values, one per row of X.
    inverse: bool. When True and a scaler is given, the errors are computed
        after undoing the scaling; otherwise on the values as given.
    scaler: fitted object exposing inverse_transform(). It is applied to X
        with the target appended as the last column, so it is assumed to have
        been fitted on that same column layout -- TODO confirm against the
        code that creates the scaler.
    Returns None; the two metrics are printed, not returned.
    '''
    # Predict through the estimator's own predict() in both modes, so that
    # plain estimators (which have no `.model` attribute) work with or
    # without inverse scaling.
    ypred = np.asarray(model.predict(X))
    ytrue = np.asarray(y)

    if inverse and scaler is not None:
        def _unscale_target(target):
            # Rebuild the [features | target] layout, invert the scaling and
            # keep only the last column, i.e. the target.
            stacked = np.concatenate((X, target.reshape((-1, 1))), axis=1)
            return scaler.inverse_transform(stacked)[:, -1]

        ypred = _unscale_target(ypred)
        ytrue = _unscale_target(ytrue)

    # RMSE
    rmse = np.sqrt(mean_squared_error(y_pred=ypred, y_true=ytrue))
    # MAE
    mae = mean_absolute_error(y_pred=ypred, y_true=ytrue)

    print('Validasi RMSE: {0:.5f}'.format(rmse))
    print('Validasi MAE: {0:.5f}'.format(mae))

In [10]:
# Rebuild the daily time series for this product id with genData and split it
# again with splitDataNN (defined elsewhere in the notebook); the returned
# scaler is used by the evalForecast call further down.
# NOTE(review): percent=0.2 is presumably the held-out fraction -- confirm in splitDataNN.
product = genData('5a9347b98cbad97074cb1890', conn, c, impute=True, freq='daily')
X_train, y_train, X_test, y_test, dftrain, scaler = splitDataNN(product, percent=0.2)

In [11]:
# Reload the estimator from the joblib file for this product id.
best_estimator = joblib.load('./training/forecast_svr_5a9347b98cbad97074cb1890.pkl')

In [12]:
# Score the reloaded estimator on the training split with the notebook's
# `evaluate` helper (defined elsewhere in the notebook).
evaluate(best_estimator, X_train, y_train)

Model Validation
MSE: 1.105404120154213
R^2: -0.1129939815146419


In [13]:
# Predict on the training split and show the RMSE (square root of
# mean_squared_error); y_train is reshaped to a single column first.
y_pred = best_estimator.predict(X_train)
np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_train.reshape(-1,1)))

1.0513820048651266

In [14]:
# Mean absolute error on the training split, reusing y_pred from the previous cell.
mean_absolute_error(y_pred=y_pred, y_true=y_train.reshape(-1,1))

0.5016701006616904

In [15]:
# Training-split RMSE/MAE again, this time with inverse=True so evalForecast
# applies scaler.inverse_transform before computing the errors.
evalForecast(best_estimator, X_train, y_train, inverse=True, scaler=scaler)

Validasi RMSE: 0.42198
Validasi MAE: 0.20135
