In [15]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from pathlib import Path

from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.metrics import *

In [16]:
# Load the four pre-processed channel pickles, each wrapped in a DataFrame.
# NOTE(review): unpickling runs arbitrary code from the file; only load
# pickles that come from a trusted source.
df_Ch001, df_Ch009, df_Ch020, df_Ch022 = (
    pd.DataFrame(pd.read_pickle(Path(pickle_file)))
    for pickle_file in (
        "C:/Users/Victor/Desktop/PIR/process/Ch001.pkl",
        "C:/Users/Victor/Desktop/PIR/process/Ch009.pkl",
        "C:/Users/Victor/Desktop/PIR/process/Ch020.pkl",
        "C:/Users/Victor/Desktop/PIR/process/Ch022.pkl",
    )
)

In [17]:
from model_util import data_scale
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Scaling configuration shared by every call to data_scale below.
LOG_SCALE = True
scaler = MinMaxScaler()

# Input channels, scaled one after another in list order with the shared scaler.
df_input = [df_Ch001, df_Ch009, df_Ch020]
df_input_scaled = [data_scale(frame, scaler, LOG_SCALE) for frame in df_input]

# The target channel is scaled last; with output=True the helper returns a
# (scaled data, scaler) pair, and that scaler is used later to undo the scaling.
# NOTE(review): presumably data_scale takes log10 when LOG_SCALE is set and
# fits the scaler it is given - confirm against model_util.
df_output_scaled, output_scaler = data_scale(
    df_Ch022, scaler, LOG_SCALE, output=True
)

In [18]:
# Number of rows in the target channel, and how many leading rows (15 %)
# are held out as the test set.
size = df_Ch022.shape[0]
split = int(0.15 * size)

# Per-column error metrics, filled in by the training loop.
MAE = []
MSE = []
RMSE = []

# Zero-filled placeholder for the test-set predictions: one row per held-out
# sample, one column per target column.
df_pred = pd.DataFrame(
    0,
    index=df_Ch022.index[:split],
    columns=df_Ch022.columns,
)

In [19]:
# Format data: stack the scaled input frames along a third axis, so that
# data_scaled[row, column, k] is the value taken from the k-th input frame.
# (assumes data_scale returns 2-D frames of identical shape - TODO confirm)
data_scaled = np.dstack(df_input_scaled)
target_scaled = df_output_scaled.copy().values

# Start from empty metric lists so that re-running this cell does not append
# a second set of rows to the lists created in the previous cell.
MAE, MSE, RMSE = [], [], []

# Ground-truth test values in original units. This frame was never created in
# the visible notebook, so the loop failed with a NameError on a fresh kernel.
df_secure = pd.DataFrame(0.0, index=df_pred.index, columns=df_pred.columns)

# One independent SVR model per target column.
for p in range(len(df_Ch022.columns)):

    # Select data: the first `split` rows are the test set, the rest is training.
    X_train_p = data_scaled[split:, p, :]
    y_train_p = target_scaled[split:, p]
    X_test_p = data_scaled[:split, p, :]
    y_test_p = target_scaled[:split, p]

    # Grid search over C and gamma. No gamma is passed to the SVR constructor:
    # the grid sets it for every candidate, so a constructor value is never used.
    svr_grid = GridSearchCV(SVR(kernel='rbf'),
                            param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                                        "gamma": np.logspace(-2, 2, 5)})

    svr_grid.fit(X_train_p, y_train_p)
    C = svr_grid.best_params_['C']
    gamma = svr_grid.best_params_['gamma']

    # Learning curve: cross-validated score for ten increasing training sizes.
    svr_lc = SVR(kernel='rbf', C=C, gamma=gamma)
    train_sizes, train_scores_svr, test_scores_svr = learning_curve(
        svr_lc, X_train_p, y_train_p,
        train_sizes=np.linspace(0.1, 1, 10),
        scoring="neg_mean_squared_error", cv=10)

    # Scores are negated MSE, so the highest mean score is the lowest mean MSE.
    split2 = int(train_sizes[np.argmax(test_scores_svr.mean(axis=1))])
    if split2 > int(size * 0.5):
        # Retrain on only the last `split2` rows of the data set.
        X_train_p = data_scaled[size - split2:, p, :]
        y_train_p = target_scaled[size - split2:, p]

    # Fit the final model, predict the test set and compute the errors.
    svr = SVR(kernel='rbf', C=C, gamma=gamma)
    svr.fit(X_train_p, y_train_p)
    y_pred_test_p = svr.predict(X_test_p)

    # Undo the min-max scaling of this column. The bounds get their own names
    # so the builtins min() and max() are not shadowed in the kernel namespace.
    y_min = output_scaler.data_min_[p]
    y_max = output_scaler.data_max_[p]
    y_test_p = (y_max - y_min) * y_test_p + y_min
    y_pred_test_p = (y_max - y_min) * y_pred_test_p + y_min

    # Undo the log10 transform when the data were log-scaled.
    if LOG_SCALE:
        y_test_p = 10 ** y_test_p
        y_pred_test_p = 10 ** y_pred_test_p

    # RMSE is the square root of the MSE; computing it directly avoids the
    # `squared=False` keyword, which recent scikit-learn releases no longer accept.
    mse_p = mean_squared_error(y_test_p, y_pred_test_p)
    MAE.append(mean_absolute_error(y_test_p, y_pred_test_p))
    MSE.append(mse_p)
    RMSE.append(np.sqrt(mse_p))

    df_pred[df_pred.columns[p]] = y_pred_test_p
    df_secure[df_pred.columns[p]] = y_test_p

In [20]:
# Disabled check: for each column of df_Ch022, is the maximum over the first
# `split` rows greater than the maximum over the remaining rows?
# a = df_Ch022.iloc[:split].values.max(axis=0)
# b = df_Ch022.iloc[split:].values.max(axis=0)

# print(a>b)

In [21]:
# Destinations for the test-set predictions and for the error metrics.
predictions_path = Path("C:/Users/Victor/Desktop/PIR/model/saved_models/log_SVM_predict_test.pkl")
metrics_path = Path("C:/Users/Victor/Desktop/PIR/model/saved_models/log_SVM_metrics_test.pkl")

# Save the predictions first, then build and save the metrics table
# (one row per entry of the metric lists).
df_pred.to_pickle(predictions_path)
df_metrics = pd.DataFrame(
    {
        "MAE": MAE,
        "MSE": MSE,
        "RMSE": RMSE,
    }
)
df_metrics.to_pickle(metrics_path)

In [22]:
# Display the test-set error table (columns MAE, MSE, RMSE).
df_metrics

Unnamed: 0,MAE,MSE,RMSE
0,0.733081,0.845234,0.919366
1,1.243578,2.222639,1.490852
2,1.271607,2.558101,1.599407
3,2.113443,8.245485,2.871495
4,3.729374,30.52588,5.525023
5,4.781286,39.933241,6.319275
6,5.242276,46.736677,6.836423
7,5.231295,45.486176,6.744344
8,4.360381,33.776415,5.811748
9,3.088563,16.842098,4.103912
