# Polynomial Regression

In [7]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, cross_validate, RepeatedKFold
import numpy as np
from numpy import sqrt
import time
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score,r2_score
import joblib
import os
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

In [8]:
# Load the dataset; the first row of the comma-separated file holds the column names.
df = pd.read_csv('../data/df_pkill.csv', sep=',', header=0)

In [9]:
# Bin the continuous target with Doane's rule so that the train/test split
# can be stratified on a discretised version of it.
samples_per_bin, bins = np.histogram(df['pkill'], bins='doane')  # Doane's method

# Assign the bin labels as a plain array. Wrapping them in a DataFrame makes
# the assignment align on index labels, which silently produces NaN bins for
# any frame whose index is not the default 0..n-1 range.
# NOTE(review): np.digitize uses half-open intervals, so rows equal to the
# last edge (the maximum of 'pkill') get their own label, len(bins). Confirm
# that this extra stratum is intended.
df['bins'] = np.digitize(df['pkill'], bins)

# Features exclude both the target and the helper bin column.
X = df.drop(['pkill', 'bins'], axis=1)
# Target kept as a single-column DataFrame (2-D).
y = df[['pkill']]
# Stratification labels for the split.
b = df['bins']

### Train-test Split

In [10]:
# Hold out 20% of the rows for testing. The split is seeded for
# reproducibility and stratified on the target bins `b`. The target is
# passed as a NumPy array, so y_train / y_test are arrays, not DataFrames.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.values,
    test_size=0.2,
    random_state=42,
    stratify=b,
)

### Hyperparameters

In [11]:
# Polynomial degrees to evaluate; the half-open range [4, 5) contains only degree 4.
degree_list = np.arange(start=4, stop=5)

### Training Process

In [None]:
# For every polynomial degree: expand the features, fit one linear model per
# K-fold training subset, score each of them on the held-out test set, write
# the per-fold metrics (plus mean/std/sum rows) to ./results/ and persist the
# last fitted model to ./models/.
for degree in degree_list:
    print('------------------------------------------------------------------------')
    print(f'Training degree {degree}')

    # Learn the polynomial expansion on the training features only.
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)

    print(f'Degree: {degree}, Features: {X_train_poly.shape[1]}')

    metric_columns = ['fold', 'mae', 'mse', 'rmse', 'r2', 'training_time', 'inference_time']
    fold_rows = []

    # Define the K-fold cross validator. The seed makes the fold assignment,
    # and therefore the reported metrics, reproducible between runs.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    # NOTE(review): the validation indices of each fold are not used; every
    # fold model is scored on the held-out test set instead. Confirm that
    # this is the intended evaluation protocol.
    for fold_no, (train, _) in enumerate(kfold.split(X_train_poly, y_train), start=1):
        # Ordinary least squares on the expanded features.
        model = linear_model.LinearRegression(n_jobs=-1)

        print('------------------------------------------------------------------------')
        print(f'Training for fold {fold_no} ...')

        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring elapsed intervals.
        st_wall = time.perf_counter()
        model.fit(X_train_poly[train], y_train[train])
        wall_time = time.perf_counter() - st_wall

        print('Training Execution time:', wall_time, 'seconds')

        # The feature expansion of the test set stays inside the timed
        # region, as before. It uses transform() rather than fit_transform()
        # so the transformer is never re-fitted on test data.
        st_wall_inf = time.perf_counter()
        y_pred = model.predict(poly.transform(X_test))
        wall_time_inf = time.perf_counter() - st_wall_inf

        print('Inference Execution time:', wall_time_inf, 'seconds')

        # Compute the MSE once and derive the RMSE from it.
        mse = mean_squared_error(y_test, y_pred)
        fold_rows.append([
            fold_no,
            mean_absolute_error(y_test, y_pred),
            mse,
            sqrt(mse),
            r2_score(y_test, y_pred),
            wall_time,
            wall_time_inf,
        ])

    # Summary statistics are computed from the per-fold rows only, so they
    # cannot be contaminated by previously appended summary rows.
    fold_metrics = pd.DataFrame(fold_rows, columns=metric_columns)
    summary = (
        fold_metrics.drop(columns='fold')
        .agg(['mean', 'std', 'sum'])
        .rename_axis('fold')
        .reset_index()
    )
    metrics = pd.concat([fold_metrics, summary], ignore_index=True).set_index('fold')

    path_to_save = './results/'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(path_to_save, exist_ok=True)
    metrics.to_csv(f'{path_to_save}{degree}_degree.csv')

    path_to_save = './models/'
    os.makedirs(path_to_save, exist_ok=True)
    # Save model to file.
    # NOTE(review): this persists the model fitted on the last fold's
    # training subset, not one refitted on the full training set.
    joblib.dump(model, f'{path_to_save}{degree}_degree.sav')

------------------------------------------------------------------------
Training degree 4
Degree: 4, Features: 1001
------------------------------------------------------------------------
Training for fold 1 ...
