In [341]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic, WhiteKernel, ExpSineSquared, ConstantKernel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from joblib import dump, load
import os

In [105]:
MODEL_PATH = 'models'

In [106]:
SEASON = 'season'

LABEL = 'price_CHF'

# Data

In [107]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [108]:
train_df = train_df[train_df[LABEL].notna()]

In [109]:
X = train_df.loc[:, train_df.columns != LABEL]
Y = train_df[LABEL]

X_sub = test_df

# Preprocessing

In [110]:
def preprocess(X: pd.DataFrame) -> np.ndarray:
  X[SEASON] = X[SEASON].astype('category').cat.codes
  X.fillna(X.mean(axis='rows'), inplace=True)
  return X
  # return StandardScaler().fit_transform(X)

In [111]:
X = preprocess(X)
X_sub = preprocess(X_sub)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[SEASON] = X[SEASON].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.mean(axis='rows'), inplace=True)


In [112]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Training

In [410]:
param_grid = [
    # {
    #     "alpha":  [1, 1e-1, 1e-2, 1e-3],
    #     "kernel": [RBF(l) for l in np.logspace(-2, -1, 3)]
    # },
    # {
    #     "alpha":  [1e-1, 1e-2, 1e-3],
    #     "kernel": [Matern(l) for l in np.logspace(-2, -1, 3)]
    # },
    # {
    #     "alpha":  [1e-1, 1e-2],
    #     "kernel": [Matern(l) * ExpSineSquared() for l in np.logspace(-2, -1, 3)]
    # },
    {
        "alpha":  [0.1],
        # "kernel": [Matern(0.4) * ExpSineSquared(length_scale=1, periodicity=1.5)]
        "kernel": [Matern(0.4, nu=1.5) * ExpSineSquared(length_scale=1, periodicity=1.5) + ConstantKernel(10) * Matern(1000)]
    }
]

In [411]:
gp = GaussianProcessRegressor()
regr = GridSearchCV(estimator=gp, param_grid=param_grid, cv=4, scoring="r2")
regr.fit(X, Y)
print(regr.best_params_)

{'alpha': 0.1, 'kernel': Matern(length_scale=0.4, nu=1.5) * ExpSineSquared(length_scale=1, periodicity=1.5) + 3.16**2 * Matern(length_scale=1e+03, nu=1.5) * ExpSineSquared(length_scale=100, periodicity=150)}


In [412]:
np.max(regr.cv_results_["mean_test_score"])

0.20520468321127594

In [397]:
def train_regr(X_train: np.ndarray, Y_train: pd.DataFrame, version='', force_new=True) -> SVR:
  file_name = f'{MODEL_PATH}/{version}.joblib'

  if not force_new and os.path.isfile(file_name):
    return load(file_name)
  else:

    # regr = SVR(epsilon=0.5)
    regr = GaussianProcessRegressor(kernel=Matern(0.4, nu=1.5) * ExpSineSquared(length_scale=1, periodicity=1.5) + ConstantKernel(10) * Matern(1000), alpha=0.1, random_state=0)
    regr.fit(X_train, Y_train)

    print(regr.score(X_train, Y_train))

    dump(regr, file_name)

    return regr

In [398]:
train_model = train_regr(X_train, Y_train, 'train')

0.999999462018581


In [399]:
train_model

# Prepare Submission

In [400]:
train_regr(X, Y, 'sub')

0.9967935844188958


In [401]:
def prepare_regr(X_test: np.ndarray, version='') -> pd.DataFrame:
  regr = load(f'{MODEL_PATH}/{version}.joblib')
  y_pred = regr.predict(X_test)
  return pd.DataFrame(y_pred, columns=[LABEL])

In [402]:
y_pred = prepare_regr(X_sub, 'sub')

In [403]:
X_sub

Unnamed: 0,season,price_AUS,price_CZE,price_GER,price_ESP,price_FRA,price_UK,price_ITA,price_POL,price_SVK
0,1,-1.331262,0.472985,0.707957,-3.924473,-1.136441,-0.596703,-3.163394,3.298693,1.921886
1,2,-1.184837,0.358019,-0.951635,-3.199028,-1.069695,-2.190843,-1.420091,3.238307,-1.941443
2,0,-1.116459,-0.997273,0.780460,-3.338948,-1.053149,-0.586339,-3.163394,3.207398,2.020570
3,3,-1.331262,0.353066,0.833429,-3.924473,-1.322626,-0.628873,-1.304240,3.159858,-1.941443
4,1,-1.331262,0.269644,-0.951635,-3.245495,-1.362051,-0.717914,-1.341538,3.205007,-1.941443
...,...,...,...,...,...,...,...,...,...,...
95,3,-2.030894,-0.997273,-1.332104,-3.838154,-3.322586,-2.319565,-3.163394,-2.134084,-2.880557
96,1,-1.817763,-0.997273,-0.951635,-3.995247,-3.760752,-2.190843,-3.903510,-2.269992,-3.007311
97,2,-1.331262,-0.997273,-1.140538,-4.084448,-3.968988,-2.555133,-4.058773,-2.146487,-2.816678
98,0,-1.331262,-1.029762,-0.951635,-4.216490,-3.705548,-2.487751,-3.745480,-0.767164,-2.849707


In [404]:
y_pred.to_csv('submission.csv', index=False, float_format='%.3f')

# Analysis

In [54]:
import sklearn.metrics as metrics

In [55]:
def evaluate_regr(Y_test: pd.DataFrame, y_pred: np.ndarray) -> float:
  return metrics.r2_score(Y_test, y_pred)

In [56]:
y_pred = prepare_regr(X_test, 'train')

In [57]:
y_pred, Y_test

(     price_CHF
 0     2.976842
 1     8.287131
 2     8.443576
 3    -0.578157
 4     9.221172
 ..         ...
 122   5.126854
 123   8.404327
 124   2.783058
 125   7.355990
 126   7.274661
 
 [127 rows x 1 columns],
 542    2.744384
 132    8.221076
 69     8.730717
 417   -2.483055
 153    9.243569
          ...   
 737    5.214904
 381    8.099139
 602    2.741465
 677    7.697540
 755    8.129605
 Name: price_CHF, Length: 127, dtype: float64)

In [58]:
evaluate_regr(Y_test, y_pred)

0.944863758350182