In [None]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2,mutual_info_regression
import sklearn
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

## Support functions

In [None]:
def mean_euclidean_error_func(T, O):
    """Mean Euclidean Error (MEE) between targets and outputs.

    Every row is one sample. The error of a sample is the L2 norm of
    ``t - o`` and the result is the average of those norms over all samples.

    Parameters
    ----------
    T : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground-truth targets (ndarray, nested list, DataFrame, ...).
    O : array-like with the same number of samples as ``T``
        Model outputs.

    Returns
    -------
    float
        Average per-sample Euclidean distance; ``0.0`` when there are no
        samples.

    Raises
    ------
    ValueError
        If ``T`` and ``O`` do not contain the same number of samples.
    """
    # np.asarray lets lists and DataFrames through; iterating a DataFrame
    # directly would walk over its column labels instead of its rows.
    targets = np.atleast_1d(np.asarray(T))
    outputs = np.atleast_1d(np.asarray(O))

    n_samples = targets.shape[0]
    if outputs.shape[0] != n_samples:
        raise ValueError(
            "T and O must contain the same number of samples "
            "(got %d and %d)" % (n_samples, outputs.shape[0])
        )
    if n_samples == 0:
        return 0.0

    # Flatten everything after the sample axis so 1-D inputs and
    # (n_samples, 1) columns are handled the same way.
    differences = targets.reshape(n_samples, -1) - outputs.reshape(n_samples, -1)
    return float(np.linalg.norm(differences, axis=1).mean())
# Scorer wrapper so the MEE can be handed to `scoring=`. The error is a
# "lower is better" quantity, hence greater_is_better=False.
mean_euclidean_error = make_scorer(
    mean_euclidean_error_func,
    greater_is_better=False,
)

In [None]:
def get_evaluations(y_pred, y_test):
    """Score predictions against the ground truth with four metrics.

    Mind the argument order: predictions come first, true values second.

    Returns
    -------
    tuple
        ``(r2, mse, mae, mee)`` as produced by ``r2_score``,
        ``mean_squared_error``, ``mean_absolute_error`` and
        ``mean_euclidean_error_func``, each called as ``metric(y_test, y_pred)``.
    """
    metric_functions = (
        r2_score,
        mean_squared_error,
        mean_absolute_error,
        mean_euclidean_error_func,
    )
    return tuple(metric(y_test, y_pred) for metric in metric_functions)

def print_evaluations(r2, mse, mae, mee):
    """Print the four scores, one per line, formatted with three decimals."""
    rows = (
        ('R2: %.3f', r2),
        ('MSE: %.3f', mse),
        ('MAE: %.3f', mae),
        ('MEE: %.3f', mee),
    )
    for template, score in rows:
        print(template % score)

## Partitioning

In [None]:
# Read the training CSV: no header row, the first 7 lines are skipped and
# the first column is used as the index.
train = pandas.read_csv(
    'ML-CUP22-TR.csv',
    sep=",",
    header=None,
    skipinitialspace=None,
    skiprows=7,
    index_col=0,
)

# Columns 1..9 are the inputs, columns 10 and 11 the two targets.
X = train[list(range(1, 10))]
y = train[[10, 11]]

# Shuffled 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

In [None]:
# Convert the training split to NumPy arrays.
# np.asarray accepts both DataFrames and ndarrays, and the targets are sliced
# positionally, so this cell can be re-run after X_train / y_train have
# already been rebound to arrays (DataFrame-only calls would fail then).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

# One (n_samples, 1) column per target; y_train holds the two target columns
# in order (10, then 11), i.e. positions 0 and 1.
y_train0 = y_train[:, [0]]
y_train1 = y_train[:, [1]]

In [None]:
# Convert the held-out split to NumPy arrays.
# np.asarray accepts both DataFrames and ndarrays, and the targets are sliced
# positionally, so this cell can be re-run after X_test / y_test have
# already been rebound to arrays (DataFrame-only calls would fail then).
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

# One (n_samples, 1) column per target; y_test holds the two target columns
# in order (10, then 11), i.e. positions 0 and 1.
y_test0 = y_test[:, [0]]
y_test1 = y_test[:, [1]]

## Model Selection

In [None]:
# Hyper-parameter space explored by the grid searches.
# 'minkowski' is intentionally not listed among the metrics: with the
# estimator's default p=2 it is the Euclidean distance, which is already in
# the list, so it would only re-fit identical models (a quarter of the grid).
param_grid = {
    'weights': ['distance', 'uniform'],
    'n_neighbors': list(range(1, 61)),  # k = 1 .. 60
    'algorithm': ['brute', 'ball_tree', 'kd_tree'],
    'metric': ['cityblock', 'euclidean', 'chebyshev'],
}

In [None]:
# 5-fold splitter with shuffling and a fixed seed, used as `cv` by the grid searches.
skfold_inner = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Grid search for the first target, scored with the MEE scorer
# on the shared cross-validation splitter.
grid0 = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid,
    scoring=mean_euclidean_error,
    cv=skfold_inner,
    n_jobs=4,
    error_score='raise',
)
grid0.fit(X_train, y_train0)

# Winning configuration, then its mean cross-validated score.
print(grid0.best_estimator_, grid0.best_score_, sep='\n')

In [None]:
# Grid search for the second target, scored with the MEE scorer
# on the shared cross-validation splitter.
grid1 = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid,
    scoring=mean_euclidean_error,
    cv=skfold_inner,
    n_jobs=4,
    error_score='raise',
)
grid1.fit(X_train, y_train1)

# Winning configuration, then its mean cross-validated score.
print(grid1.best_estimator_, grid1.best_score_, sep='\n')

## Model Assessment

In [None]:
# Predict the held-out inputs with the best model found by each search.
pred0, pred1 = (
    search.best_estimator_.predict(X_test) for search in (grid0, grid1)
)

In [None]:
# Held-out scores: each target on its own, then both targets stacked
# column-wise and compared against the full two-column ground truth.
reports = (
    ("First target\n", pred0, y_test0),
    ("Second target\n", pred1, y_test1),
    ("Both targets\n", np.column_stack((pred0, pred1)), y_test),
)
for position, (heading, predicted, expected) in enumerate(reports):
    if position:
        print()  # blank separator line between consecutive reports
    print(heading)
    r2, mse, mae, mee = get_evaluations(predicted, expected)
    print_evaluations(r2, mse, mae, mee)