# Underfit

Evaluation of RETRO performance for predictions from underfit models.

### Preliminaries

In [None]:
# Silence warnings (e.g. those raised by sklearn) for the whole session by
# swapping the standard library's warnings.warn for a function that does nothing.
import warnings


def warn(*args, **kwargs):
    """Accept any positional/keyword arguments and ignore them."""
    return None


warnings.warn = warn

In [None]:
import sys
sys.path.append('../')

import sklearn as sk
import matplotlib.pyplot as plt
import numpy as np

from math import sqrt

%matplotlib inline

from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error, r2_score

from src.load_data import DataLoader
from src.retro_score import RetroScore, run_retro_score
from src.evaluation import rs_at_threshold_plot, overlapping_points
from src.dimensionality_reduction import get_activations

# seed NumPy's global random number generator with a fixed value (42)
# so that repeated runs of the notebook draw the same random numbers
np.random.seed(42)

### Load data

In [None]:
# create the project's data loader (DataLoader from src.load_data)
data = DataLoader()

# select one dataset: keep exactly one of the calls below active
# and leave the others commented out
data.cyclepower()
#data.airfoil()
#data.toxicfish()
#data.abalone()
#data.autompg2()
#data.california_housing()
#data.energyefficiency()
#data.diabetes()
#data.winequality2()
#data.boston()
#data.superconductor()
#data.communities()

# randomize the order of the data
data.randomize_order()

# split into train and test with test_size=0.2
# NOTE(review): random=False presumably requests a non-random split, which
# would be fine because the order was randomized just above — confirm in DataLoader
data.split_train_test(test_size=0.2, random=False)

# scale features
# NOTE(review): the arguments (-1, 1) are presumably the target min/max range — confirm in DataLoader
data.scale_min_max(-1,1)

# fetch the train/test features and targets used by all following cells
X_train, X_test, y_train, y_test = data.get_split_data()

In [None]:
# Build the regression model: keep exactly one of the two constructors
# below active and comment the other one out.
#reg = MLPRegressor(hidden_layer_sizes=(10),max_iter=3)
reg = DecisionTreeRegressor(max_depth=1)
reg.fit(X_train, y_train)

# Predictions for both splits, stored as column vectors of shape (n, 1).
y_train_pred = reg.predict(X_train).reshape(-1, 1)
y_pred = reg.predict(X_test).reshape(-1, 1)

# Compute the performance figures first ...
rmse_test = sqrt(mean_squared_error(y_test, y_pred))
r2_test = r2_score(y_test, y_pred)
rmse_train = sqrt(mean_squared_error(y_train, y_train_pred))
r2_train = r2_score(y_train, y_train_pred)

# ... then report them: test set, blank line, train set.
print(f"{rmse_test} - RMSE test")
print(f"{r2_test} - r2 test")
print()
print(f"{rmse_train} - RMSE train")
print(f"{r2_train} - r2 train")

### Calculating the RETRO scores

#### Without reduced dimensionality
For all except the largest datasets (communities and superconductor), we calculate the RETRO score without reducing the dimensionality of the data first.

In [None]:
# RETRO score on the original feature space (no dimensionality reduction);
# the scorer is configured with k=5.
rs = RetroScore(k=5)
retro_score, unnormalized_score, nbs_x, nbs_y = run_retro_score(
    rs,
    X_train,
    y_train,
    X_test,
    y_pred,
    y_train_pred,
)

#### With reduced dimensionality

For larger datasets (superconductor and communities), we reduce the dimensionality of the data before applying the RETRO score. Only run the cell below if this applies; it overwrites the RETRO score results computed in the previous cell.

In [None]:
# Dimensionality reduction (superconductor and communities only): train an
# auxiliary MLP on the training data and use its internal activations as
# the new, lower-dimensional features.
dimreducer = MLPRegressor(hidden_layer_sizes=(500, 20, 10), max_iter=100)
dimreducer.fit(X_train, y_train)

# Take the second-to-last entry returned by get_activations for each split
# (presumably the activations of the last hidden layer — TODO confirm).
X_train_ll, X_test_ll = (
    get_activations(dimreducer, features)[-2] for features in (X_train, X_test)
)

# RETRO score on the reduced representation; predictions still come from `reg`.
rs = RetroScore(k=5)
retro_score, unnormalized_score, nbs_x, nbs_y = run_retro_score(
    rs,
    X_train_ll,
    y_train,
    X_test_ll,
    y_pred,
    y_train_pred,
)

### Evaluation

In [None]:
# plot error vs. RS and Pearson correlation coefficient
# (the unnormalized scores are passed in, hence normalized=False)
rs_at_threshold_plot(unnormalized_score, y_test, y_pred, normalized=False)

In [None]:
# Share of test points that fall into both the top 50% highest absolute errors
# and the top 50% lowest RETRO-scores (1 is best).
# y_pred is a column vector, so it is flattened before subtracting from y_test.
errors = np.abs(y_test - y_pred.ravel())
overlap = overlapping_points(unnormalized_score, errors, frac=50)
overlap