In [None]:
from rfgap import RFGAP
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dataset import load_regression
from sklearn.model_selection import train_test_split

# Apply seaborn's default theme so the figures below share one consistent look.
sns.set_theme()

In [None]:
# Dataset to analyse: `data_name` is appended to the regression data
# directory and the resulting path is handed to the project's loader.
data_name = 'AirQuality'
x, y = load_regression('../datasets/regression/' + data_name)

# Dimensions of the feature matrix; `n` is reused when splitting row indices.
n, d = x.shape

In [None]:
def is_in_interval(y, lwr, upr):
    """Flag which values lie inside the closed interval ``[lwr, upr]``.

    Parameters
    ----------
    y : numpy array (or any object supporting element-wise comparison)
        Values to test.
    lwr, upr : same kind as ``y``, or scalars
        Lower and upper bounds, compared element-wise against ``y``.

    Returns
    -------
    Boolean mask that is True where ``lwr <= y <= upr`` (both ends inclusive).
    """
    return np.logical_and(y >= lwr, y <= upr)


def get_coverage(y, y_lwr, y_upr):
    """Return the fraction of values in ``y`` that fall inside ``[y_lwr, y_upr]``.

    Parameters
    ----------
    y : numpy array (or any object supporting element-wise comparison)
        Values to test.
    y_lwr, y_upr : same kind as ``y``, or scalars
        Lower and upper interval bounds (inclusive).

    Returns
    -------
    Mean of the boolean membership mask, i.e. a proportion in ``[0, 1]``.
    """
    # Delegate to is_in_interval so both helpers share one definition of "covered".
    return np.mean(is_in_interval(y, y_lwr, y_upr))


def get_width_stats(y_lwr, y_upr):
    """Summarise the widths ``y_upr - y_lwr`` of a set of intervals.

    Parameters
    ----------
    y_lwr, y_upr : numpy arrays (or objects supporting element-wise subtraction)
        Lower and upper interval bounds.

    Returns
    -------
    tuple
        ``(mean, sd, min, q1, q2, q3, max)`` of the interval widths, where
        ``sd`` is ``np.std`` with its default settings and ``q1``/``q2``/``q3``
        are the 0.25, 0.50 and 0.75 quantiles.
    """
    widths = y_upr - y_lwr

    # Local names deliberately avoid `min` / `max` so the Python builtins
    # are not shadowed inside this function.
    width_mean = np.mean(widths)
    width_sd = np.std(widths)
    width_min = np.min(widths)
    # One call yields all three quartiles.
    q1, q2, q3 = np.quantile(widths, [0.25, 0.50, 0.75])
    width_max = np.max(widths)

    return width_mean, width_sd, width_min, q1, q2, q3, width_max


In [None]:
# Hold out 20% of the rows for testing. The row positions are split alongside
# x and y so every train/test sample can be traced back to the full dataset.
x_train, x_test, y_train, y_test, inds_train, inds_test = train_test_split(
    x,
    y,
    np.arange(n),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest on all data for PHATE embedding visualization
rfphate = RFGAP(
    oob_score=True,
    non_zero_diagonal=True,
    random_state=42,
    y=y,
)
rfphate.fit(x, y)

# Proximities taken from the forest fitted on the full dataset.
prox_phate = rfphate.get_proximities()

In [None]:
# Random forest for uncertainty measures; using split datasets
# NOTE(review): the constructor is given the full `y` while fitting uses
# `y_train` — confirm the constructor argument does not expose test targets.
rf = RFGAP(
    oob_score=True,
    random_state=42,
    y=y,
)
rf.fit(x_train, y_train)
# prox = rf.get_proximities()

In [None]:
# Three results for the held-out rows, unpacked as the prediction and its
# lower / upper bounds.
# NOTE(review): `level = 0.90` is presumably the nominal interval level and
# `n_neighbors = 'all'` presumably uses every available neighbour — confirm
# against the RFGAP documentation.
y_pred_test, y_pred_lwr_test, y_pred_upr_test = rf.predict_interval(
    X_test=x_test,
    n_neighbors='all',
    level=0.90,
)

In [None]:
# True where the observed test value lies within its interval bounds.
is_covered = is_in_interval(y_test, y_pred_lwr_test, y_pred_upr_test)

# Evenly spaced grid over the observed test range, one point per test row.
y_grid = np.linspace(np.min(y_test), np.max(y_test), len(y_test))

# Ordering of the test rows by increasing predicted value.
sort_idx = y_pred_test.argsort()

In [None]:
# Predicted (x-axis) against observed (y-axis) test targets.
sns.scatterplot(x=y_pred_test, y=y_test)

# Shaded band between the lower and upper bounds, ordered by predicted value.
plt.fill_between(
    np.sort(y_pred_test),
    y_pred_lwr_test[sort_idx],
    y_pred_upr_test[sort_idx],
    color='skyblue',
    alpha=0.4,
)

# Identity line over the observed range: points on it are predicted exactly.
plt.plot(y_grid, y_grid, color='darkblue')

# Re-draw in red the test points whose observed value falls outside its interval.
plt.scatter(y_pred_test[~is_covered], y_test[~is_covered], color='red')

In [None]:
# Test predictions in increasing order, plotted against their rank.
plt.plot(y_pred_test[sort_idx], label='Prediction')

# Add intervals: band between the lower and upper bounds in the same order.
# NOTE(review): these bounds come from `predict_interval`, so "Prediction
# Interval" may be the more accurate label — confirm before renaming.
plt.fill_between(
    range(len(y_test)),
    y_pred_lwr_test[sort_idx],
    y_pred_upr_test[sort_idx],
    color='skyblue',
    alpha=0.4,
    label='Confidence Interval',
)

# The labels above are only rendered once a legend is requested.
plt.legend();