# Imports

In [10]:
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm

#!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo, list_available_datasets

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from data import Dataset

# Matplotlib rcParams overrides shared by every figure in this notebook.
myparams = {}

# Text rendering: typeset labels through LaTeX, with amsfonts available.
myparams['text.usetex'] = True
myparams['text.latex.preamble'] = r'\usepackage{amsfonts}'

# NOTE(review): 'Djvu Serif' does not look like a standard font family name
# (possibly meant 'DejaVu Serif') — confirm; kept verbatim here.
myparams['font.family'] = 'Djvu Serif'
myparams['font.size'] = 16

# Axes and line appearance: faint grid, thicker lines.
myparams['axes.grid'] = True
myparams['grid.alpha'] = 0.1
myparams['lines.linewidth'] = 2
plt.rcParams.update(myparams)

%config InlineBackend.figure_format = "retina"
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
from utils import (
    save_object,
    load_object
)

# Regression

In [12]:
# Regression datasets used in the experiment, as (id, display name) pairs.
# The ids are the values passed to `fetch_ucirepo(id=...)`; insertion order
# determines the order in which the datasets are processed.
datasets_regression = dict([
    (1, 'Abalone'),
    (9, 'Auto MPG'),
    (10, 'Automobile'),
    (60, 'Liver Disorders'),
    (87, 'Servo'),
    (162, 'Forest Fires'),
    (186, 'Wine Quality'),
    (242, 'Energy Efficiency'),
    (320, 'Student Performance'),
    (368, 'Facebook Metrics'),
    (477, 'Real Estate Valuation'),
    (519, 'Heart Failure Clinical Records'),
    (565, 'Bone marrow transplant: children'),
])

In [9]:
# For every dataset in `datasets_regression`: fit a linear-regression pipeline on
# B resamples at each of a range of sample sizes, score each fit on the full
# dataset with MSE, and record the mean and variance of that loss per sample size.
# Results are keyed by the dataset name reported in the fetched metadata.
results_regression = {}

# Loop-invariant settings: number of resamples per sample size and the loss.
B = 100
loss = mean_squared_error

for key in tqdm(datasets_regression.keys()):

    # ==========================
    data = fetch_ucirepo(id=key)
    # ==========================

    # Variable table: one row per column with its name, role and declared type.
    variables = data.variables[['name', 'role', 'type']]
    # Only the first variable marked as 'Target' is used as the regression target.
    target = variables[variables.role == 'Target'].name.values[0]
    features = variables[variables.role == 'Feature'][['name', 'type']]
    num_columns = features.loc[features.type.isin(['Continuous', 'Integer'])].name.values
    cat_columns = features.loc[features.type.isin(['Categorical', 'Binary'])].name.values

    # Scale numeric features and one-hot encode categorical ones. Columns not
    # listed in either group are dropped (ColumnTransformer's default remainder).
    # `handle_unknown='ignore'` lets prediction proceed when a small resample
    # did not contain every category.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns)
        ]
    )

    model = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('linreg', LinearRegression())
        ]
    )

    # Drop incomplete rows unconditionally. The previous guard tested
    # `data.metadata.has_missing_values` for truthiness, but ucimlrepo appears to
    # report that field as the string 'yes'/'no' (TODO confirm for the installed
    # version), so the guard always passed; on complete data dropna is a no-op.
    df = data.data.original.dropna(ignore_index=True)

    X = df.drop(columns=[target])
    y = df[target].to_numpy().flatten()

    # Project-local wrapper; presumably `sample(k)` draws k rows of (X, y) —
    # see data.Dataset for the exact sampling scheme.
    dataset = Dataset(X, y)

    # m rows, n non-target columns. Sample sizes are np.linspace's default 50
    # evenly spaced values from n + 1 up to m, truncated to integers.
    m, n = X.shape
    sample_sizes = np.linspace(n+1, m, dtype=int)

    means = []
    variances = []

    for k in sample_sizes:
        losses = np.empty(B)
        for b in range(B):
            X_k, y_k = dataset.sample(k)
            model.fit(X_k, y_k)
            # The loss is evaluated on the full dataset, not only on the resample.
            y_pred = model.predict(X)
            losses[b] = loss(y, y_pred)
        means.append(losses.mean())
        variances.append(losses.var())

    means = np.array(means)
    variances = np.array(variances)

    results_regression[data.metadata.name] = {
        'sample_sizes': sample_sizes,
        'means': means,
        'variances': variances,
    }

  0%|          | 0/13 [00:02<?, ?it/s]


NameError: name 'Dataset' is not defined

In [None]:
# Persist the per-dataset results via the project helper `save_object`
# (the target path suggests a pickle file).
results_regression_path = "plots/datasets_regression_new.pkl"
save_object(results_regression, results_regression_path)