In [93]:
%load_ext autoreload
%autoreload 2

import warnings
import collections
import time
import datetime

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import joblib as jl
from sklearn import datasets
import pandas as pd

from sklearn.linear_model import ARDRegression
from src.ulnml.least_square_regression import RidgeULNML
from src.gridsearch.least_square_regression import RidgeRandomSearch, RidgeCVProb as RidgeCV, LassoCVProb as LassoCV
from src.util.plotting import rmse, logloss, run_and_plot

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
# On-disk joblib cache shared by the dataset loaders defined below.
# The directory is passed positionally on purpose: joblib 0.12 renamed the
# keyword from `cachedir` to `location` (deprecating `cachedir`), but the cache
# directory is the first positional parameter under both spellings.
memory = jl.Memory("tmp-real", verbose=False)
# Uncomment to wipe the cache and force every cached function to recompute:
# memory.clear(warn=False)

In [90]:
def load_million_songs_dataset_proper():
    """Return the Million Song Dataset as ``(X_train, y_train, X_test, y_test)``.

    The first 463,715 rows returned by ``load_million_songs_dataset`` form the
    training set and the 51,630 rows that follow them form the test set, so
    the two sets never share a row.
    """
    X, y = load_million_songs_dataset()
    train = 463715
    test = 51630
    X_train, y_train = X[:train], y[:train]
    # The test rows start where the training rows end. Slicing from `test`
    # instead would begin at row 51,630 and overlap the training rows.
    X_test, y_test = X[train:train + test], y[train:train + test]
    return X_train, y_train, X_test, y_test

@memory.cache
def load_residential_building_dataset_original():
    """Read the Residential Building spreadsheet and split off its last two columns.

    The sheet is parsed with a two-row header and a four-column index.

    Returns:
        A pair ``(features, targets)``: ``features`` holds every value column
        except the last two, ``targets`` holds the last two value columns.
    """
    sheet = pd.read_excel(
        "asset/Residential-Building-Data-Set.xlsx",
        header=[0, 1],
        index_col=[0, 1, 2, 3],
    )
    values = sheet.values
    features = values[:, :-2]
    targets = values[:, -2:]
    return features, targets


@memory.cache
def load_million_songs_dataset():
    """Read the headerless YearPredictionMSD text file as ``(X, y)``.

    The first CSV column is used as the frame index and returned as the
    target array ``y``; the remaining columns are returned as ``X``.
    """
    table = pd.read_csv(
        "asset/YearPredictionMSD.txt",
        index_col=0,
        header=None,
    )
    targets = np.array(table.index)
    features = table.values
    return features, targets

def load_residential_building_dataset0():
    """Return the Residential Building features paired with target column 0."""
    features, targets = load_residential_building_dataset_original()
    first_target = targets[:, 0]
    return features, first_target

def load_residential_building_dataset1():
    """Return the Residential Building features paired with target column 1."""
    features, targets = load_residential_building_dataset_original()
    second_target = targets[:, 1]
    return features, second_target

def load_boston():
    """Return scikit-learn's Boston dataset as an ``(X, y)`` pair.

    NOTE(review): ``datasets.load_boston`` was removed in scikit-learn 1.2 —
    confirm the environment pins a version that still provides it.
    """
    features, target = datasets.load_boston(return_X_y=True)
    return features, target

def load_diabetes():
    """Return scikit-learn's diabetes dataset as an ``(X, y)`` pair."""
    features, target = datasets.load_diabetes(return_X_y=True)
    return features, target


In [91]:
def get_loaders():
    """Return an ordered mapping from dataset label to a cached loader.

    Each loader is wrapped with ``memory.cache`` and, when called with no
    arguments, returns an ``(X, y)`` pair.

    The Residential Building data has two target columns, so it is exposed
    as two entries ("RBD0" and "RBD1"), one per target. The name
    ``load_residential_building_dataset`` is not defined in this notebook;
    referencing it raised NameError on a fresh kernel.
    """
    loaders = collections.OrderedDict()
    loaders["MSD"] = memory.cache(load_million_songs_dataset)
    loaders["RBD0"] = memory.cache(load_residential_building_dataset0)
    loaders["RBD1"] = memory.cache(load_residential_building_dataset1)
    loaders["Boston"] = memory.cache(load_boston)
    loaders["Diabetes"] = memory.cache(load_diabetes)
    return loaders

def get_methods():
    """Build a fresh, ordered mapping from method label to unfitted estimator.

    The cross-validated ridge and lasso estimators share one grid of 20
    regularisation strengths spaced logarithmically between 1e-4 and 1, and
    every cross-validating estimator uses 5 folds.
    """
    n_folds = 5
    alpha_grid = 10.0 ** np.linspace(-4, 0, 20)
    estimators = [
        ("uLNML", RidgeULNML(fit_intercept=False, n_iter=10000)),
        ("Ridge+CV", RidgeCV(alphas=alpha_grid, cv=n_folds)),
        ("Lasso+CV", LassoCV(alphas=alpha_grid, cv=n_folds)),
        ("RVM", ARDRegression()),
        (
            "RandomSearch",
            RidgeRandomSearch(
                lam_min=1e-4,
                lam_max=1,
                num_grid=20,
                num_cv=n_folds,
                random_state=42,
            ),
        ),
    ]
    return collections.OrderedDict(estimators)

def get_print_diff():
    """Create a callback that reports elapsed wall-clock time between calls.

    Each call of the returned function prints the whole seconds elapsed since
    the previous call (or since creation, for the first call) and then
    restarts the clock.
    """
    last_mark = time.time()

    def print_diff():
        nonlocal last_mark
        elapsed_seconds = int(time.time() - last_mark)
        # Restart the clock before printing so the next interval begins here.
        last_mark = time.time()
        elapsed = datetime.timedelta(seconds=elapsed_seconds)
        print(" ... (took {}).".format(elapsed))

    return print_diff

def get_cost_table(cost_fn, num_train=100):
    """Fit every method on every dataset and tabulate the resulting test cost.

    Args:
        cost_fn: Callable ``cost_fn(fitted_method, X_test, y_test)`` whose
            return value fills one table cell.
        num_train: Number of leading training rows used for fitting.

    Returns:
        A ``pandas.DataFrame`` indexed by dataset label ("Data") with one
        column per method label.

    Raises:
        ValueError: If a loader returns neither 2 nor 4 items.

    A loader may return either ``(X_train, y_train, X_test, y_test)`` or a
    plain ``(X, y)`` pair. For a pair, the first ``num_train`` rows are used
    for training and all remaining rows are held out for testing (the test
    set is empty when the dataset has at most ``num_train`` rows).
    """
    print_diff = get_print_diff()
    table = []
    index = "Data"
    for key_data, loader in get_loaders().items():
        print("loading {}".format(key_data))
        loaded = loader()
        if len(loaded) == 4:
            X_train, y_train, X_test, y_test = loaded
        elif len(loaded) == 2:
            # No predefined split: hold out everything after the training rows.
            X_train, y_train = loaded
            X_test, y_test = X_train[num_train:], y_train[num_train:]
        else:
            raise ValueError(
                "loader for {!r} returned {} items; expected 2 or 4".format(
                    key_data, len(loaded)
                )
            )
        X_train, y_train = X_train[:num_train], y_train[:num_train]
        print_diff()
        row = collections.OrderedDict()
        row[index] = key_data
        # get_methods() is called per dataset so each gets its own estimators.
        for key_method, method in get_methods().items():
            print("processing {} .. {}".format(key_data, key_method))
            method.fit(X_train, y_train)
            row[key_method] = cost_fn(method, X_test, y_test)
            print_diff()
        table.append(row)
    df = pd.DataFrame.from_records(table, index=index)
    return df

In [None]:
# Load the full Million Song Dataset through the cached loader and pass it to
# run_and_plot together with the string "Test Log-loss" (presumably a plot
# label — confirm in src.util.plotting) and the logloss cost function.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# saved run was stopped before it finished.
X, y = load_million_songs_dataset()
run_and_plot(X, y, "Test Log-loss", logloss)

KeyboardInterrupt: 