In [99]:
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from matplotlib import pyplot as plt
from sklearn import linear_model
from numpy import arange
from sklearn.linear_model import LassoCV
import pandas as pd
import numpy as np

In [100]:
def read_data(fname):
    data = pd.read_csv(fname)
    x_data = np.array([ row[0:-1] for row in data.values])
    y_data = np.array([ row[-1] for row in data.values])
    return x_data, y_data

def standardize(data):
    d_mean = np.mean(data)
    d_stddev = np.std(data, ddof=1)
    if d_stddev == 0:
        return data
    return np.array([ (d-d_mean)/d_stddev for d in data ])

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
def month_indicator(i, month):
    midx = np.where(months == month)
    return 1 if i == midx else 0

def data_design_mat(data):
    design = []
    for row in data:
        numerical = row[0:-1]
        month = row[-1]
        drow = list(numerical)
        for i in range(12):
            drow.append(month_indicator(i, month))
        design.append(standardize(drow))
    return np.array(design)

def mse(y, y_hat):
    n = y.shape[0]
    assert(n == y_hat.shape[0])
    assert(len(y.shape) == 1)
    assert(len(y_hat.shape) == 1)
    return sum([ (yi-yi_hat)**2 for (yi,yi_hat) in zip(y,y_hat) ]) / n

def validate(y_data, design, model):
    y_hats = np.array([ model @ xi for xi in design ])
    return mse(y_data, y_hats)

def float_eq(a,b):
    return abs(a-b) < 0.001

def test_file(folder, file):
    X, y = read_data('data/{}/{}'.format(folder,file))
    X = data_design_mat(X)

    eps = 5e-5 
    lambdas, coeffs, _= linear_model.lasso_path(X, y, eps=5e-05, fit_intercept=False)
    coeffs = coeffs.T
    #for i in reversed(range(lambdas.shape[0])):
    #    if all([float_eq(c,0.) for c in coeffs[i]]):
    #        coeffs = np.delete(coeffs, [i], axis=0)
    #        lambdas = np.delete(lambdas, [i], axis=0)

    # we give the library impl's the benefit of the doubt and choose the lowest TEST mse
    X_test, y_test = read_data('data/{}/test.csv'.format(folder))
    X_test = data_design_mat(X_test)
    mses = []
    for model in coeffs:
        mses.append(validate(y_test, X_test, model))

    best_idx = np.argmin(mses)
    print("{}/{} test".format(folder,file))
    print("best mse: {}".format(mses[best_idx]))
    #print("best model: {}".format(coeffs[best_idx]))

    best_model = coeffs[best_idx]
    best_params = []
    for i,c in enumerate(best_model):
        if not float_eq(c,0):
            best_params.append(i)
    print("best_params: {}".format(best_params))
    print()

In [101]:
folder = 'ortho'
test_file(folder, 'n_lt_p.csv')
test_file(folder, 'n_eq_p.csv')
test_file(folder, 'n_gt_p.csv')

ortho/n_lt_p.csv test
best mse: 8.149840460199352
best_params: [1, 4, 5, 8, 9, 11]

ortho/n_eq_p.csv test
best mse: 6.933299639115785
best_params: [1, 2, 6, 8, 14]

ortho/n_gt_p.csv test
best mse: 7.554553029571514
best_params: [1, 4, 7, 8, 13, 20]



In [102]:
folder = 'corr'
test_file(folder, 'n_lt_p.csv')
test_file(folder, 'n_eq_p.csv')
test_file(folder, 'n_gt_p.csv')

corr/n_lt_p.csv test
best mse: 1.4160738183074826
best_params: []

corr/n_eq_p.csv test
best mse: 1.4160346628567504
best_params: [14]

corr/n_gt_p.csv test
best mse: 1.4160738183074826
best_params: []

