In [33]:
%matplotlib inline

import sys, os
sys.path.append(os.path.abspath(os.path.join('..', 'src/')))
from sgpi.util import get_dataset, get_xy, get_r_squared, split, to_data_matrix, get_Phi

from sgpi import model
from sgpi.learner import SGRegressionLearner
import sqlalchemy as sa
from sqlalchemy import orm, func, or_, and_

from sgpi.plot import *
from scipy.linalg import svd

import pandas as pd
import pysgpp as sg; sg.omp_set_num_threads(4)

In [2]:
def duplicate_estimator(db_res):
    """Recreate an unfitted learner and its data from a stored result.

    Parameters
    ----------
    db_res : result record exposing ``grid_config``, ``regularization_config``,
        ``solver_config``, ``final_solver_config``, ``adaptivity_config`` and
        ``experiment.dataset``.

    Returns
    -------
    (X, y, learner) : the feature/target pair produced by ``get_xy`` for the
        first part returned by ``split`` (presumably the training portion --
        confirm against ``sgpi.util.split``), and a fresh
        ``SGRegressionLearner`` built from the stored configurations.
    """
    stored_configs = (
        db_res.grid_config,
        db_res.regularization_config,
        db_res.solver_config,
        db_res.final_solver_config,
        db_res.adaptivity_config,
    )
    learner = SGRegressionLearner(*stored_configs)

    full_df = get_dataset(db_res.experiment.dataset)
    # Only the first part of the split is used; the second is discarded.
    first_part, _discarded = split(full_df)
    X, y = get_xy(first_part)
    return X, y, learner

In [7]:
def get_df_lasso(weights, grid, X):
    """Degrees of freedom of a lasso fit: rank of the active columns of Phi.

    Parameters
    ----------
    weights : array-like with a ``nonzero()`` method; the first index array it
        returns selects the active (non-zero) columns.
    grid : grid object forwarded to ``get_Phi``.
    X : data forwarded to ``get_Phi``.

    Returns
    -------
    The value of ``np.linalg.matrix_rank`` for the matrix returned by
    ``get_Phi(grid, X, svd=False)`` restricted to the active columns.
    """
    nonzero_columns = weights.nonzero()[0]
    design_matrix = get_Phi(grid, X, svd=False)
    return np.linalg.matrix_rank(design_matrix[:, nonzero_columns])

def get_df_ridge(singular_values, lambda_reg):
    """Effective degrees of freedom of a ridge fit.

    Computes ``sum(s**2 / (s**2 + lambda_reg))`` over ``singular_values``,
    accumulating left to right starting from ``0.0``.

    Parameters
    ----------
    singular_values : iterable of numbers.
    lambda_reg : regularization strength added to each squared value.

    Returns
    -------
    The accumulated sum; ``0.0`` for an empty iterable.
    """
    total = 0.0
    for value in singular_values:
        squared = value ** 2
        total = total + squared / (squared + lambda_reg)
    return total

def get_AIC(train_mse, num_data, df):
    """Akaike-style criterion computed as ``2*df + num_data * log(train_mse)``.

    Parameters
    ----------
    train_mse : mean squared error on the training data (must be positive for
        the logarithm to be finite).
    num_data : number of training samples.
    df : degrees of freedom of the fitted model.
    """
    complexity_term = 2 * df
    fit_term = num_data * np.log(train_mse)
    return complexity_term + fit_term

In [4]:
ses = model.make_session()

# Short aliases for the pysgpp regularization type constants.
ridge_t = sg.RegularizationType_Identity
diag_t = sg.RegularizationType_Diagonal
lasso_t = sg.RegularizationType_Lasso
en_t = sg.RegularizationType_ElasticNet

# Ridge: identity regularization, or diagonal regularization with exponent_base == 1.0.
crit_ridge = or_(
    model.RegularizationConfig.type == ridge_t,
    and_(model.RegularizationConfig.type == diag_t,
         model.RegularizationConfig.exponent_base == 1.0))

# Lasso: lasso regularization, or elastic net with l1_ratio == 1.0.
crit_lasso = or_(
    model.RegularizationConfig.type == lasso_t,
    and_(model.RegularizationConfig.type == en_t,
         model.RegularizationConfig.l1_ratio == 1.0))


def best_res(crit):
    """Query (Result, min validation MSE) per grid level for the 'concrete' dataset.

    Restricts to results matching ``crit`` with ``GridConfig.T == 0.0`` and
    ``GridConfig.level >= 4``, groups by grid level and orders by validation MSE.

    NOTE(review): selecting the full Result next to min() under GROUP BY relies
    on the backend returning the row that attains the minimum (e.g. SQLite's
    bare-column rule) -- confirm for the database in use.
    """
    query = ses.query(model.Result, func.min_(model.Result.validation_mse))
    query = query.join(model.Experiment, model.RegularizationConfig, model.GridConfig)
    query = query.filter(model.Experiment.dataset == 'concrete')
    query = query.filter(crit)
    query = query.filter(model.GridConfig.T == 0.0)
    query = query.filter(model.GridConfig.level >= 4)
    query = query.group_by(model.GridConfig.level)
    return query.order_by(model.Result.validation_mse)


best_ridge = best_res(crit_ridge)
best_lasso = best_res(crit_lasso)

best_ridge.count(), best_lasso.count()

(2, 2)

In [5]:
for res, err in best_ridge:
    res.regularization_config.l1_ratio = 0.0
    X, y, estimator = duplicate_estimator(res)
    estimator.fit(X,y)
    grid = estimator._learner.getGrid()
    svd = get_Phi(grid,X, svd=True)
    mse = -estimator.score(X,y)
    df = get_df_ridge(svd, res.regularization_config.lambda_reg)
    aic = get_AIC(mse, X.shape[0], df)
    print df, aic, res.train_mse, res.validation_mse

716.696498942 2796.22239282 5.22741775804 22.1747751191
558.032312601 2991.22306366 9.73377312072 25.0739124998


In [8]:
for res, err in best_lasso:
    if res.regularization_config.l1_ratio is None:
        res.regularization_config.l1_ratio = 1.0
    X, y, estimator = duplicate_estimator(res)
    estimator.fit(X,y)
    grid = estimator._learner.getGrid()
    weights = estimator.get_weights()
    df = get_df_lasso(weights, grid, X)
    mse = -estimator.score(X,y)
    aic = get_AIC(mse, X.shape[0], df)
    print df, aic, res.train_mse, res.validation_mse

754 2997.93500472 6.09931123154 20.9901331732
518 2797.09469557 8.47610322823 23.8522703997


In [25]:
for res, err in best_ridge:
    le = str(res.grid_config.level)
    m = "Ridge"
    l = "{:1.2e}".format(res.regularization_config.lambda_reg)
    g = str(int(res.train_grid_points))
    m_tr = "{:2.3f}".format(np.sqrt(res.train_mse))
    m_te = "{:2.3f}".format(np.sqrt(res.test_mse))
    print le + " & " + m + " & " + l + " & " + g + " & df & " + m_tr + " & aic & " + m_te + r" \\"

5 & Ridge & 1.96e-02 & 6650 & df & 2.286 & aic & 4.184 \\
4 & Ridge & 1.84e-02 & 1470 & df & 3.120 & aic & 4.198 \\


In [27]:
for res, err in best_lasso:
    le = str(res.grid_config.level)
    m = "Lasso"
    l = "{:1.2e}".format(res.regularization_config.lambda_reg)
    g = str(int(res.train_grid_points))
    m_tr = "{:2.3f}".format(np.sqrt(res.train_mse))
    m_te = "{:2.3f}".format(np.sqrt(res.test_mse))
    print le + " & " + m + " & " + l + " & " + g + " & df & " + m_tr + " & aic & " + m_te + r" \\"

5 & Lasso & 1.00e-02 & 6632 & df & 2.470 & aic & 3.737 \\
4 & Lasso & 1.00e-02 & 1382 & df & 2.911 & aic & 3.850 \\


In [52]:
for res, mse in best_ridge:
    reg_type = "Ridge"
    print "{} & {} & {:1.0e} & {:2.3f} & {} & {:2.3f} & {:2.3f}\\\\".\
        format(res.grid_config.level, reg_type, res.regularization_config.lambda_reg,
                np.sqrt(res.validation_mse), int(res.train_grid_points), np.sqrt(res.train_mse), np.sqrt(res.test_mse))

5 & Ridge & 2e-02 & 4.709 & 6650 & 2.286 & 4.184\\
4 & Ridge & 2e-02 & 5.007 & 1470 & 3.120 & 4.198\\


In [50]:
# Square root of a hand-typed constant. NOTE(review): 20.99 presumably is the
# rounded validation MSE of the best lasso result printed earlier, converted
# to an RMSE -- confirm, and prefer reading it from the result object.
np.sqrt(20.99)

4.5814844755821227