In [1]:
import pysgpp as sg
import pandas as pd
from math import sqrt
from collections import namedtuple
from operator import itemgetter
from sklearn.externals.joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.cross_validation import ShuffleSplit
from sklearn.grid_search import ParameterGrid
import itertools
from copy import copy
import numpy as np
import sys
sys.path.append("../src/")
from sgpi.learner import SGRegressionLearner, DataVector
from sgpi import model
from sgpi.util import get_dataset, get_xy, split

In [2]:
# Load the 'diag_test_low_noise' dataset; only the first part returned by
# split() is used here, the second part is discarded.
df = get_dataset('diag_test_low_noise')
train, _ = split(df)
X, y = get_xy(train)

# Baseline linear model (fitted and scored in the following cell).
linear = ElasticNet(alpha=0.001, l1_ratio=0.5)

# Display the container type of the targets.
type(y)

numpy.ndarray

In [3]:
# Fit the baseline and score it on the very same data it was fitted on,
# so the displayed number is an in-sample score, not a validation score.
linear.fit(X, y).score(X, y)

0.55932451980577813

In [4]:
# Shuffle-split cross-validation over every row of X. The sample count is
# taken from the data instead of a hard-coded 100, and the generator is seeded
# so that re-running the notebook yields the same splits (same settings as the
# sparse-grid search further down).
cv = ShuffleSplit(X.shape[0], n_iter=10, random_state=42)

# ElasticNet search space: 50 evenly spaced alpha values in [0, 0.1] crossed
# with four l1_ratio settings.
# NOTE(review): the grid includes alpha == 0 (no regularisation) — confirm
# that this end point is intended.
params = {'alpha': np.linspace(0, 0.1, num=50), 'l1_ratio': [0, 0.1, 0.5, 1.0]}

In [5]:
# Result record for one parameter combination: the parameters themselves, the
# mean score over all CV splits, the per-split scores and the per-split grid
# sizes.
Grid_score = namedtuple(
    'Grid_score',
    ['parameters', 'mean_validation_score', 'cv_validation_scores', 'cv_grid_sizes']
)

def evaluate_one(estimator, params, X, y, train, test):
    """Fit `estimator` on one CV split and score it on the held-out part.

    Parameters
    ----------
    estimator : object exposing set_params(**kw), fit(X, y) and score(X, y).
        It is modified in place (parameters set, then fitted).
    params : dict
        Keyword arguments forwarded to ``estimator.set_params``.
    X, y : indexable data and targets; indexed with `train` and `test`.
    train, test : index objects selecting the fitting and held-out samples.

    Returns
    -------
    tuple
        ``(score, grid_size)`` where ``score`` is whatever
        ``estimator.score`` returns on the held-out samples and ``grid_size``
        is currently the constant 2 (a placeholder, not measured).
    """
    # Slice both partitions up front, before the estimator is touched.
    fit_X, fit_y = X[train], y[train]
    held_out_X, held_out_y = X[test], y[test]

    estimator.set_params(**params)
    estimator.fit(fit_X, fit_y)
    score = estimator.score(held_out_X, held_out_y)

    # Placeholder value; the real grid size is not queried from the estimator.
    grid_size = 2
    return (score, grid_size)

def evaluate(estimator, params, cv, X, y):
    """Score one parameter combination over every split produced by `cv`.

    Each split is handled by ``evaluate_one`` on a fresh clone of
    `estimator`, dispatched through joblib's ``Parallel``.

    Parameters
    ----------
    estimator : estimator to clone for every split.
    params : dict of parameters applied to each clone.
    cv : iterable yielding ``(train, test)`` index pairs.
    X, y : data and targets handed to ``evaluate_one``.

    Returns
    -------
    Grid_score
        ``params``, the mean of the per-split scores, the list of per-split
        scores and the list of per-split grid sizes.
    """
    runner = Parallel(n_jobs=-2, verbose=0, pre_dispatch=3)
    jobs = (
        delayed(evaluate_one)(clone(estimator), params, X, y, train, test)
        for (train, test) in cv
    )
    cv_results = runner(jobs)

    # Split the (score, grid_size) pairs into two parallel lists.
    errors = [err for (err, _) in cv_results]
    grid_sizes = [size for (_, size) in cv_results]
    return Grid_score(params, np.mean(errors), errors, grid_sizes)


class GridSearch:
    """Exhaustive search over a parameter grid, scored by cross-validation.

    Attributes set by ``__init__``:
      base_estimator_ -- clone of the estimator passed in
      param_grid_     -- the parameter grid (a ParameterGrid)
      cv_             -- iterable of (train, test) index pairs
      verbose_        -- progress is printed when > 0
      grid_scores_    -- list of Grid_score records, filled by ``fit``
    """

    def __init__(self, estimator, param_grid, cv, verbose=1):
        """Store a clone of `estimator` together with the search settings.

        `param_grid` may be either a ready-made ParameterGrid or anything
        ParameterGrid accepts; in the latter case it is wrapped.
        """
        self.base_estimator_ = clone(estimator)
        self.verbose_ = verbose
        if not isinstance(param_grid, ParameterGrid):
            param_grid = ParameterGrid(param_grid)
        # Assigned unconditionally: previously this sat inside the `if`, so a
        # caller passing a ParameterGrid instance never got the attribute and
        # fit() failed with AttributeError.
        self.param_grid_ = param_grid
        self.cv_ = cv
        # Per-instance result list (empty until fit() is called).
        self.grid_scores_ = []

    def fit(self, X, y):
        """Evaluate every parameter combination once and record the results.

        Results are stored in ``self.grid_scores_`` in grid order. When
        ``verbose_ > 0`` a progress line is printed after each combination.

        Returns
        -------
        self
        """
        self.grid_scores_ = []
        no_params = len(self.param_grid_)
        for i, params in enumerate(self.param_grid_):
            # evaluate() clones the estimator for every split, so the stored
            # base estimator is never fitted or modified here.
            self.grid_scores_.append(
                evaluate(self.base_estimator_, params, self.cv_, X, y))
            if self.verbose_ > 0:
                print("Trained estimator no. {}, {} remaining.".format(i, no_params - i - 1))
        return self


In [13]:
# Sparse-grid learner configuration. The integer `type` codes are handed to
# sgpi.model unchanged — TODO confirm what each code selects.
level = 5
T = 0

grid_config = model.GridConfig(type=6, level=level, T=T)
adaptivity_config = model.AdaptivityConfig(
    num_refinements=0,
    no_points=4,
    treshold=0.0,  # keyword spelled as in the original call
    percent=0.0
)
solver_config = model.SolverConfig(
    type=0,
    max_iterations=100,
    epsilon=10e-3  # note: 10e-3 == 0.01
)
final_solver_config = model.SolverConfig(
    type=0,
    max_iterations=20,
    epsilon=10e-6  # note: 10e-6 == 1e-5
)
regularization_config = model.RegularizationConfig(
    lambda_reg=0.01,
    exponent_base=0.25,
    type=2
)
estimator = SGRegressionLearner(
    grid_config,
    regularization_config,
    solver_config,
    final_solver_config,
    adaptivity_config
)

# Search space: four lambda values, 1, 0.1, 0.01 and 0.001.
lambda_grid = np.logspace(0, -3, num=4)
params = {'regularization_config__lambda_reg': lambda_grid}

# Seeded shuffle-split over all rows of X so the splits are reproducible.
cv = ShuffleSplit(X.shape[0], n_iter=10, random_state=42)
gs = GridSearch(estimator, params, cv)

# Reference point: fit the un-tuned learner on all of X and show its score on
# that same data.
estimator.fit(X, y)
estimator.score(X, y)

-0.07203347818265914

In [14]:
# Run the grid search built in the previous cell on the training data; one
# progress line is printed per parameter combination (verbose defaults to 1).
gs.fit(X,y)

Trained estimator no. 0, 3 remaining.
Trained estimator no. 1, 2 remaining.
Trained estimator no. 2, 1 remaining.
Trained estimator no. 3, 0 remaining.


<__main__.GridSearch instance at 0x7f607a0b3680>

In [15]:
# List all grid points from highest to lowest mean validation score.
# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the call form already used in GridSearch.fit.
for grid_score in sorted(gs.grid_scores_,
                         key=lambda score: score.mean_validation_score,
                         reverse=True):
    print(grid_score)

Grid_score(parameters={'regularization_config__lambda_reg': 0.001}, mean_validation_score=-0.080919451149886121, cv_validation_scores=[-0.08130172914788128, -0.07248082250631492, -0.09185582689990607, -0.07720397269719728, -0.07073293768526753, -0.08498007261186415, -0.08430822808787497, -0.07262285573253834, -0.08013562332268766, -0.09357244280732907], cv_grid_sizes=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
Grid_score(parameters={'regularization_config__lambda_reg': 0.01}, mean_validation_score=-0.081160478003468109, cv_validation_scores=[-0.08169254905382808, -0.07215794256900201, -0.0922077908645528, -0.07713206294829147, -0.07062284226645257, -0.08537204473737199, -0.08464718292638386, -0.07311671970124027, -0.08102168963078736, -0.09363395533677071], cv_grid_sizes=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
Grid_score(parameters={'regularization_config__lambda_reg': 0.10000000000000001}, mean_validation_score=-0.085635659315753482, cv_validation_scores=[-0.08778341488275802, -0.07434730733696465, -0.09