In [1]:
%matplotlib inline
import api
import helpers
import evaluation
import implementations
import split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Models comparison

Let's compare 6 differents models to have a basic score allowing to compare them.

In [8]:
# load train set

# scaled between -1 (background) and 1 (Higgs)
y, x, _ = helpers.load_csv_data('data/train.csv', sub_sample = True, background_value = -1)

# scaled between 0 (background) and 1 (Higgs)
# used in logistic regression
y_rescaled, x_rescaled, _ = helpers.load_csv_data('data/train.csv', sub_sample = True, background_value = 0)
x_rescaled,_ , _ = helpers.standardize(x_rescaled)

In [7]:
x.min(0)

array([ -9.99000000e+02,   1.10000000e-02,   9.46300000e+00,
         0.00000000e+00,  -9.99000000e+02,  -9.99000000e+02,
        -9.99000000e+02,   2.56000000e-01,   0.00000000e+00,
         4.62270000e+01,   1.36000000e-01,  -1.41400000e+00,
        -9.99000000e+02,   2.00010000e+01,  -2.46800000e+00,
        -3.14100000e+00,   2.60050000e+01,  -2.48200000e+00,
        -3.14100000e+00,   5.37000000e-01,  -3.13900000e+00,
         2.44160000e+01,   0.00000000e+00,  -9.99000000e+02,
        -9.99000000e+02,  -9.99000000e+02,  -9.99000000e+02,
        -9.99000000e+02,  -9.99000000e+02,   0.00000000e+00])

In [3]:

def choose_best(cv, best):
    if cv[2] < best[2]:
        return cv
    else:
        return best
    
    
def cross_validate_parameters(y, x, function_name, method_name,cut=0.5, gammas=None, lambdas=None, *args, **kwargs):
    best = 0,0, float('inf'), 0
    
    if (lambdas is None) and (gammas is not None):
        for i in gammas:
            cv = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=cut, \
                           model_func=method_name, gamma = i, *args)
            best = choose_best(cv, best)
            
    elif (gammas is None) and (lambdas is not None):
        for i in lambdas:
            cv = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=cut, \
                           model_func=method_name, lambda_ = i, *args)
            best = choose_best(cv, best)
            
    elif (gammas is not None) and (lambdas is not None):
        for i in gammas:
            for j in lambdas:
                cv = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=cut, \
                               model_func=method_name, gamma = i, lambda_ = j, *args)
                best = choose_best(cv, best)
                
    else:
        best = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=cut, \
                               model_func=method_name, *args)
        
    print("-------------------------")
    print(function_name)
    print("te_loss = %.4f" % best[2])
    print("accuracy = %.4f" % best[3])

    return best

# range parameter
gammas = np.linspace(start=0.00001, stop=1, num=10)
lambdas = np.logspace(start=-8, stop=0, num=10)
max_iters = 10000


# least squares
cross_validate_parameters(y, x, 'Least Square', implementations.least_squares)

# least squares GD
cross_validate_parameters(y, x,'Least squares GD',  implementations.least_squares_GD, \
                                              gammas = gammas, max_iters = max_iters)

# least squares Stochastic GD
cross_validate_parameters(y, x,'Least squares SGD', implementations.least_squares_SGD, \
                                               gammas = gammas, max_iters = max_iters)

# RR
cross_validate_parameters(y, x, 'Ridge Regression', implementations.ridge_regression, lambdas = lambdas)


# LR
cross_validate_parameters(y_rescaled, x_rescaled,'Logistic Regression', implementations.logistic_regression, \
                                                cut = 0.5, gammas = gammas, max_iters = max_iters)

# RLR
cross_validate_parameters(y_rescaled, x_rescaled, 'Regularized Logistic Regression', \
                        implementations.reg_logistic_regression, cut = 0.5, gammas = gammas, \
                        lambdas = lambdas, max_iters = max_iters)



-------------------------
Least Square
te_loss = 0.8122
accuracy = 0.6978
-------------------------
Least squares GD
te_loss = 0.8292
accuracy = 0.6894
-------------------------
Least squares SGD
te_loss = 4.3905
accuracy = 0.3286
-------------------------
Ridge Regression
te_loss = 0.8181
accuracy = 0.6966


AttributeError: 'tuple' object has no attribute 'min'

In [None]:
# test all the models

# least square
cv_least_squares = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=0., \
              model_func=implementations.least_squares)

print("-------------------------")
print("Least squares: ")
print("te_loss = %.4f" % cv_least_squares[2])
print("accuracy = %.4f" % cv_least_squares[3])


cv_least_squares_GD = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=0., \
              model_func=implementations.least_squares_GD, max_iters = 10000, gamma = 0.01)

print("-------------------------")
print("Least squares GD: ")
print("te_loss = %.4f" % cv_least_squares_GD[2])
print("accuracy = %.4f" % cv_least_squares_GD[3])

# least squares Stochastic GD
cv_least_squares_SGD = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=0., \
              model_func=implementations.least_squares_SGD, max_iters = 10000, gamma = 0.1)

print("-------------------------")
print("Least squares SGD: ")
print("te_loss = %.4f" % cv_least_squares_SGD[2])
print("accuracy = %.4f" % cv_least_squares_SGD[3])

# RR
cv_ridge_regression = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=0., \
              model_func=implementations.ridge_regression, lambda_ = 0.00000001)

print("-------------------------")
print("Ridge Regression: ")
print("te_loss = %.4f" % cv_ridge_regression[2])
print("accuracy = %.4f" % cv_ridge_regression[3])

# LR
cv_logistic_regression = api.train(y_rescaled, x_rescaled, poly=0, split_method=None, replace=None, cv=True, cut=0.5, \
              model_func=implementations.logistic_regression, max_iters = 10000, gamma = 0.00001)

print("-------------------------")
print("Logistic Regression: ")
print("te_loss = %.4f" % cv_logistic_regression[2])
print("accuracy = %.4f" % cv_logistic_regression[3])

# Regularized LR
cv_reg_logistic_regression = api.train(y_rescaled, x_rescaled, poly=0, split_method=None, replace=None, cv=True, cut=0.5, \
              model_func=implementations.reg_logistic_regression, lambda_ = 0.1, max_iters = 10000, gamma = 0.00001)

print("-------------------------")
print("Logistic Regression: ")
print("te_loss = %.4f" % cv_reg_logistic_regression[2])
print("accuracy = %.4f" % cv_reg_logistic_regression[3])

In [None]:
np.logspace(start=-8, stop=10, num=10)

In [None]:
# test all the models
gammas = np.linspace(start=0.000001, stop=1, num=10)
lambdas = np.logspace(start=-8, stop=0, num=10)
max_iters = 10000


# least squares
cross_validate_parameters(y, x, 'Least Square', implementations.least_squares)

# least squares GD
cross_validate_parameters(y, x,'Least squares GD',  implementations.least_squares_GD, \
                                                gammas = gammas, max_iters = max_iters)

# least squares Stochastic GD
cross_validate_parameters(y, x,'Least squares SGD', implementations.least_squares_SGD, \
                                                 gammas = gammas, max_iters = max_iters)

# RR
cross_validate_parameters(y, x, 'Ridge Regression', implementations.ridge_regression, lambdas = lambdas)


# LR
cross_validate_parameters(y_rescaled, x_rescaled,'Logistic Regression', implementations.logistic_regression, \
                                                  cut = 0.5, gammas = gammas, max_iters = max_iters)

# RLR
cross_validate_parameters(y_rescaled, x_rescaled, 'Regularized Logistic Regression', \
                          implementations.reg_logistic_regression, cut = 0.5, gammas = gammas, \
                          lambdas = lambdas, max_iters = max_iters)

In [None]:
import implementations
# least squares GD

def choose_best(cv, best):
    if cv[2] < best[2]:
        return cv
    else:
        return best
    
    
def cross_validate_parameters(y, x, function_name, method_name,cut=0.5, gammas=None, lambdas=None, *args, **kwargs):
    best = 0,0, float('inf'), 0
    
    if (lambdas is None) and (gammas is not None):
        for i in gammas:
            cv = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=cut, \
                           model_func=method_name, gamma = i, *args)
            best = choose_best(cv, best)
            
    elif (gammas is None) and (lambdas is not None):
        for i in lambdas:
            cv = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=cut, \
                           model_func=method_name, lambda_ = i, *args)
            best = choose_best(cv, best)
            
    elif (gammas is not None) and (lambdas is not None):
        for i in gammas:
            for j in lambdas:
                cv = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=cut, \
                               model_func=method_name, gamma = i, lambda_ = j, *args)
                best = choose_best(cv, best)
                
    else:
        best = api.train(y, x, poly=0, split_method=None, replace=None, cv=True, cut=cut, \
                               model_func=method_name, *args)
        
    print("-------------------------")
    print(function_name)
    print("te_loss = %.4f" % best[2])
    print("accuracy = %.4f" % best[3])

    return best


In [None]:
cross_validate_parameters(y, x, implementations.ridge_regression, lambdas = np.arange(0.0001, 1, 10))