# Auto-sklearn classifier

## Setup

In [1]:
import pickle
import pandas as pd

In [2]:
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import log_loss

In [3]:
import sys
sys.path.append('..')
from utils import *

In [4]:
x_train, y_train, eras_train = train_data(weras=True)
x_val, y_val = val_data()
bl = baseline(x_train, y_train, x_val, y_val)
bl

0.69245942891354983

## First try

### Training

In [5]:
model = AutoSklearnClassifier(
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 3},
    tmp_folder='autosklearn/classifier/tmp/',
    output_folder='autosklearn/classifier/out/',
    delete_tmp_folder_after_terminate=False,
    delete_output_folder_after_terminate=False,
    shared_mode=True,
)

In [6]:
# model.fit(x_train, y_train, metric=log_loss, dataset_name='numerai')

In [7]:
# with open('autosklearn/classifier/model.pkl', 'wb') as auto_sklearn_file:
#      pickle.dump(model, auto_sklearn_file)

### Testing

In [8]:
with open('autosklearn/classifier/model.pkl', 'rb') as auto_sklearn_file:
     model = pickle.load(auto_sklearn_file)

In [9]:
print(model.show_models())

[(0.860000, MyDummyClassifier(configuration=1, init_params=None, random_state=None)),
(0.140000, SimpleClassificationPipeline({'preprocessor:feature_agglomeration:linkage': 'complete', 'balancing:strategy': 'none', 'preprocessor:feature_agglomeration:affinity': 'manhattan', 'classifier:__choice__': 'gaussian_nb', 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:__choice__': 'feature_agglomeration', 'preprocessor:feature_agglomeration:n_clusters': 298, 'imputation:strategy': 'mean', 'rescaling:__choice__': 'standardize', 'preprocessor:feature_agglomeration:pooling_func': 'max'},
dataset_properties={
  'signed': False,
  'sparse': False,
  'task': 1,
  'multiclass': False,
  'multilabel': False,
  'target_type': 'classification'})),
]


Most of the weight is given to a model called DummyClassifier. Never a good sign :). I guess it makes sense due to the random nature of the dataset. It's mostly noise with a small modifier.

The actual chosen classifier is GaussianNB with an Agglomerative Clustering preprocessor.

Apparently, prediction is not supported when doing CV.

In [10]:
model.resampling_strategy = 'holdout'
model.refit(x_train, y_train)
validate(y_train, model.predict_proba(x_train))

0.69198138057469816

Surprisingly, 0.691 is better that what I'm getting from the TPoT preliminary results. The current best is on 0.692 and uses LogReg with some RDF preprocessing.

In [11]:
pd.DataFrame(model.cv_results_)

Unnamed: 0,mean_fit_time,mean_test_score,param_balancing:strategy,param_classifier:__choice__,param_classifier:adaboost:algorithm,param_classifier:adaboost:learning_rate,param_classifier:adaboost:max_depth,param_classifier:adaboost:n_estimators,param_classifier:bernoulli_nb:alpha,param_classifier:bernoulli_nb:fit_prior,...,param_preprocessor:random_trees_embedding:n_estimators,param_preprocessor:select_percentile_classification:percentile,param_preprocessor:select_percentile_classification:score_func,param_preprocessor:select_rates:alpha,param_preprocessor:select_rates:mode,param_preprocessor:select_rates:score_func,param_rescaling:__choice__,params,rank_test_scores,status
0,292.189848,0.0,none,random_forest,,,,,,,...,,,,,,,standardize,"{'classifier:random_forest:max_depth': 'None',...",1,Unknown
1,360.019453,-2147484000.0,none,random_forest,,,,,,,...,,,,,,,standardize,"{'classifier:random_forest:max_depth': 'None',...",11,Unknown
2,360.014553,-2147484000.0,none,random_forest,,,,,,,...,,,,,,,standardize,"{'classifier:random_forest:max_depth': 'None',...",11,Unknown
3,1.106124,0.0,none,sgd,,,,,,,...,,,,,,,none,"{'classifier:sgd:n_iter': 20, 'one_hot_encodin...",1,Unknown
4,360.014503,-2147484000.0,none,random_forest,,,,,,,...,,,,,,,normalize,"{'classifier:random_forest:max_depth': 'None',...",11,Unknown
5,360.019212,0.0,none,extra_trees,,,,,,,...,,,,,,,normalize,{'one_hot_encoding:use_minimum_fraction': 'Tru...,1,Unknown
6,184.322468,0.0,none,extra_trees,,,,,,,...,,,,,,,standardize,{'one_hot_encoding:use_minimum_fraction': 'Tru...,1,Unknown
7,239.02157,-2147484000.0,weighting,libsvm_svc,,,,,,,...,,,,,,,none,{'one_hot_encoding:use_minimum_fraction': 'Fal...,11,Unknown
8,111.88184,0.0,none,random_forest,,,,,,,...,,,,,,,none,"{'classifier:random_forest:max_depth': 'None',...",1,Unknown
9,360.017498,-2147484000.0,none,libsvm_svc,,,,,,,...,,,,,,,minmax,"{'classifier:__choice__': 'libsvm_svc', 'class...",11,Unknown


Most of the fit times seem to be pushing the upper limit. This clearly needs more time.

## Longer time

In [12]:
model = AutoSklearnClassifier(
    ml_memory_limit=10000,
    time_left_for_this_task=4*24*60*60,
    per_run_time_limit=1*60*60,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    tmp_folder='autosklearn/classifier_long/tmp/',
    output_folder='autosklearn/classifier_long/out/',
    delete_tmp_folder_after_terminate=False,
    delete_output_folder_after_terminate=False,
    shared_mode=True,
)

In [13]:
model.fit(x_train, y_train, metric=log_loss, dataset_name='numerai')

ValueError: Metric must be instance of autosklearn.metrics.Scorer.

In [None]:
with open('autosklearn/classifier_long/model.pkl', 'wb') as auto_sklearn_file:
     pickle.dump(model, auto_sklearn_file)