# Dataset4 Modeling

## 1. Load Libraries

In [14]:
import numpy as np
import pandas as pd

## 2. Load Dataset

In [15]:
# Read the dataset4 training split, then separate the label column "100"
# from the feature columns.
data = pd.read_csv('./datasets/train4.csv')
y = data["100"].copy()                      # labels
data = data.drop(columns=["100"]).copy()    # features only

In [16]:
# Sanity check: (rows, feature columns) of the training features after the label column "100" was dropped.
data.shape

(200, 100)

In [17]:
# Sanity check: one training label per row of `data`.
y.shape

(200,)

In [18]:
# Read the dataset4 test split, then separate the label column "100"
# from the feature columns (same layout as the training file).
testdata = pd.read_csv('./datasets/test4.csv')
testy = testdata["100"].copy()                      # labels
testdata = testdata.drop(columns=["100"]).copy()    # features only

In [19]:
# Sanity check: (rows, feature columns) of the test features after the label column "100" was dropped.
testdata.shape

(200, 100)

In [20]:
# Sanity check: one test label per row of `testdata`.
testy.shape

(200,)

## 3. Modeling

In [24]:
import autosklearn.classification
import autosklearn.metrics

In [25]:
# Configure the auto-sklearn search: 7200 s overall budget, at most 720 s per
# candidate run. Holdout resampling with train_size=0.9 means 90% of the
# training data is used for fitting and 10% for validation.
# NOTE(review): tmp_folder is a fixed path that is kept after the run
# (delete_tmp_folder_after_terminate=False); a re-run may require removing it
# first -- confirm against the installed auto-sklearn version.
clf = autosklearn.classification.AutoSklearnClassifier(
    # --- time / resource budget ---
    time_left_for_this_task=7200,
    per_run_time_limit=720,
    memory_limit=6144,
    n_jobs=-1,
    # --- validation scheme and objective ---
    resampling_strategy='holdout',
    resampling_strategy_arguments={'train_size': 0.9},
    metric=autosklearn.metrics.accuracy,
    # --- ensemble construction ---
    ensemble_size=100,
    ensemble_nbest=100,
    max_models_on_disc=100,
    # --- reproducibility and artifacts ---
    seed=1,
    tmp_folder='./data4_holdout_9_random_1',
    delete_tmp_folder_after_terminate=False,
)

In [26]:
# Train the classifier: run the auto-sklearn search on the training features and labels.
# The call is the cell's last expression, so its return value (the estimator's repr) is shown as output.
clf.fit(X = data, y = y, dataset_name="dataset4_holdout_9_random_1")

  self.metafeatures = self.metafeatures.append(metafeatures)
  self.algorithm_runs[metric].append(runs)














































































AutoSklearnClassifier(delete_tmp_folder_after_terminate=False,
                      ensemble_nbest=100, ensemble_size=100,
                      max_models_on_disc=100, memory_limit=6144,
                      metric=accuracy, n_jobs=-1, per_run_time_limit=720,
                      resampling_strategy_arguments={'train_size': 0.9},
                      time_left_for_this_task=7200,
                      tmp_folder='./data4_holdout_9_random_1')

## 4. Results

In [32]:
# Show the model leaderboard, sorted by ensemble weight.
# The row limit is lifted only while this table is rendered, so the pandas
# display option does not leak into every later cell of the notebook.
with pd.option_context('display.max_rows', None):
    # A bare last expression inside a `with` block is not auto-displayed,
    # so render explicitly with the notebook's built-in `display`.
    display(clf.leaderboard(detailed=True, sort_by='ensemble_weight'))

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration,config_id,train_loss,seed,start_time,end_time,budget,status,data_preprocessors,feature_preprocessors,balancing_strategy,config_origin
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
215,63,0.04,lda,0.0,0.565253,214,0.0,0,1653392000.0,1653392000.0,0.0,StatusType.SUCCESS,[],[nystroem_sampler],weighting,Local Search
361,28,0.04,lda,0.0,0.549816,360,0.0,0,1653393000.0,1653393000.0,0.0,StatusType.SUCCESS,[],[nystroem_sampler],weighting,Local Search
164,42,0.03,lda,0.0,0.662376,163,0.0,0,1653392000.0,1653392000.0,0.0,StatusType.SUCCESS,[],[nystroem_sampler],weighting,Local Search
141,37,0.03,lda,0.0,54.763352,140,0.0,0,1653392000.0,1653392000.0,0.0,StatusType.SUCCESS,[],[polynomial],none,Random Search
293,5,0.03,lda,0.0,0.65738,292,0.0,0,1653392000.0,1653392000.0,0.0,StatusType.SUCCESS,[],[nystroem_sampler],weighting,Local Search
213,65,0.03,lda,0.0,0.582082,212,0.0,0,1653392000.0,1653392000.0,0.0,StatusType.SUCCESS,[],[nystroem_sampler],weighting,Local Search
319,15,0.03,lda,0.0,0.535382,318,0.0,0,1653393000.0,1653393000.0,0.0,StatusType.SUCCESS,[],[nystroem_sampler],weighting,Local Search
311,13,0.03,lda,0.0,0.544596,310,0.0,0,1653392000.0,1653393000.0,0.0,StatusType.SUCCESS,[],[nystroem_sampler],weighting,Local Search
248,55,0.02,lda,0.0,0.568721,247,0.0,0,1653392000.0,1653392000.0,0.0,StatusType.SUCCESS,[],[nystroem_sampler],none,Local Search
364,29,0.02,gaussian_nb,0.0,0.529774,363,0.005556,0,1653393000.0,1653393000.0,0.0,StatusType.SUCCESS,[],[fast_ica],none,Random Search


In [28]:
# Inspect the ensemble members in detail: the result is a dict keyed by model id, holding each
# member's rank, cost, ensemble weight, and pipeline components (preprocessors, balancing, classifier).
clf.show_models()

{67: {'model_id': 67,
  'rank': 1,
  'cost': 0.0,
  'ensemble_weight': 0.01,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7f5081f55580>,
  'balancing': Balancing(random_state=1, strategy='weighting'),
  'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7f51ed6ee5e0>,
  'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice at 0x7f51ed69a700>,
  'sklearn_classifier': BernoulliNB(alpha=9.96028779263453, fit_prior=False)},
 110: {'model_id': 110,
  'rank': 2,
  'cost': 0.0,
  'ensemble_weight': 0.02,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7f5038874850>,
  'balancing': Balancing(random_state=1, strategy='weighting'),
  'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7f5046da1670>,
  'classifier': <autosklearn.pipeline.componen

## 5. Test Data Inference

In [29]:
from sklearn.metrics import accuracy_score

# Predict labels for the test features with the fitted ensemble and report
# the fraction that matches the true test labels.
predictions = clf.predict(testdata)
print("Accuracy score:", accuracy_score(y_true=testy, y_pred=predictions))

Accuracy score: 0.995


## 6. Save & Load Model, Reproduce Result

In [33]:
import pickle

In [35]:
# Persist the fitted classifier to disk so the result can be reloaded later
# without repeating the search.
with open('./dataset4_9_1_995_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [36]:
# Reload the pickled classifier from disk.
# Unpickling can execute arbitrary code, so only load model files you trust.
with open('./dataset4_9_1_995_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [37]:
# Reproducibility check: score the reloaded model on the test split; the value should match
# the accuracy reported for the original `clf`.
loaded_model.score(testdata, testy)

0.995