# Dataset4 Modeling

## 1. Load Library

In [1]:
import numpy as np
import pandas as pd

## 2. Load Dataset

In [2]:
# load train dataset4
data = pd.read_csv('./datasets/train4.csv')
y = data.loc[:, "100"].copy()
data = data.drop("100", axis = 1).copy()

In [3]:
data.shape

(200, 100)

In [4]:
y.shape

(200,)

In [5]:
# load test dataset4
testdata = pd.read_csv('./datasets/test4.csv')
testy = testdata.loc[:, "100"].copy()
testdata = testdata.drop("100", axis = 1).copy()

In [6]:
testdata.shape

(200, 100)

In [7]:
testy.shape

(200,)

## 3. Modeling

In [8]:
import autosklearn.classification
import autosklearn.metrics

In [9]:
# Find optimized model in 36000 secs using autosklearn
# Train 90% and Validation 10%
clf = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task = 36000,
    per_run_time_limit = 3600,
    ensemble_size = 100,
    ensemble_nbest = 100,
    max_models_on_disc = 100,
    seed=1,
    memory_limit=10240,
    resampling_strategy='cv',
    resampling_strategy_arguments={'train_size':0.9, 'fold':3},
    tmp_folder = './data4_cv_9_random_1',
    delete_tmp_folder_after_terminate = False,
    n_jobs = -1,
    metric=autosklearn.metrics.accuracy)

In [11]:
# Train classifier
clf.fit(X = data, y = y, dataset_name="dataset4_cv9_random_1")

  self.metafeatures = self.metafeatures.append(metafeatures)
  self.algorithm_runs[metric].append(runs)










































































































































































































































AutoSklearnClassifier(delete_tmp_folder_after_terminate=False,
                      ensemble_nbest=100, ensemble_size=100,
                      max_models_on_disc=100, memory_limit=10240,
                      metric=accuracy, n_jobs=-1, per_run_time_limit=3600,
                      resampling_strategy='cv',
                      resampling_strategy_arguments={'fold': 3, 'folds': 5,
                                                     'train_size': 0.9},
                      time_left_for_this_task=36000,
                      tmp_folder='./data4_cv_9_random_1')

## 4. Result

In [12]:
# show model leader board, sort by ensemble weight
pd.set_option('display.max_rows', None)
clf.leaderboard(detailed=True, sort_by='ensemble_weight')

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration,config_id,train_loss,seed,start_time,end_time,budget,status,data_preprocessors,feature_preprocessors,balancing_strategy,config_origin
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
306,63,0.04,qda,0.0,2.140178,305,0.0,0,1653653000.0,1653653000.0,0.0,StatusType.SUCCESS,[],[pca],none,Local Search
443,28,0.04,qda,0.0,1.601399,442,0.0,0,1653654000.0,1653654000.0,0.0,StatusType.SUCCESS,[],[pca],none,Local Search
271,42,0.03,qda,0.0,2.083739,270,0.0,0,1653653000.0,1653653000.0,0.0,StatusType.SUCCESS,[],[pca],weighting,Local Search
257,37,0.03,qda,0.0,2.936898,256,0.0,0,1653653000.0,1653653000.0,0.0,StatusType.SUCCESS,[],[pca],weighting,Local Search
380,5,0.03,qda,0.0,2.7932,379,0.0,0,1653654000.0,1653654000.0,0.0,StatusType.SUCCESS,[],[pca],none,Local Search
302,65,0.03,qda,0.0,2.073994,301,0.0,0,1653653000.0,1653653000.0,0.0,StatusType.SUCCESS,[],[pca],none,Local Search
411,15,0.03,qda,0.0,1.52678,410,0.0,0,1653654000.0,1653654000.0,0.0,StatusType.SUCCESS,[],[pca],none,Local Search
407,13,0.03,qda,0.0,1.500182,406,0.0,0,1653654000.0,1653654000.0,0.0,StatusType.SUCCESS,[],[pca],none,Local Search
329,55,0.02,qda,0.0,1.646588,328,0.0,0,1653653000.0,1653653000.0,0.0,StatusType.SUCCESS,[],[pca],none,Local Search
444,29,0.02,qda,0.0,2.047481,443,0.0,0,1653654000.0,1653654000.0,0.0,StatusType.SUCCESS,[],[pca],weighting,Local Search


In [13]:
# print model component in detail
clf.show_models()

{247: {'model_id': 247,
  'rank': 1,
  'cost': 0.0,
  'ensemble_weight': 0.01,
  'voting_model': VotingClassifier(estimators=None, voting='soft'),
  'estimators': [{'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7ff41f417cd0>,
    'balancing': Balancing(random_state=1),
    'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7ff41f055b50>,
    'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice at 0x7ff41f8353a0>,
    'sklearn_classifier': QuadraticDiscriminantAnalysis(reg_param=0.10061076748752248)},
   {'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7ff41eb95640>,
    'balancing': Balancing(random_state=1),
    'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7ff41eaf02e0>,
    'classifier': <autosklearn.pipeline.components.classifica

## 5. Test Data Inference

In [14]:
from sklearn.metrics import accuracy_score

predictions = clf.predict(testdata)
print("Accuracy score:", accuracy_score(testy, predictions))

Accuracy score: 1.0


## 6. Save & Load Model, Reproduce Result

In [15]:
import pickle

In [16]:
with open('./dataset4_95_1__model.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [19]:
with open('./dataset4_95_1__model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [20]:
loaded_model.score(testdata, testy)

1.0

In [22]:
clf.leaderboard(detailed=True, sort_by='rank').to_csv("result.csv", index=None)

In [24]:
y_pred = clf.predict(testdata)

In [23]:
from sklearn.metrics import classification_report

In [27]:
print(classification_report(y_true = testy, y_pred = y_pred,  digits=5))

              precision    recall  f1-score   support

         0.0    1.00000   1.00000   1.00000       100
         1.0    1.00000   1.00000   1.00000       100

    accuracy                        1.00000       200
   macro avg    1.00000   1.00000   1.00000       200
weighted avg    1.00000   1.00000   1.00000       200

