# Refactor
Refactoring the AdaBoost notebook to use the new library.

In [15]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

from pml.experiment.experiment import Grouping, Experiment
from pml.experiment.model import Model
from pml.data.data import DataFile, BrainData
from pml.utility.default import get_baseline_models
from sklearn.model_selection import LeaveOneOut

# Ask interactively for the identifier and the free-text description of
# this run; both are forwarded to Experiment below.
print('KEY: ')
KEY = input()
print('DESCRIPTION: ')
DESCRIPTION = input()

# Value handed to Experiment as its `cv` argument.
CV = 5

# Alternative input files (currently disabled):
# DATA_FILE = BrainData(file_url='data/all_data.xlsx', name='All Data')
# VALIDATION_FILE = BrainData(file_url='data/Validation_3.0.xlsx', name='Validation Data')

DATA_FILE = BrainData(
    name='All Data with engineered features',
    file_url='data/Training_with_Patterns.xlsx',
)
VALIDATION_FILE = BrainData(
    name='Validation Data with engineered features',
    file_url='data/Validation_with_Patterns.xlsx',
)

RESULTS_FILE = DataFile(
    name='Test Results',
    file_url='data/results_test.xlsx',
)

# One Grouping per comparison, built from (name, description, mapping) rows.
# NOTE(review): each mapping presumably sends a raw group label to a binary
# class, with labels missing from the mapping left out of that comparison --
# confirm against the Grouping implementation.
GROUPINGS = [
    Grouping(name=name, description=description, grouping=mapping)
    for name, description, mapping in (
        ('cvp', 'Control vs. All Parkinsons', {0: 0, 1: 1, 2: 1, 3: 1}),
        ('iva', 'Idiopathic vs. Atypical Parkinsons', {1: 0, 2: 1, 3: 1}),
        ('mvp', 'MSA vs. PSP', {2: 0, 3: 1}),
        ('ipvm', 'Idiopathic and PSP vs. MSA', {1: 0, 2: 1, 3: 0}),
        ('imvp', 'Idiopathic and MSA vs. PSP', {1: 0, 2: 0, 3: 1}),
    )
]

# Disabled column subset. NOTE(review): enabling it as written would shadow
# the builtin `list`.
# list = ['UPDRS', 'pSN_FA', 'pSN_FW', 'Putamen_FA', 'GroupID']

MODELS = get_baseline_models()

# Extend the search grid of every baseline model before narrowing the list.
for model in MODELS:
    # NOTE(review): the 'classifier__' prefix presumably refers to a step that
    # Experiment adds to the pipeline -- confirm.
    model.parameter_grid['classifier__class_weight'] = ['balanced']
    # NOTE(review): range(1, 30) covers k = 1..29 (the upper bound is
    # exclusive); use range(1, 31) if k = 30 is meant to be searched.
    model.parameter_grid['kbest__k'] = range(1, 30)

# Only the model named 'log' is kept for this run.
MODELS = list(filter(lambda candidate: candidate.name == 'log', MODELS))

# Preprocessing steps: standardization followed by univariate feature
# selection scored with f_classif.
PIPELINE = Pipeline(steps=[
    ('standardization', StandardScaler()),
    # ('pca', PCA()),
    # ('oversampler', RandomOverSampler()),
    ('kbest', SelectKBest(score_func=f_classif)),
])

experiment = Experiment(
    key=KEY,
    description=DESCRIPTION,
    data_file=DATA_FILE,
    validation_file=VALIDATION_FILE,
    results_file=RESULTS_FILE,
    groupings=GROUPINGS,
    models=MODELS,
    pipeline=PIPELINE,
    cv=CV,
)

# Disabled: restrict both files to the column subset defined above.
# experiment.data_file = experiment.data_file.df[list]
# experiment.validation_file = experiment.validation_file.df[list]

# Bind the return value to `_` so the notebook does not echo it.
_ = experiment.run()

====
====
====                   Starting log model with Control vs. All Parkinsons grouping
====
====                   (LOG_K1_30)- Log. k best 1 throug 30
====
====
[2018-08-06 08:15:40.463240] Starting model: log with grouping: cvp
Fitting 5 folds for each of 1740 candidates, totalling 8700 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done 1178 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 5338 tasks      | elapsed:   31.1s
[Parallel(n_jobs=-1)]: Done 8700 out of 8700 | elapsed:   35.7s finished


[2018-08-06 08:16:16.517499] Done with model: log with grouping: cvp (Took 0.600 minutes)
Saving results of model run (log) to data/results_test.xlsx
====
====
====                   Starting log model with Idiopathic vs. Atypical Parkinsons grouping
====
====                   (LOG_K1_30)- Log. k best 1 throug 30
====
====
[2018-08-06 08:16:16.723556] Starting model: log with grouping: iva
Fitting 5 folds for each of 1740 candidates, totalling 8700 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 1163 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 5003 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 8700 out of 8700 | elapsed:   16.5s finished
  f = msb / msw
  f = msb / msw


[2018-08-06 08:16:33.664934] Done with model: log with grouping: iva (Took 0.267 minutes)
Saving results of model run (log) to data/results_test.xlsx
====

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 1163 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 5003 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 8700 out of 8700 | elapsed:   15.2s finished



====
====                   Starting log model with MSA vs. PSP grouping
====
====                   (LOG_K1_30)- Log. k best 1 throug 30
====
====
[2018-08-06 08:16:33.852948] Starting model: log with grouping: mvp
Fitting 5 folds for each of 1740 candidates, totalling 8700 fits
[2018-08-06 08:16:49.315029] Done with model: log with grouping: mvp (Took 0.250 minutes)
Saving results of model run (log) to data/results_test.xlsx

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 3756 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 7788 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 8700 out of 8700 | elapsed:   18.0s finished
  f = msb / msw
  f = msb / msw



====
====
====                   Starting log model with Idiopathic and PSP vs. MSA grouping
====
====                   (LOG_K1_30)- Log. k best 1 throug 30
====
====
[2018-08-06 08:16:49.506018] Starting model: log with grouping: ipvm
Fitting 5 folds for each of 1740 candidates, totalling 8700 fits
[2018-08-06 08:17:07.927899] Done with model: log with grouping: ipvm (Took 0.300 minutes)
Saving results of model run (log) to data/results_test.xlsx

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 1163 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 5003 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 8700 out of 8700 | elapsed:   16.4s finished
  f = msb / msw
  f = msb / msw



====
====
====                   Starting log model with Idiopathic and MSA vs. PSP grouping
====
====                   (LOG_K1_30)- Log. k best 1 throug 30
====
====
[2018-08-06 08:17:08.118562] Starting model: log with grouping: imvp
Fitting 5 folds for each of 1740 candidates, totalling 8700 fits
[2018-08-06 08:17:24.757227] Done with model: log with grouping: imvp (Took 0.267 minutes)
Saving results of model run (log) to data/results_test.xlsx
