# Parameter tuning for Decision Tree Classifier

In [1]:
import pandas as pd
import numpy as np
import openml
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold, cross_val_score
from skopt import BayesSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector

import warnings
warnings.filterwarnings("ignore")

In [3]:
dataset_ids = [1489, 1464, 1462, 37]

def get_data(id):
    df = openml.datasets.get_dataset(id).get_data(dataset_format="dataframe")[0]
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    if id != 37:
        y = y.apply(lambda y: 1 if y == '1' else 0)
    else:
        y = y.apply(lambda y: 1 if y == 'tested_positive' else 0)
    return X, y

In [18]:
numpipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

catpipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

column_transformer = ColumnTransformer(
    transformers=[
        ('num', numpipe, make_column_selector(dtype_include=['int64', 'float64'])),
        ('cat', catpipe, make_column_selector(dtype_include='object'))
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessing', column_transformer),
    ('modelinho', DecisionTreeClassifier(random_state=2137))
])

# Parameters and their ranges
### Search space for parameters has been chosen considering ranges used by authors of [this paper](https://jmlr.org/papers/volume20/18-444/18-444.pdf) that introduced ***tunability***. Other resource was documentation for [sklearn implementation](https://scikit-learn.org/dev/modules/generated/sklearn.tree.DecisionTreeClassifier.html) of Decision Tree Classifier.

* max_depth: [1, 30]    *(The maximum depth of the tree)*
* min_samples_split [2, 60] *(The minimum number of samples required to split an internal node)*
* min_samples_leaf [1, 60]  *(The minimum number of samples required to be at a leaf node.)*
* max_features: [None, log2(n), sqrt(n)]    *(The number of features to consider when looking for the best split, n=amount of instances in the dataset)*
* criterion: ['gini', 'entropy']    *(The function to measure the quality of a split)*

In [5]:
decision_tree_parameters = {
    'modelinho__max_depth': np.arange(1, 30), 
    'modelinho__min_samples_split': np.arange(2, 60),  
    'modelinho__min_samples_leaf': np.arange(1, 60),

    'modelinho__criterion': ['gini', 'entropy'],
    'modelinho__max_features': [None, 'log2', 'sqrt'],
}

In [6]:
randomized_search = RandomizedSearchCV(
    pipeline,
    param_distributions=decision_tree_parameters,
    n_iter=50,
    cv=5,
    random_state=2137,
    scoring='roc_auc',
    verbose=True
)

bayesian_search = BayesSearchCV(
    pipeline,
    search_spaces=decision_tree_parameters,
    n_iter=50, 
    cv=5,
    random_state=2137,
    scoring='roc_auc',
    verbose=True
)

In [7]:
for id in dataset_ids:
    X, y = get_data(id)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2137)

    model_r = randomized_search.fit(X_train, y_train)
    pd.DataFrame(model_r.cv_results_).to_excel(f"./DecisionTreeResults/df_{id}_random.xlsx", index=0)

    model_b = bayesian_search.fit(X_train, y_train)
    pd.DataFrame(model_b.cv_results_).to_excel(f"./DecisionTreeResults/df_{id}_bayes.xlsx", index=0)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5

### Default models - as default models we will treat models with default parameters from sklearn implementation of Decision Tree Classifier.

A general measure of the tunability of an algorithm per dataset can then be computed
based on the difference between the risk of an overall reference configuration (e.g., either the
software defaults or definition from section 3.2 of aforementioned paper) and the risk of the best possible configuration on that
dataset.<br>

For each algorithm, this gives rise to an empirical distribution of performance differences
over datasets, which might be directly visualized or summarized to an aggregated tunability
measure d by using mean, median or quantiles.

In [21]:
baseline_scores = {}

for id in dataset_ids:
    X, y = get_data(id)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2137)

    roc_auc_scores = cross_val_score(
        pipeline, 
        X,
        y,
        cv=KFold(
            n_splits=5,
            shuffle=True,
            random_state=2137
        ),
        scoring='roc_auc',
        verbose=True
    )
    
    baseline_scores[id] = roc_auc_scores.mean()
    print(f'Mean ROC AUC score for dataset {id}: ', roc_auc_scores.mean())

Mean ROC AUC score for dataset 1489:  0.8364736891387141
Mean ROC AUC score for dataset 1464:  0.5998592583050133
Mean ROC AUC score for dataset 1462:  0.9875568835047673
Mean ROC AUC score for dataset 37:  0.6580535603748968


In [24]:
pd.DataFrame(baseline_scores.items(), columns=['Dataset ID', 'Mean ROC AUC']).to_excel('./DecisionTreeResults/baseline_scores.xlsx', index=0)