In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Auto-sklearn
Auto-sklearn is an automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator.

For more information about the framework, please visit the documentation [here](https://automl.github.io/auto-sklearn/master/).

## What you will need to run the code

To run this code, we first install Auto-sklearn using pip. For other ways to install Auto-sklearn, for example using conda, please see the [installation guide](https://automl.github.io/auto-sklearn/master/installation.html).

In [2]:
!pip install scipy==1.8.1
!pip install Cython==0.29.35
# restart kernel!

Collecting scipy==1.8.1
  Downloading scipy-1.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy<1.25.0,>=1.17.3 (from scipy==1.8.1)
  Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, scipy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.25.2
    Uninstalling numpy-1.25.2:
      Successfully uninstalled numpy-1.25.2
  Attempting uninstall: scipy
    Found existing installation: scipy 1.11.4
    Uninstalling scipy-1.11.4:
      Successfully uninstalled scipy-1.11.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the 

Collecting Cython==0.29.35
  Downloading Cython-0.29.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Cython
  Attempting uninstall: Cython
    Found existing installation: Cython 3.0.10
    Uninstalling Cython-3.0.10:
      Successfully uninstalled Cython-3.0.10
Successfully installed Cython-0.29.35


In [1]:
!pip install scikit-learn==0.24.2 --no-build-isolation

Collecting scikit-learn==0.24.2
  Using cached scikit-learn-0.24.2.tar.gz (7.5 MB)
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-learn
  Building wheel for scikit-learn (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-learn: filename=scikit_learn-0.24.2-cp310-cp310-linux_x86_64.whl size=22196049 sha256=c9b03de661f00a12e88e471170117a62ca79e6d599c37f80eac032f0416890f2
  Stored in directory: /root/.cache/pip/wheels/13/a4/68/4e78865652fa14db4a162b491e5138565f97646f9e1f2ab8cc
Successfully built scikit-learn
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.4.0 

In [3]:
#this needs to run twice!
!pip install auto-sklearn
import autosklearn

print("Using Auto-sklearn version {}".format(autosklearn.__version__))

Using Auto-sklearn version 0.15.0


## First Step: Load data

Auto-sklearn can work with multiple input data formats (python lists, numpy arrays, sparse arrays and pandas data-frames).  

For this example we are going to use the [credit-g dataset](https://www.openml.org/d/31), which is a binary classification problem. This means that we have to find an estimator that is able to distinguish between two categories, *'bad'* and *'good'*.


In [4]:
import sklearn.datasets
import sklearn.model_selection

# Download the credit-g dataset (OpenML id 31) as pandas objects:
# X holds the features, y the target labels
X, y = sklearn.datasets.fetch_openml(
    data_id=31,
    as_frame=True,
    return_X_y=True,
)

# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.4,
)

# Summarise the columns and dtypes of the training features
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 600 entries, 24 to 102
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   checking_status         600 non-null    category
 1   duration                600 non-null    float64 
 2   credit_history          600 non-null    category
 3   purpose                 600 non-null    category
 4   credit_amount           600 non-null    float64 
 5   savings_status          600 non-null    category
 6   employment              600 non-null    category
 7   installment_commitment  600 non-null    float64 
 8   personal_status         600 non-null    category
 9   other_parties           600 non-null    category
 10  residence_since         600 non-null    float64 
 11  property_magnitude      600 non-null    category
 12  age                     600 non-null    float64 
 13  other_payment_plans     600 non-null    category
 14  housing                 600 no

## Second Step: Manually build a pipeline

For this tutorial, we are going to implement some traditional machine learning models ([GradientBoosting](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html), [Support Vector Machines](https://scikit-learn.org/stable/modules/svm.html), [Decision Tree Classifier](https://scikit-learn.org/stable/modules/tree.html)) using [scikit-learn](https://scikit-learn.org/stable/index.html). Then we are going to show how we can achieve even better performance than these traditional models by using Auto-sklearn.


### C-Support Vector Classification



In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# Create the estimator using the default parameters from the library
# (spelled out explicitly), plus a fixed random_state for reproducibility
estimator_svc = SVC(
    C=1.0, kernel='rbf', gamma='scale', shrinking=True, tol=1e-3,
    cache_size=200, verbose=False, max_iter=-1, random_state=42
)

# Collect the categorical columns from the training frame itself, so the
# preprocessing depends only on the data the pipeline is fitted on
categorical_columns = X_train.select_dtypes(include='category').columns.tolist()

# One-hot encode the categorical columns; all other columns are passed through.
# handle_unknown='ignore' keeps categories unseen during fit from raising at predict time
encoder = ColumnTransformer(transformers = [
  ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
], remainder='passthrough')

# Build and fit the pipeline: encode -> scale -> classify
pipeline_svc = Pipeline([
  ('encoder', encoder),
  ('scaler', StandardScaler()),
  ('svc', estimator_svc),
])
pipeline_svc.fit(X_train, y_train)

# Score the model on the held-out test split
prediction = pipeline_svc.predict(X_test)
performance_svc = accuracy_score(y_test, prediction)
print(f"SVC performance is {performance_svc}")

SVC performance is 0.7675


### GradientBoostingClassifier

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

# Create the estimator using default parameters from the library
# (spelled out explicitly), plus a fixed random_state for reproducibility
estimator_gradboost = GradientBoostingClassifier(
    learning_rate=0.1, n_estimators=100, subsample=1.0,
    criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
    min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
    random_state=42)

# Translate the categorical columns to numerical values via one-hot encoding.
# The columns are looked up on the training frame itself, so the preprocessing
# depends only on the data the pipeline is fitted on.
categorical_columns = X_train.select_dtypes(include='category').columns.tolist()
encoder = ColumnTransformer(transformers = [
  ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
], remainder='passthrough')


# Build and fit the pipeline: encode -> classify
pipeline_gradboost = Pipeline([
  ('encoder', encoder),
  ('gradboost', estimator_gradboost),
])
pipeline_gradboost.fit(X_train, y_train)

# Score the model on the held-out test split
prediction = pipeline_gradboost.predict(X_test)
performance_gradboost = accuracy_score(y_test, prediction)
print(f"GradientBooster performance is {performance_gradboost}")

GradientBooster performance is 0.735


### Decision tree classifier

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Create the estimator using the default parameters from the library,
# plus a fixed random_state for reproducibility
estimator_tree = DecisionTreeClassifier(random_state=42)

# Determine the categorical columns in this cell (as the other model cells do)
# instead of relying on a variable left over from a previous cell
categorical_columns = X_train.select_dtypes(include='category').columns.tolist()

# Build and fit the pipeline: one-hot encode the categorical columns -> classify
encoder = ColumnTransformer(transformers = [
  ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
], remainder='passthrough')
pipeline_tree = Pipeline([
  ('encoder', encoder),
  ('DecisionTree', estimator_tree),
])
pipeline_tree.fit(X_train, y_train)

# Predict on the held-out test data
prediction = pipeline_tree.predict(X_test)

# Evaluate the performance of the model
performance_tree = accuracy_score(y_test, prediction)
print(f"Decision Tree performance is {performance_tree}")

Decision Tree performance is 0.7075


## Third Step: Use Auto-sklearn as a drop-in replacement


In [None]:
import autosklearn.classification

# Configure Auto-sklearn: a 300-second search budget, a fixed seed,
# cross-validation as the resampling strategy, and a single worker
estimator_askl = autosklearn.classification.AutoSklearnClassifier(
    n_jobs=1,
    resampling_strategy='cv',
    seed=42,
    time_left_for_this_task=300,
)

# Run the automated search on the training split.
# NOTE(review): with resampling_strategy='cv' the auto-sklearn documentation
# suggests calling refit() on the full training data before predicting -
# confirm whether that is wanted here.
estimator_askl.fit(X_train, y_train)

# Evaluate on the held-out test split with the same metric as the manual models
prediction = estimator_askl.predict(X_test)
performance_askl = accuracy_score(y_test, prediction)
print(f"Auto-Sklearn Classifier performance is {performance_askl}")



In [None]:
import pandas as pd

# The interactive table widget only exists on Google Colab; elsewhere fall back
# to the regular DataFrame display instead of failing with an ImportError
try:
    from google.colab import data_table
except ImportError:
    data_table = None

# By using Auto-Sklearn one can achieve a better performance!
# Collect the accuracy of every model into a single comparison table
comparison = pd.DataFrame(
    [
     {'Model': 'Auto-Sklearn Classifier', 'Accuracy': performance_askl},
     {'Model': 'GradientBoosting', 'Accuracy': performance_gradboost},
     {'Model': 'Decision Tree Classifier', 'Accuracy': performance_tree},
     {'Model': 'Support Vector Classifier', 'Accuracy': performance_svc},
     ]
)

# The last expression of the cell is what gets rendered
data_table.DataTable(comparison) if data_table is not None else comparison

## Exercise:
* Set up MLflow (see [Week 3](https://colab.research.google.com/github/keuperj/MLSystems24/blob/main/week_3/MLFlow_Tutorial.ipynb)) and track the different hyper-parameter runs in the example above
* Generate plots to find the best model and hyperparameters