# Tune Tutorial

Fugue Tune [repository](https://github.com/fugue-project/tune)

[PyData Talk](https://www.youtube.com/watch?v=MRa0ao4tfWc)

<img src="https://github.com/fugue-project/fugue/raw/master/images/logo.svg" style="height:200px">

## Space

In [1]:
from tune import Space, Grid, Rand, RandInt, Choice

In [2]:
# A space built only from fixed values describes exactly one configuration.
space = Space(a=1, b=2)

# Iterating the space yields its configurations as plain dicts.
list(space)

[{'a': 1, 'b': 2}]

## Grid

In [3]:
# Each Grid lists the candidate values of one parameter; iterating the space
# enumerates every combination (2 values of b x 2 values of c = 4 dicts).
space = Space(a=1, b=Grid(2,3), c=Grid("a","b"))

list(space)

[{'a': 1, 'b': 2, 'c': 'a'},
 {'a': 1, 'b': 2, 'c': 'b'},
 {'a': 1, 'b': 3, 'c': 'a'},
 {'a': 1, 'b': 3, 'c': 'b'}]

## Random

In [4]:
# Rand and Choice are stochastic expressions: plain iteration does not draw
# values for them, so they show up unresolved in the single resulting dict.
space = Space(a=1, b=Rand(0,1), c=Choice("a","b"))

list(space)

[{'a': 1, 'b': Rand(low=0, high=1, q=None, log=False, include_high=True), 'c': Choice('a', 'b')}]

In [5]:
space = Space(a=1, b=Rand(0,1), c=Choice("a","b"))

# sample(3, seed=10) resolves the stochastic expressions into 3 concrete
# configurations; the explicit seed presumably keeps the draws repeatable.
list(space.sample(3, seed=10))

[{'a': 1, 'b': 0.771320643266746, 'c': 'b'},
 {'a': 1, 'b': 0.0207519493594015, 'c': 'a'},
 {'a': 1, 'b': 0.6336482349262754, 'c': 'b'}]

## Random Search without Sampling

In [6]:
# The space is iterated without calling sample(), so Rand and Choice stay as
# unresolved expressions in the output. The Summary section labels this
# unsampled form "Bayesian Optimization".
space = Space(a=1, b=Rand(0,1), c=Choice("a","b"))

list(space)

[{'a': 1, 'b': Rand(low=0, high=1, q=None, log=False, include_high=True), 'c': Choice('a', 'b')}]

## Grid + Random

In [7]:
# Mixing both kinds: the Grid is enumerated (one dict per value of b) while
# the Rand expression for c is carried through unresolved.
space = Space(a=1, b=Grid(1,2), c=Rand(0,1))

list(space)

[{'a': 1, 'b': 1, 'c': Rand(low=0, high=1, q=None, log=False, include_high=True)},
 {'a': 1, 'b': 2, 'c': Rand(low=0, high=1, q=None, log=False, include_high=True)}]

In [8]:
# sample(3) draws 3 values for c and pairs each of them with every grid value
# of b, giving 3 x 2 = 6 configurations.
# NOTE(review): no seed is passed here, so the drawn values presumably change
# on every run — confirm, and pass seed=... if repeatable output is wanted.
space = Space(a=1, b=Grid(1,2), c=Rand(0,1)).sample(3)

list(space)

[{'a': 1, 'b': 1, 'c': 0.5833217369377363},
 {'a': 1, 'b': 2, 'c': 0.5833217369377363},
 {'a': 1, 'b': 1, 'c': 0.02517172841774562},
 {'a': 1, 'b': 2, 'c': 0.02517172841774562},
 {'a': 1, 'b': 1, 'c': 0.709208009843012},
 {'a': 1, 'b': 2, 'c': 0.709208009843012}]

## Summary

In [9]:
# One space per model, each using a different search style. These three names
# are reused by the union and cross-product cells that follow.
# NOTE(review): space2 is sampled without a seed, so its x values presumably
# differ between runs — confirm.
space1 = Space(model="model1", a=1, b=Grid(2,3))  # Grid search
space2 = Space(model="model2", x=Rand(3,4)).sample(2) # Random search
space3 = Space(model="model3", y=Rand(3,4))  # Bayesian Optimization

In [10]:
# "+" unions spaces: the result yields the configurations of each operand in
# turn (2 from space1, 2 from space2, 1 from space3).
union_space = space1 + space2 + space3

list(union_space)

[{'model': 'model1', 'a': 1, 'b': 2},
 {'model': 'model1', 'a': 1, 'b': 3},
 {'model': 'model2', 'x': 3.265566126772097},
 {'model': 'model2', 'x': 3.2636028460289523},
 {'model': 'model3', 'y': Rand(low=3, high=4, q=None, log=False, include_high=True)}]

In [11]:
# The built-in sum() over a list of spaces produces the same union as
# chaining "+" by hand.
union_space = sum([space1, space2, space3])

list(union_space)

[{'model': 'model1', 'a': 1, 'b': 2},
 {'model': 'model1', 'a': 1, 'b': 3},
 {'model': 'model2', 'x': 3.265566126772097},
 {'model': 'model2', 'x': 3.2636028460289523},
 {'model': 'model3', 'y': Rand(low=3, high=4, q=None, log=False, include_high=True)}]

## Cross Product

In [12]:
# "*" takes the cross product: every configuration of the union on the left
# is combined with every configuration of the space on the right
# (5 x 2 = 10 dicts). Unsampled Rand expressions are carried through as-is.
space = sum([space1, space2, space3]) * Space(random_state=0, fit_intercept=Grid(True, False))

list(space)

[{'model': 'model1', 'a': 1, 'b': 2, 'random_state': 0, 'fit_intercept': True},
 {'model': 'model1', 'a': 1, 'b': 2, 'random_state': 0, 'fit_intercept': False},
 {'model': 'model1', 'a': 1, 'b': 3, 'random_state': 0, 'fit_intercept': True},
 {'model': 'model1', 'a': 1, 'b': 3, 'random_state': 0, 'fit_intercept': False},
 {'model': 'model2', 'x': 3.265566126772097, 'random_state': 0, 'fit_intercept': True},
 {'model': 'model2', 'x': 3.265566126772097, 'random_state': 0, 'fit_intercept': False},
 {'model': 'model2', 'x': 3.2636028460289523, 'random_state': 0, 'fit_intercept': True},
 {'model': 'model2', 'x': 3.2636028460289523, 'random_state': 0, 'fit_intercept': False},
 {'model': 'model3', 'y': Rand(low=3, high=4, q=None, log=False, include_high=True), 'random_state': 0, 'fit_intercept': True},
 {'model': 'model3', 'y': Rand(low=3, high=4, q=None, log=False, include_high=True), 'random_state': 0, 'fit_intercept': False}]

## Using tune

In [13]:
import pandas as pd
import warnings
# Silence all Python warnings from here on.
# NOTE(review): this is a blanket filter, so it also hides warnings that are
# unrelated to the demo.
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

# Load the data set from a parquet file in the current working directory.
train_set = pd.read_parquet("titanic.parquet")

# Hold out 20% of the rows as the test set; random_state=0 fixes the shuffle
# so the split is the same on every run.
train, test = train_test_split(train_set, test_size=0.20, random_state=0)
train.head()

Unnamed: 0,Fare,Age,Pclass,gender,C,Q,S,Family,Title,T_A4,...,T_SOC,T_SOPP,T_SOTONO2,T_SOTONOQ,T_STONO,T_STONO2,T_WC,T_WEP,T_X,label
140,-0.341452,-0.104637,0.827377,-1.355574,2.074505,-0.307562,-1.623803,1.231645,-0.947423,-0.088986,...,-0.082339,-0.058124,-0.047431,-0.130856,-0.116841,-0.082339,-0.10654,-0.058124,0.589879,0
439,-0.437007,0.125912,-0.369365,0.737695,-0.482043,-0.307562,0.615838,-0.811922,0.666638,-0.088986,...,-0.082339,-0.058124,-0.047431,-0.130856,-0.116841,-0.082339,-0.10654,-0.058124,-1.695262,0
817,0.096646,0.125912,-0.369365,0.737695,2.074505,-0.307562,-1.623803,1.231645,0.666638,-0.088986,...,-0.082339,-0.058124,-0.047431,-0.130856,-0.116841,-0.082339,-0.10654,-0.058124,-1.695262,0
378,-0.567631,-0.719436,0.827377,0.737695,2.074505,-0.307562,-1.623803,-0.811922,0.666638,-0.088986,...,-0.082339,-0.058124,-0.047431,-0.130856,-0.116841,-0.082339,-0.10654,-0.058124,0.589879,0
491,-0.502445,-0.642586,0.827377,0.737695,-0.482043,-0.307562,0.615838,-0.811922,0.666638,-0.088986,...,-0.082339,-0.058124,-0.047431,7.641989,-0.116841,-0.082339,-0.10654,-0.058124,-1.695262,0


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from tune import TUNE_OBJECT_FACTORY
from tune import Space, Grid, Rand, RandInt, Choice
from tune_sklearn import sk_space, suggest_sk_models, suggest_sk_models_by_cv

# NOTE(review): presumably the directory tune uses for its temporary files —
# confirm. "/tmp" is a POSIX path, so this likely needs adjusting on Windows.
TUNE_OBJECT_FACTORY.set_temp_path("/tmp")

In [15]:
# Search space for LogisticRegression with no tunable parameters specified.
space = sk_space(LogisticRegression)
print(list(space))

# Run the search over the space with the train/test frames, scored by accuracy.
result = suggest_sk_models(space, train, test, scoring="accuracy")

# Demo only: print the metric, keys and parameters of each returned report.
for report in result:
    reported_trial = report.trial
    print(report.metric, reported_trial.keys, reported_trial.params)

NativeExecutionEngine doesn't respect num_partitions ROWCOUNT


[{'__space__model': 'sklearn.linear_model._logistic.LogisticRegression'}]
0.7988826815642458 [] {'__space__model': 'sklearn.linear_model._logistic.LogisticRegression'}


In [16]:
# Candidate models: LogisticRegression with no parameters specified, and a
# RandomForestClassifier with n_estimators fixed to 10.
candidate_spaces = [
    sk_space(LogisticRegression),
    sk_space(RandomForestClassifier, n_estimators=10),
]
space = sum(candidate_spaces)

result = suggest_sk_models(space, train, test, scoring="accuracy")

# Print the metric, keys and parameters of each returned report.
for report in result:
    reported_trial = report.trial
    print(report.metric, reported_trial.keys, reported_trial.params)

NativeExecutionEngine doesn't respect num_partitions ROWCOUNT


0.8435754189944135 [] {'__space__model': 'sklearn.ensemble._forest.RandomForestClassifier', 'n_estimators': 10}


In [17]:
logistic_space = sk_space(LogisticRegression)

# Gradient boosting: grid over n_estimators and max_depth, with learning_rate
# resolved by sample(5, 1) before the search starts.
boosting_space = sk_space(
    GradientBoostingClassifier,
    n_estimators=Grid(5, 10),
    max_depth=Grid(5, 10),
    learning_rate=Rand(0.01, 0.99),
).sample(5, 1)

space = sum([logistic_space, boosting_space])

result = suggest_sk_models(space, train, test, scoring="accuracy")

# Print the metric, keys and parameters of each returned report.
for report in result:
    reported_trial = report.trial
    print(report.metric, reported_trial.keys, reported_trial.params)

NativeExecutionEngine doesn't respect num_partitions ROWCOUNT


0.8435754189944135 [] {'__space__model': 'sklearn.ensemble._gb.GradientBoostingClassifier', 'n_estimators': 10, 'max_depth': 5, 'learning_rate': 0.4186815646085225}


In [18]:
from tune import NonIterativeObjectiveLocalOptimizer
from tune_hyperopt import HyperoptLocalOptimizer
from tune_optuna import OptunaLocalOptimizer
import optuna

# Turn off Optuna's default log handler so its messages do not end up in the
# notebook output.
optuna.logging.disable_default_handler()

def to_optimizer(obj):
    """Convert an optimizer specification into a local optimizer instance.

    Args:
        obj: One of

            * a ``NonIterativeObjectiveLocalOptimizer`` instance, which is
              returned unchanged;
            * ``None`` or ``"hyperopt"``, which gives a
              ``HyperoptLocalOptimizer`` with ``max_iter=20`` and ``seed=0``;
            * ``"optuna"``, which gives an ``OptunaLocalOptimizer`` with
              ``max_iter=20``.

    Returns:
        The optimizer instance selected by ``obj``.

    Raises:
        NotImplementedError: If ``obj`` is none of the supported values. The
            message includes the rejected value to make typos easy to spot.
    """
    # Ready-made optimizers pass straight through.
    if isinstance(obj, NonIterativeObjectiveLocalOptimizer):
        return obj
    # None means "use the default", which this notebook defines as Hyperopt.
    if obj is None or obj == "hyperopt":
        return HyperoptLocalOptimizer(max_iter=20, seed=0)
    if obj == "optuna":
        return OptunaLocalOptimizer(max_iter=20)
    raise NotImplementedError(
        f"unsupported optimizer specification {obj!r}; "
        "expected None, 'hyperopt', 'optuna' or an optimizer instance"
    )

# Register to_optimizer as the converter for non-iterative local optimizers.
# Since to_optimizer maps None to HyperoptLocalOptimizer, the intent is that
# Hyperopt becomes the default level 2 optimizer and does not have to be
# specified again for each search.
TUNE_OBJECT_FACTORY.set_noniterative_local_optimizer_converter(to_optimizer)

In [19]:
# Union of LogisticRegression and GradientBoostingClassifier spaces. Unlike
# the earlier search that called .sample(5,1), learning_rate=Rand(...) is left
# unsampled here: there is no .sample(...) call on the gradient-boosting space.
# NOTE(review): the output recorded for this cell is
# "ValueError: template contains tuning expressions", i.e. the search does not
# complete as written. Presumably the unsampled Rand expression is not being
# picked up by a level 2 optimizer — confirm how the optimizer has to be
# supplied to suggest_sk_models and re-run.
space = sum([
    sk_space(LogisticRegression),
    sk_space(GradientBoostingClassifier, n_estimators=Grid(5,10), max_depth=Grid(5,10), learning_rate=Rand(0.01,0.99))
])

result = suggest_sk_models(
    space,
    train, test,
    scoring="accuracy",
)

# Print the metric, keys and parameters of each returned report.
for r in result:
    print(r.metric, r.trial.keys, r.trial.params)

NativeExecutionEngine doesn't respect num_partitions ROWCOUNT
_9 _State.RUNNING -> _State.FAILED  template contains tuning expressions


ValueError: template contains tuning expressions

## Cross Product in Model Search

In [None]:
logistic_space = sk_space(LogisticRegression)

# Gradient boosting: grid over n_estimators, learning_rate left as an
# unsampled Rand expression.
boosting_space = sk_space(
    GradientBoostingClassifier,
    n_estimators=Grid(5, 10),
    learning_rate=Rand(0.01, 0.99),
)

# max_depth is resolved up front by sample(5, 0); the method call applies to
# this Space only, before the cross product is taken.
depth_space = Space(max_depth=RandInt(1, 100)).sample(5, 0)

space = sum([logistic_space, boosting_space * depth_space])

result = suggest_sk_models(space, train, test, scoring="accuracy")

# Print the metric, keys and parameters of each returned report.
for report in result:
    reported_trial = report.trial
    print(report.metric, reported_trial.keys, reported_trial.params)

## Partition

In [None]:
logistic_space = sk_space(LogisticRegression)

# Gradient boosting: grid over n_estimators and max_depth, learning_rate left
# as an unsampled Rand expression.
boosting_space = sk_space(
    GradientBoostingClassifier,
    n_estimators=Grid(5, 10),
    max_depth=Grid(5, 10),
    learning_rate=Rand(0.01, 0.99),
)

space = sum([logistic_space, boosting_space])

# NOTE(review): partition_keys=["gender"] presumably runs the search
# separately for each value of the gender column — confirm.
result = suggest_sk_models(
    space, train, test, scoring="accuracy", partition_keys=["gender"]
)

# Print the metric, keys and parameters of each returned report.
for report in result:
    reported_trial = report.trial
    print(report.metric, reported_trial.keys, reported_trial.params)

## Bringing to Dask

In [None]:
# Imported for its side effect only.
# NOTE(review): presumably this makes the "dask" execution engine available — confirm.
import fugue_dask

logistic_space = sk_space(LogisticRegression)

# Gradient boosting: grid over n_estimators and max_depth, learning_rate left
# as an unsampled Rand expression.
boosting_space = sk_space(
    GradientBoostingClassifier,
    n_estimators=Grid(5, 10),
    max_depth=Grid(5, 10),
    learning_rate=Rand(0.01, 0.99),
)

space = sum([logistic_space, boosting_space])

# Same partitioned search, with the execution engine switched to Dask.
result = suggest_sk_models(
    space,
    train,
    test,
    scoring="accuracy",
    partition_keys=["gender"],
    execution_engine="dask",
)

# Print the metric, keys and parameters of each returned report.
for report in result:
    reported_trial = report.trial
    print(report.metric, reported_trial.keys, reported_trial.params)