# Data Modeling 02

In this notebook, we'll use Dask to tune classifiers with a randomized hyperparameter search (`RandomizedSearchCV`; `HyperbandSearchCV` is imported but not used yet), so that we can train many models in parallel on the PRP.

In [1]:
import dask
import numpy as np
import matplotlib.pyplot as plt 

import dask.dataframe as dd
from dask_ml.model_selection import train_test_split, HyperbandSearchCV, RandomizedSearchCV, GridSearchCV
from dask_ml.linear_model import LogisticRegression

Now, let's read in our cleaned (and for this local example, reduced) data and train a model on it 

In [42]:
# Load the reduced feature table and the label table as Dask dataframes.
# NOTE(review): the features come from the `components_3` reduction while the
# label file is named `components_50` — confirm both files describe the same
# rows in the same order.
X = dd.read_csv(
    '../data/processed/primary_reduction_neighbors_15_components_3.csv'
)
# Only the label file is read with header=None (its first line is treated as
# data); the feature file uses the default header handling.
y = dd.read_csv(
    '../data/processed/primary_labels_neighbors_15_components_50.csv',
    header=None,
)

In [43]:
# y = y + 1

In [44]:
# Base estimator whose hyperparameters are being searched.
est = LogisticRegression()

# Randomized search with n_iter=15 over both penalty types and 50 evenly
# spaced values of C in [0.1, 100], scored by balanced accuracy.
grid = RandomizedSearchCV(
    estimator=est,
    param_distributions=dict(
        penalty=['l1', 'l2'],
        C=np.linspace(0.1, 100, 50),
    ),
    n_iter=15,
    scoring='balanced_accuracy',
)

In [45]:
# Split features and labels into train/test partitions.
# A fixed seed makes the shuffled split (and every score computed on it)
# reproducible across kernel restarts; change the value to draw a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, random_state=42
)

In [46]:
# best_est = grid.fit(X_train.values, y_train.values)

In [47]:
# best_est.cv_results_

Great, now let's see what score the best estimator achieved!

In [48]:
# best_est.best_score_

Now let's define a generalized class to do this hyperparameter tuning 

In [49]:
class GeneClassifier:
    """Pair an estimator with a hyperparameter search space and tune it.

    Attributes are set directly from the constructor arguments:
    ``est`` is the estimator handed to the search, and ``params`` is the
    search space passed through as ``param_distributions``.
    """

    def __init__(self, est, params):
        """Remember the estimator and the search space to sample from."""
        self.est, self.params = est, params

    def generate_model(self, X, y, n_iter=10):
        """Run a randomized search on ``(X, y)`` and report the winner.

        Args:
            X: Training features, forwarded unchanged to ``fit``.
            y: Training labels, forwarded unchanged to ``fit``.
            n_iter: Value passed to the search as ``n_iter`` (default 10).

        Returns:
            A ``(best_score_, best_params_)`` tuple read from the object
            returned by ``fit``; scoring is ``'balanced_accuracy'``.
        """
        search = RandomizedSearchCV(
            estimator=self.est,
            param_distributions=self.params,
            n_iter=n_iter,
            scoring='balanced_accuracy',
        )
        fitted = search.fit(X, y)

        best_score = fitted.best_score_
        best_params = fitted.best_params_
        return best_score, best_params

In [50]:
# Search space for the logistic-regression tuner: both penalty types and
# 50 evenly spaced values of C in [0.1, 100].
# The closing brace must not be followed by a comma: a trailing comma there
# would wrap the dict in a 1-tuple instead of binding the dict itself.
param_distributions = {
    'penalty': ['l1', 'l2'],
    'C': np.linspace(0.1, 100, 50),
}

logistic_est = GeneClassifier(LogisticRegression(), param_distributions)

In [51]:
# Quick run with n_iter=2; the cell displays the returned
# (best score, best parameters) tuple.
logistic_est.generate_model(
    X_train.values,
    y_train.values,
    n_iter=2,
)

(0.07142857142857142, {'penalty': 'l2', 'C': 93.88367346938776})

Now let's try this with a simple XGBClassifier (gradient boosted tree classifier)

In [None]:
from dask_ml.xgboost import XGBClassifier

# Candidate hyperparameter values for the boosted-tree search; each entry is a
# 20-point grid.
# NOTE(review): the grids include eta = 0 and max_depth values from 0 up to
# 1000 — confirm these extremes are intended.
params = dict(
    eta=np.linspace(0, 1, 20),
    gamma=np.linspace(0, 1000, 20),
    max_depth=np.linspace(0, 1000, 20, dtype=int),
)

xgb_est = GeneClassifier(XGBClassifier(), params)

Using the `XGBClassifier` from `dask_ml` requires a distributed `Client`, so we'll just use the standard `XGBClassifier` from the `xgboost` package instead.

In [None]:
# xgb_est.generate_model(X_train.values, y_train.values, n_iter=2)

In [None]:
# Rebind XGBClassifier to the plain xgboost implementation; this replaces the
# class imported under the same name from dask_ml.xgboost.
from xgboost import XGBClassifier

# Rebuild the tuner around the new class, reusing the existing `params` grid.
xgb_est = GeneClassifier(XGBClassifier(), params)

# Quick run with n_iter=2; the cell displays the returned
# (best score, best parameters) tuple.
xgb_est.generate_model(
    X_train.values,
    y_train.values,
    n_iter=2,
)