In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the labelled points table; the path is relative to the working directory.
idle_time_data = pd.read_csv('../data/df_points/df_points_18_21_class.csv')

# Columns used as the classification target and as model inputs.
TargetVariable = ['idle_time_class']
Predictors = [
    'bike_id', 'lat', 'lng',
    'temp', 'rain', 'snow',
    'dt_start', 'hex_enc', 'start_min', 'month', 'day',
]

# TargetVariable is a one-element list, so y is a 2-D single-column array
# (not a flat vector); consumers flatten it with .ravel() where needed.
X = idle_time_data[Predictors].values
y = idle_time_data[TargetVariable].values

# Feature standardisation is currently switched off; un-comment to re-enable.
#PredictorScaler = StandardScaler()
#PredictorScalerFit = PredictorScaler.fit(X)
#X = PredictorScalerFit.transform(X)

# Ordered 90/10 split: with shuffle=False the rows keep their file order, so
# the test set is the final 10% of the table.
# For a shuffled split instead, drop `shuffle=False` and pass `random_state=42`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.9,
    shuffle=False,
)

In [2]:
# Show how many rows of the test split fall into each target class value.
pd.DataFrame(y_test).value_counts()

2    68622
1    66954
3    63246
4    55894
dtype: int64

In [4]:
# Weights & Biases sweep definition: random search over DecisionTreeClassifier
# hyperparameters, maximising the "accuracy" value logged by each run.
sweep_configuration = {
    "name": "DT-sweep-OvR",
    "metric": {"name": "accuracy", "goal": "maximize"},
    "method": "random",
    "parameters": {
        "splitter": {
            "values": ['best', 'random']
        },
        "criterion": {
            "values": ['entropy', 'gini']
        },
        # None means "no depth limit".
        "max_depth": {
            "values": [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, None]
        },
        # 'auto' is intentionally not listed: for DecisionTreeClassifier it was
        # only an alias of 'sqrt' (so it double-weighted that choice in a random
        # search), it was deprecated in scikit-learn 1.1 and is rejected from
        # 1.3 on, which made every run that sampled it fail.
        "max_features": {
            "values": ['sqrt', 'log2']
        },
        "min_samples_leaf": {
            "values": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30]
        },
        "min_samples_split": {
            "values": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 18, 20, 25, 30]
        }
    }
}

In [7]:
from matplotlib import pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import zero_one_loss, accuracy_score
from sklearn.model_selection import cross_validate, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import wandb
import wandb

def my_train_func():
    """Train and evaluate one One-vs-Rest decision tree for a single sweep run.

    Starts a W&B run, reads the run's hyperparameters from ``wandb.config``
    (``splitter``, ``criterion``, ``max_depth``, ``max_features``,
    ``min_samples_leaf``, ``min_samples_split``), fits a
    ``OneVsRestClassifier`` around a ``DecisionTreeClassifier`` on the
    module-level ``X_train`` / ``y_train`` arrays and evaluates it on
    ``X_test`` / ``y_test``.

    Logged to W&B in a single ``wandb.log`` call, so all values share one step:
    ``accuracy``, ``loss`` (zero-one loss), ``score_training``,
    ``score_validation`` and ``conf_matrix``.

    Takes no arguments and returns ``None``; it is meant to be handed to
    ``wandb.agent`` as the ``function`` callback.
    """
    wandb.init()
    config = wandb.config

    base_tree = DecisionTreeClassifier(
        criterion=config.criterion,
        max_depth=config.max_depth,
        splitter=config.splitter,
        max_features=config.max_features,
        min_samples_leaf=config.min_samples_leaf,
        min_samples_split=config.min_samples_split,
    )
    clsf = OneVsRestClassifier(base_tree)

    # Flatten the label arrays once instead of at every use.
    train_labels = y_train.ravel()
    test_labels = y_test.ravel()

    clsf.fit(X_train, train_labels)
    y_pred = clsf.predict(X_test).ravel()

    score_training = clsf.score(X_train, train_labels)

    acc = accuracy_score(test_labels, y_pred)
    loss = zero_one_loss(test_labels, y_pred)
    # A classifier's score() is the mean accuracy of predict(X) against y, so
    # the validation score equals `acc`; reuse it rather than predicting on
    # X_test a second time.
    score_validation = acc

    # Feature importances / tree plots are not logged here: OneVsRestClassifier
    # fits clones of `base_tree`, so `base_tree` itself is never fitted.

    wandb.log({
        "accuracy": acc,
        "loss": loss,
        "score_training": score_training,
        "score_validation": score_validation,
        "conf_matrix": wandb.plot.confusion_matrix(y_true=test_labels, preds=y_pred),
    })


In [None]:
# Register the sweep with W&B under the "DecisionTreeClassifier" project and
# keep the returned sweep id.
sweep_id_rfc = wandb.sweep(sweep=sweep_configuration, project="DecisionTreeClassifier")

# Start an agent in this kernel that calls my_train_func once per sampled
# configuration. No `count` is passed, so the number of runs is not capped here.
wandb.agent(sweep_id=sweep_id_rfc, function=my_train_func)