In [1]:
import numpy as np 
import pandas as pd 

from sklearn import linear_model 
from sklearn import ensemble 
from sklearn import metrics 
from sklearn import model_selection
import hyperopt
from hyperopt import hp, fmin, tpe, Trials
import optuna

In [2]:
# Load data
data = pd.read_csv("Data/train_sem09.csv")
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Prepare data
X = data.drop(["Activity"], axis=1)
y = data["Activity"]
X_train, X_test, y_train, y_test = model_selection.train_test_split(
                                                    X, y, 
                                                    stratify=y, 
                                                    test_size=0.2, 
                                                    random_state=42
)

In [4]:
# Create models without optimization
# LogisticRegression
model_log_r = linear_model.LogisticRegression(max_iter=1000)
model_log_r.fit(X_train, y_train)
print(
    f"LogisticRegression accuracy score on test data: \
{np.round(model_log_r.score(X_test, y_test), 2)}"
)
y_test_pred = model_log_r.predict(X_test)
print(
    f"LogisticRegression f1-score on test data: \
{np.round(metrics.f1_score(y_test, y_test_pred), 2)}"
)
print()

# RandomForestClassifier
model_rfc = ensemble.RandomForestClassifier(random_state=42)
model_rfc.fit(X_train, y_train)
y_train_pred = model_rfc.predict(X_train)
print(
    f"RandomForestClassifier f1-score on train data: \
{np.round(metrics.f1_score(y_train, y_train_pred), 2)}"
)
y_test_pred = model_rfc.predict(X_test)
print(
    f"RandomForestClassifier f1-score on test data: \
{np.round(metrics.f1_score(y_test, y_test_pred), 2)}"
)

LogisticRegression accuracy score on test data: 0.75
LogisticRegression f1-score on test data: 0.78

RandomForestClassifier f1-score on train data: 1.0
RandomForestClassifier f1-score on test data: 0.8


In [6]:
# GridSearchCV LogisticRegression

param_grid_lr = {
    "penalty": ["l2", "none"],
    "solver": ["lbfgs", "sag"], 
    "C": [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1]
}

grid_search_lr = model_selection.GridSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42, max_iter=1000),
    param_grid=param_grid_lr,
    cv=5,
    n_jobs=-1
)

grid_search_lr.fit(X_train, y_train)
print(
    f"LogisticRegression accuracy score after GridSearchCV on test data: \
{np.round(grid_search_lr.score(X_test, y_test), 2)}"
)
y_test_pred = grid_search_lr.predict(X_test)
print(f"LogisticRegression f1-score after GridSearchCV on test data: \
{np.round(metrics.f1_score(y_test, y_test_pred), 2)}"
)
print(f"LogisticRegression GridSearchCV best hypereparameters: {grid_search_lr.best_params_}")
print(
    f"LogisticRegression GridSearchCV accuracy score on cross-validation: \
{np.round(grid_search_lr.best_score_, 2)}"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression accuracy score after GridSearchCV on test data: 0.76
LogisticRegression f1-score after GridSearchCV on test data: 0.78
LogisticRegression GridSearchCV best hypereparameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
LogisticRegression GridSearchCV accuracy score on cross-validation: 0.76


In [7]:
# GridSearchCV RandomForestClassifier

param_grid_rfc = {
    "n_estimators": list(np.linspace(50, 400, 8, dtype=int)),
    "min_samples_leaf": [5, 7],
    "max_depth": list(np.linspace(20, 40, 4, dtype=int))
}

grid_search_rfc = model_selection.GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42),
    param_grid=param_grid_rfc,
    cv=5,
    n_jobs=-1
)

grid_search_rfc.fit(X_train, y_train)
y_train_pred = grid_search_rfc.predict(X_train)
print(
    f"RandomForestClassifier f1-score after GridSearchCV on train data: \
{np.round(metrics.f1_score(y_train, y_train_pred), 2)}"
)
print(
    f"RandomForestClassifier accuracy score after GridSearchCV on test data: \
{np.round(grid_search_rfc.score(X_test, y_test), 2)}"
)
y_test_pred = grid_search_rfc.predict(X_test)
print(
    f"RandomForestClassifier f1-score after GridSearchCV on test data: \
{np.round(metrics.f1_score(y_test, y_test_pred), 2)}"
)
print(f"RandomForestClassifier GridSearchCV best hyperparameters: {grid_search_rfc.best_params_}")
print(
    f"RandomForestClassifier GridSearchCV accuracy score on cross-validation: \
{np.round(grid_search_rfc.best_score_, 2)}"
)

RandomForestClassifier f1-score after GridSearchCV on train data: 0.94
RandomForestClassifier accuracy score after GridSearchCV on test data: 0.78
RandomForestClassifier f1-score after GridSearchCV on test data: 0.8
RandomForestClassifier GridSearchCV best hyperparameters: {'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 350}
RandomForestClassifier GridSearchCV accuracy score on cross-validation: 0.79


In [8]:
# RandomSearchCV LogisticRegression

param_distributions_lr = {
    "penalty": ["l2", "none"],
    "solver": ["lbfgs", "sag"],
    "C": list(np.linspace(0.01, 1, 10, dtype=float))
}

random_search_lr = model_selection.RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42, max_iter=1000), 
    param_distributions=param_distributions_lr, 
    cv=5, 
    n_iter = 10, 
    n_jobs = -1
)  
random_search_lr.fit(X_train, y_train) 
print(
    f"LogisticRegression accuracy score after RandomSearchCV on test data: \
{np.round(random_search_lr.score(X_test, y_test), 2)}"
)
y_test_pred = random_search_lr.predict(X_test)
print(
    f"LogisticRegression f1-score after RandomSearchCV on test data: \
{np.round(metrics.f1_score(y_test, y_test_pred), 2)}"
)
print(f"LogisticRegression RandomSearchCV best hyperparameters: {random_search_lr.best_params_}")
print(
    f"LogisticRegression RandomSearchCV accuracy score on cross-validation: \
{np.round(random_search_lr.best_score_, 2)}"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression accuracy score after RandomSearchCV on test data: 0.76
LogisticRegression f1-score after RandomSearchCV on test data: 0.78
LogisticRegression RandomSearchCV best hyperparameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.01}
LogisticRegression RandomSearchCV accuracy score on cross-validation: 0.76


In [9]:
# RandomSearchCV RandomForestClassifier

param_distributions_rfc = {
    "n_estimators": list(range(80, 200, 30)),
    "min_samples_leaf": [5],
    "max_depth": list(np.linspace(20, 40, 10, dtype=int))
}
            
random_search_rfc = model_selection.RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42), 
    param_distributions=param_distributions_rfc, 
    cv=5,
    n_iter = 10, 
    n_jobs = -1
)  
random_search_rfc.fit(X_train, y_train) 
y_train_pred = random_search_rfc.predict(X_train)
print(
    f"RandomForestClassifier f1-score after RandomSearchCV on train data: \
{np.round(metrics.f1_score(y_train, y_train_pred), 2)}"
)
print(
    f"RandomForestClassifier accuracy score after RandomSearchCV on train data: \
{np.round(random_search_rfc.score(X_test, y_test), 2)}"
)
y_test_pred = random_search_rfc.predict(X_test)
print(
    f"RandomForestClassifier f1-score after RandomSearchCV on test data: \
{np.round(metrics.f1_score(y_test, y_test_pred), 2)}"
)
print(f"RandomForestClassifier RandomSearchCV best hyperparameters: {random_search_rfc.best_params_}")
print(
    f"RandomForestClassifier RandomSearchCV accuracy score on cross-validation: \
{np.round(random_search_rfc.best_score_, 2)}"
)

RandomForestClassifier f1-score after RandomSearchCV on train data: 0.95
RandomForestClassifier accuracy score after RandomSearchCV on train data: 0.78
RandomForestClassifier f1-score after RandomSearchCV on test data: 0.8
RandomForestClassifier RandomSearchCV best hyperparameters: {'n_estimators': 170, 'min_samples_leaf': 5, 'max_depth': 22}
RandomForestClassifier RandomSearchCV accuracy score on cross-validation: 0.79


In [10]:
# Hyperpopt LogisticRegression

# space for hyperparameters search
solvers = ["sag", "lbfgs"]
space_lr = {
    "solver" : hp.choice(label="solver", options=solvers),
    "max_iter" : hp.uniform(label="max_iter", low=700, high=1600),
    "C" : hp.uniform(label="C", low=0.01, high=1.0)
}

random_state = 42
def hyperopt_lr(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    # combination of hyperparameters
    params = {
        "solver": params["solver"],
        "max_iter": int(params["max_iter"]), 
        "C": float(params["C"])
    }

    # create and train model with cross-validation
    model = linear_model.LogisticRegression(**params, random_state=random_state)
    score = model_selection.cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()
    
    # minimize metric
    return -score

trials_lr = Trials() # logging
best=fmin(
    hyperopt_lr, 
    space=space_lr, 
    max_evals=20, 
    trials=trials_lr,
    rstate=np.random.default_rng(random_state)
)
print(f"LogisticRegression Hyperopt best hyperparameters: {best}")

# Get accuracy and f1-score for LogisticRegression model
solver = solvers[best["solver"]]

model_lr_hopt = linear_model.LogisticRegression(
    random_state=random_state,
    solver=solver,
    max_iter=int(best["max_iter"]),
    C=float(best["C"]),
)
model_lr_hopt.fit(X_train, y_train)
y_train_pred = model_lr_hopt.predict(X_train)
print(
    f"LogisticRegression f1_score after Hyperpopt on train data: \
{np.round(metrics.f1_score(y_train, y_train_pred), 2)}"
)
print(
    f"LogisticRegression accuracy after Hyperpopt on test data: \
{np.round(model_lr_hopt.score(X_test, y_test), 2)}"
)
y_test_pred = model_lr_hopt.predict(X_test)
print(
    f"LogisticRegression f1_score after Hyperpopt on test data: \
{np.round(metrics.f1_score(y_test, y_test_pred), 2)}"
)

TPE is being used as the default algorithm.


100%|██████████| 20/20 [02:49<00:00,  8.47s/trial, best loss: -0.7929826808799139]
LogisticRegression Hyperopt best hyperparameters: {'C': 0.02692572304826705, 'max_iter': 1515.1584720707733, 'solver': 1}
LogisticRegression f1_score after Hyperpopt on train data: 0.84
LogisticRegression accuracy after Hyperpopt on test data: 0.75
LogisticRegression f1_score after Hyperpopt on test data: 0.78


In [11]:
# Hyperopt RandomForestClassifier

space_rfc={
    "n_estimators": hp.quniform("n_estimators", 100, 200, 1),
    "max_depth": hp.quniform("max_depth", 15, 26, 1),
    "min_samples_leaf": hp.quniform("min_samples_leaf", 2, 10, 1)
}

random_state = 42
def hyperopt_rfc(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    # combination of hyperparameters
    params = {
        "n_estimators": int(params["n_estimators"]), 
        "max_depth": int(params["max_depth"]), 
        "min_samples_leaf": int(params["min_samples_leaf"])
    }

    # create and train model with cross-validation
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)
    score = model_selection.cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    # minimize metric
    return -score

trials_rfc = Trials() # logging

best=fmin(
    hyperopt_rfc,
    space=space_rfc,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials_rfc,
    rstate=np.random.default_rng(random_state)
)

print(f"RandomForestClassifier Hyperopt best hyperparameters: {best}")

# Get accuracy and f1-score for RandomForestClassifier model
model_rfc_hopt = ensemble.RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf'])
)
model_rfc_hopt.fit(X_train, y_train)
y_train_pred = model_rfc_hopt.predict(X_train)
print(
    f"RandomForestClassifier f1_score after Hyperpopt on train data: \
{np.round(metrics.f1_score(y_train, y_train_pred), 2)}"
)
print(
    f"RandomForestClassifier accuracy after Hyperpopt on test data: \
{np.round(model_rfc_hopt.score(X_test, y_test), 2)}"
)
y_test_pred = model_rfc_hopt.predict(X_test)
print(
    f"RandomForestClassifier f1_score after Hyperpopt on test data: \
{np.round(metrics.f1_score(y_test, y_test_pred), 2)}"
)

100%|██████████| 20/20 [00:31<00:00,  1.59s/trial, best loss: -0.8160803811393121]
RandomForestClassifier Hyperopt best hyperparameters: {'max_depth': 18.0, 'min_samples_leaf': 2.0, 'n_estimators': 103.0}
RandomForestClassifier f1_score after Hyperpopt on train data: 0.99
RandomForestClassifier accuracy after Hyperpopt on test data: 0.79
RandomForestClassifier f1_score after Hyperpopt on test data: 0.81


In [12]:
# Optuna LogisticRegression

random_state=12
def optuna_lr(trial):
  
  # space for hyperparameters search
  solver = trial.suggest_categorical(name="solver", choices=["sag", "lbfgs", "saga"])
  max_iter = trial.suggest_int(name="max_iter" , low=700, high=1600, step=10)
  C = trial.suggest_float(name="C", low=0.01, high=1.0)

  # create and train model
  model = linear_model.LogisticRegression(solver=solver,
                                          max_iter=max_iter,
                                          C=C,
                                          random_state=random_state)
  model.fit(X_train, y_train)
  score = metrics.f1_score(y_train, model.predict(X_train))

  return score

# Create study object
# give argument direction="maximize"
study_lr = optuna.create_study(study_name="LogisticRefression", direction="maximize")

# search best combination of hypreparameters n_trials times
study_lr.optimize(optuna_lr, n_trials=20)

# Get best results on train data
print(f"Best values of hyperparameters for LogisticRegression: {study_lr.best_params}")
print(
  f"f1_score on train data for LogisticRegression: {np.round(study_lr.best_value, 2)}"
  )
print()

# accuracy on test data
model_lr_optuna = linear_model.LogisticRegression(**study_lr.best_params,random_state=random_state, )
model_lr_optuna.fit(X_train, y_train)
y_train_pred = model_lr_optuna.predict(X_train)
print(
    f"Accuracy on test data for LogisticRegression: {np.round(model_lr_optuna.score(X_test, y_test), 2)}")
y_test_pred = model_lr_optuna.predict(X_test)
print(f"f1_score on test data for LogisticRegression: {np.round(metrics.f1_score(y_test, y_test_pred), 2)}")

[32m[I 2022-10-30 00:58:23,497][0m A new study created in memory with name: LogisticRefression[0m
[32m[I 2022-10-30 00:58:44,718][0m Trial 0 finished with value: 0.8865291262135923 and parameters: {'solver': 'saga', 'max_iter': 910, 'C': 0.6290324966249393}. Best is trial 0 with value: 0.8865291262135923.[0m
[32m[I 2022-10-30 00:59:04,573][0m Trial 1 finished with value: 0.8813353566009106 and parameters: {'solver': 'sag', 'max_iter': 720, 'C': 0.44081449105049414}. Best is trial 0 with value: 0.8865291262135923.[0m
[32m[I 2022-10-30 00:59:17,563][0m Trial 2 finished with value: 0.84688995215311 and parameters: {'solver': 'saga', 'max_iter': 1080, 'C': 0.06643987564083981}. Best is trial 0 with value: 0.8865291262135923.[0m
[32m[I 2022-10-30 00:59:18,494][0m Trial 3 finished with value: 0.8771610555050044 and parameters: {'solver': 'lbfgs', 'max_iter': 1580, 'C': 0.34257490472396446}. Best is trial 0 with value: 0.8865291262135923.[0m
[32m[I 2022-10-30 00:59:31,759][0m

Best values of hyperparameters for LogisticRegression: {'solver': 'sag', 'max_iter': 1600, 'C': 0.9850841145540543}
f1_score on train data for LogisticRegression: 0.89

Accuracy on test data for LogisticRegression: 0.75
f1_score on test data for LogisticRegression: 0.78


In [13]:
# Optuna RandomForestClassifier

random_state=12
def optuna_rfc(trial):
  
  # space for hyperparameters search
  n_estimators = trial.suggest_int(name="n_estimators", low=80, high=300, step=10)
  max_depth = trial.suggest_int("max_depth", 15, 40, 1)
  min_samples_leaf = trial.suggest_int("min_samples_leaf", 3, 7, 1)

  # create and train model
  model = ensemble.RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          random_state=random_state)
  
  model.fit(X_train, y_train)
  score = metrics.f1_score(y_train, model.predict(X_train))

  return score

study_rfc = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
# search best combination of hypreparameters n_trials times
study_rfc.optimize(optuna_rfc, n_trials=20)

# Get best results on train data
print(f"Best values of hyperparameters for RandomForestClassifier: {study_rfc.best_params}")
print(
  f"f1_score on train data for RandomForestClassifier: {np.round(study_rfc.best_value, 2)}"
  )
print()

# accuracy on test data
model_rfc_optuna = ensemble.RandomForestClassifier(**study_rfc.best_params,random_state=random_state, )
model_rfc_optuna.fit(X_train, y_train)
y_train_pred = model_rfc_optuna.predict(X_train)
print(
    f"Accuracy on test data for RandomForestClassifier: {np.round(model_rfc_optuna.score(X_test, y_test), 2)}")
y_test_pred = model_rfc_optuna.predict(X_test)
print(f"f1_score on test data for RandomForestClassifier: {np.round(metrics.f1_score(y_test, y_test_pred), 2)}")

[32m[I 2022-10-30 01:04:39,630][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2022-10-30 01:04:41,052][0m Trial 0 finished with value: 0.9243902439024391 and parameters: {'n_estimators': 160, 'max_depth': 34, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.9243902439024391.[0m
[32m[I 2022-10-30 01:04:43,196][0m Trial 1 finished with value: 0.9622411693057247 and parameters: {'n_estimators': 210, 'max_depth': 25, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.9622411693057247.[0m
[32m[I 2022-10-30 01:04:45,772][0m Trial 2 finished with value: 0.9749847467968273 and parameters: {'n_estimators': 250, 'max_depth': 23, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.9749847467968273.[0m
[32m[I 2022-10-30 01:04:46,772][0m Trial 3 finished with value: 0.9335770871419867 and parameters: {'n_estimators': 110, 'max_depth': 23, 'min_samples_leaf': 6}. Best is trial 2 with value: 0.9749847467968273.[0m
[32m[I 2022-10-30 01:04:47,594

Best values of hyperparameters for RandomForestClassifier: {'n_estimators': 240, 'max_depth': 40, 'min_samples_leaf': 3}
f1_score on train data for RandomForestClassifier: 0.98

Accuracy on test data for RandomForestClassifier: 0.78
f1_score on test data for RandomForestClassifier: 0.8
