In [3]:
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics

import hyperopt
from hyperopt import hp, fmin, tpe, Trials

import optuna

In [4]:
df = pd.read_csv('data/_train_sem09 (1).csv')

In [5]:
df.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [6]:
def print_f1_score(y_train, y_test, y_train_pred, y_test_pred):
    print('f1 score train: ', metrics.f1_score(y_train, y_train_pred))
    print('f1 score test: ', metrics.f1_score(y_test, y_test_pred))

In [7]:
X = df.drop('Activity', axis=1)
y = df['Activity']

In [8]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

LogisticRegression

In [9]:
space_lr = {
    'penalty' : ['none', 'l1', 'l2', 'elasticnet'],
    'solver'  : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C'       : list(np.linspace(0.01, 1, 10, dtype=float))
}

GridSearch

In [None]:


grid_search = model_selection.GridSearchCV(
    estimator=linear_model.LogisticRegression(max_iter=50, random_state=42),
    param_grid=space_lr,
    cv=5,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)



In [11]:
print_f1_score(y_train, y_test, y_train_pred, y_test_pred)

f1 score train:  0.8397553516819573
f1 score test:  0.8


RandomSearch

In [None]:
random_search = model_selection.RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42, max_iter=50),
    param_distributions=space_lr,
    cv=5,
    n_iter=50,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

y_train_pred = random_search.predict(X_train)
y_test_pred = random_search.predict(X_test)



In [13]:
print_f1_score(y_train, y_test, y_train_pred, y_test_pred)

f1 score train:  0.8580227902679396
f1 score test:  0.7977142857142857


Hyperopt

In [22]:
penalties = ['none', 'l1', 'l2', 'elasticnet']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
space_lr = {
       'penalty': hp.choice('penalty', penalties),
       'solver' : hp.choice('solver', solvers),
       'C': hp.quniform('C', 0.01, 1, 0.1)
      }

In [15]:
def hyperopt_lr(params, cv=5, X=X_train, y=y_train, random_state=42):
    params = {
        'penalty' : params['penalty'],
        'solver'  : params['solver'],
        'C'       : float(params['C'])
    }
    
    model = linear_model.LogisticRegression(**params, random_state=random_state, max_iter=2000)
    
    score = model_selection.cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1).mean()
    
    return -score

In [None]:
%%time

trials = Trials()

best = fmin(
    hyperopt_lr,
    space=space_lr,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)
)

In [21]:
print(best)

{'C': 0.1, 'penalty': 2, 'solver': 1}


In [None]:
model = linear_model.LogisticRegression(
    random_state=42,
    C=best['C'],
    penalty=penalties[best['penalty']],
    solver=solvers[best['solver']]
)

model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)


In [24]:
print_f1_score(y_train, y_test, y_train_pred, y_test_pred)

f1 score train:  0.8527607361963189
f1 score test:  0.7927107061503417


Optuna

In [12]:
def optuna_lr(trial):
    penalty = trial.suggest_categorical('penalty', ['none', 'l1', 'l2', 'elasticnet'])
    solver = trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
    C = trial.suggest_uniform('C', 0.01, 0.91)
    
    model = linear_model.LogisticRegression(
        penalty=penalty,
        solver=solver,
        C=C,
        random_state=42
    )
    
    score = model_selection.cross_val_score(model, X_train, y_train, scoring='f1', cv=5, n_jobs=-1).mean()
    
    return score
    
    

In [None]:
study = optuna.create_study(study_name="LogisticRegression", direction='maximize')
study.optimize(optuna_lr, n_jobs=-1, n_trials=50)

In [14]:
model = linear_model.LogisticRegression(**study.best_params, random_state=42)

model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print_f1_score(y_train, y_test, y_train_pred, y_test_pred)

f1 score train:  0.8413329257107918
f1 score test:  0.7954285714285714




RandomForestClassifier

In [15]:
space_rf = {
    'n_estimators'      : list(range(80, 290, 30)),
    'max_depth'         : list(range(20, 40, 2)),
    'min_samples_leaf'  : list(range(1, 10, 1))
}

GridSearch

In [16]:


grid_search = model_selection.GridSearchCV(
    estimator=ensemble.RandomForestClassifier(n_jobs=-1, random_state=42),
    param_grid=space_rf,
    cv=5,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)

In [17]:
print_f1_score(y_train, y_test, y_train_pred, y_test_pred)

f1 score train:  1.0
f1 score test:  0.8341013824884792


RandomSearch

In [18]:
random_search = model_selection.RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(n_jobs=-1, random_state=42),
    param_distributions=space_rf,
    cv=5,
    n_iter=50,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

y_train_pred = random_search.predict(X_train)
y_test_pred = random_search.predict(X_test)

In [19]:
print_f1_score(y_train, y_test, y_train_pred, y_test_pred)

f1 score train:  1.0
f1 score test:  0.8300578034682079


Hyperopt

In [26]:
space_rf = {
       'n_estimators'       : hp.quniform('n_estimators', 80, 290, 30),
       'max_depth'          : hp.quniform('max_depth', 20, 40, 2),
       'min_samples_leaf'   : hp.quniform('min_samples_leaf', 1, 10, 1)
      }

In [27]:
def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=42):
    params = {
        'n_estimators'      : int(params['n_estimators']),
        'max_depth'         : int(params['max_depth']),
        'min_samples_leaf'  : int(params['min_samples_leaf'])
    }
    
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)
    
    score = model_selection.cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1).mean()
    
    return -score

In [29]:
trials = Trials()
best = fmin(
    hyperopt_rf,
    space=space_rf,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)
)

TPE is being used as the default algorithm.


100%|██████████| 50/50 [05:07<00:00,  6.14s/trial, best loss: -0.8118633701308795]


In [32]:
print(best)

{'max_depth': 34.0, 'min_samples_leaf': 2.0, 'n_estimators': 240.0}


In [33]:
model = ensemble.RandomForestClassifier(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf']),
    random_state=42
)

model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print_f1_score(y_train, y_test, y_train_pred, y_test_pred)

f1 score train:  0.9912826899128268
f1 score test:  0.8333333333333334


Optuna

In [36]:
def optuna_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 80, 290, 30)
    max_depth = trial.suggest_int('max_depth', 20, 40, 2)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10, 1)
    
    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    
    score = model_selection.cross_val_score(model, X_train, y_train, scoring='f1', cv=5, n_jobs=-1).mean()
    
    return score

In [38]:
study = optuna.create_study(study_name="RandomForestClassifier", direction='maximize')
study.optimize(optuna_rf, n_trials=50)

[32m[I 2022-07-07 00:16:21,208][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2022-07-07 00:16:30,521][0m Trial 0 finished with value: 0.812298806670633 and parameters: {'n_estimators': 260, 'max_depth': 32, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.812298806670633.[0m
[32m[I 2022-07-07 00:16:34,804][0m Trial 1 finished with value: 0.7979462067492635 and parameters: {'n_estimators': 140, 'max_depth': 24, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.812298806670633.[0m
[32m[I 2022-07-07 00:16:37,216][0m Trial 2 finished with value: 0.8006754801947421 and parameters: {'n_estimators': 80, 'max_depth': 38, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.812298806670633.[0m
[32m[I 2022-07-07 00:16:44,594][0m Trial 3 finished with value: 0.8129178571410934 and parameters: {'n_estimators': 230, 'max_depth': 38, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.8129178571410934.[0m
[32m[I 2022-07-07 00:16:52,471][0m

In [39]:
model = ensemble.RandomForestClassifier(**study.best_params, random_state=42)

model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print_f1_score(y_train, y_test, y_train_pred, y_test_pred)

f1 score train:  1.0
f1 score test:  0.8300578034682079
