In [33]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn import linear_model 
from sklearn import tree 
from sklearn import ensemble 
from sklearn import metrics 
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split 

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import cross_val_score
import hyperopt
from hyperopt import hp, fmin, tpe, Trials

import optuna

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Предварительная обработка не требуется, данные уже закодированы и нормализованы.

В качестве метрики будем использовать F1-score.

Необходимо обучить две модели: логистическую регрессию и случайный лес. Далее нужно сделать подбор гиперпараметров с помощью базовых и продвинутых методов оптимизации. Важно использовать все четыре метода (GridSeachCV, RandomizedSearchCV, Hyperopt, Optuna) хотя бы по разу, максимальное количество итераций не должно превышать 50.

In [34]:
data = pd.read_csv('./data/_train_sem09 (1).csv')

In [35]:
data.head()


Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [36]:
y = data['Activity']
X = data.drop(['Activity'], axis=1)

In [37]:
y.value_counts(normalize=True)

1    0.542255
0    0.457745
Name: Activity, dtype: float64

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [39]:
random_state = 42

---

### <center> **BaseLine**

In [40]:
# Logistic regression
base_logreg = linear_model.LogisticRegression(random_state=random_state)
base_logreg.fit(X_train, y_train)
y_bl_lgr_pred = base_logreg.predict(X_test)

print(
    f'f1 score for logreg baseline: {metrics.f1_score(y_test, y_bl_lgr_pred).round(2)}')

# Random forest
base_rfc = ensemble.RandomForestClassifier(random_state=random_state)
base_rfc.fit(X_train, y_train)
y_bl_rfc_pred = base_rfc.predict(X_test)

print(
    f'f1 score for randomforest baseline: {metrics.f1_score(y_test, y_bl_rfc_pred).round(2)}')


f1 score for logreg baseline: 0.8
f1 score for randomforest baseline: 0.83


> Мы можем видеть достаточно высокие метрики для базовых моделей, и на примере регрессии и на примере леса, в данном случае это объяснимо предподгоовкой данных. Следовательно дальнейшим подбором гиперпараметров, мы должны попытаться получить прирост целевой метрики и остановится на том методе оптимизации, который, что логично, этот прирост обеспечит.

---

### <center> **Grid Search CV**

In [41]:
# Linear regression
param_grid = [
    {
        'penalty': ['l2', 'none'],  
        'solver': ['lbfgs', 'sag'],  
        'C': list(np.linspace(0.01, 1, 10, dtype=float))},  

    {
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'C': list(np.linspace(0.01, 1, 10, dtype=float))}
]


grid_search = GridSearchCV(
    estimator=linear_model.LogisticRegression(
        random_state=random_state,
        max_iter=50 
    ), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)

%time grid_search.fit(X_train, y_train) 
y_test_pred = grid_search.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search.best_params_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CPU times: user 1.82 s, sys: 507 ms, total: 2.33 s
Wall time: 56.6 s
f1_score на тестовом наборе: 0.80
Наилучшие значения гиперпараметров: {'C': 0.45, 'penalty': 'l2', 'solver': 'lbfgs'}


In [42]:
# Random forest
param_grid = {
        'n_estimators': [100, 200],  
        'criterion': ['gini', 'entropy'],  
        'max_depth': [5, 6, 7],
        'min_samples_leaf': [5, 7, 10]} 
    
grid_search = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(
        random_state=random_state
    ),
    cv=5,
    n_jobs=-1,
    param_grid=param_grid
)

%time grid_search.fit(X_train, y_train) 
y_test_pred = grid_search.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search.best_params_))

CPU times: user 1.36 s, sys: 490 ms, total: 1.85 s
Wall time: 22.6 s
f1_score на тестовом наборе: 0.80
Наилучшие значения гиперпараметров: {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 5, 'n_estimators': 200}


> Вывод: И логистическая регрессия и случайный лес показали одинаковое значение f1 метрики на подборе через сетку параметров. При этом мы не можем назвать прирост целевой метрики, относительно базового подхода, выходящим за пределы погрешности.

---

### <center> **RandomizedSearchCV**

In [43]:
# Logistic regression
param_distributions = [
    {
        'penalty': ['l2', 'none'],  
        'solver': ['lbfgs', 'sag'],  
        'C': list(np.linspace(0.01, 1, 10, dtype=float))},  

    {
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'C': list(np.linspace(0.01, 1, 10, dtype=float))}
]

random_search = RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(
        random_state=random_state,
        max_iter=50
    ),
    n_iter=50,
    n_jobs=-1,
    cv=5,
    param_distributions=param_distributions
)

%time random_search.fit(X_train, y_train) 
y_test_pred = random_search.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search.best_params_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CPU times: user 1.47 s, sys: 382 ms, total: 1.86 s
Wall time: 35.5 s
f1_score на тестовом наборе: 0.80
Наилучшие значения гиперпараметров: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.45}


In [44]:
# Random forest
param_distributions = {
    'min_samples_leaf': list(np.linspace(5, 100, 50, dtype=int)),
    'max_depth': list(np.linspace(1, 30, 50, dtype=int)),
    'criterion': ['entropy', 'gini']
}

random_search_tree = RandomizedSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=random_state),
    param_distributions=param_distributions,
    cv=5,
    n_iter=50,
    n_jobs=-1
)
%time random_search_tree.fit(X_train, y_train)
y_test_pred = random_search_tree.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(
    metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(
    random_search_tree.best_params_))


CPU times: user 784 ms, sys: 372 ms, total: 1.16 s
Wall time: 9.83 s
f1_score на тестовом наборе: 0.80
Наилучшие значения гиперпараметров: {'min_samples_leaf': 47, 'max_depth': 4, 'criterion': 'entropy'}


> Вывод: RandomSearch дал выигрыш по времени и в случае LogReg и в случае RF, значение метрики не изменилось, а на RF даже ухудшилось (0.81 против 0.77) в сравнении с оптимизацией через сетку. Так же стоит отметить, что подбор параметров, относительно базового сценария, не дал по итогу положительных результатов.

---

### <center> Hyperopt

In [45]:
# Linear regression
space = {
    'penalty': hp.choice('penalty', ['l1', 'l2']),
    'solver': hp.choice('solver', ['liblinear', 'saga']),
    'C': hp.uniform('C', 0, 1)
}

# Не могу победить, пришлось костылить. В дальнейшем hyperopt для 
# значений penalty и solver вщзвращал позицию элемента в space а не его строковое значение, что привело к решению примостырить костыль
crutch = {
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

def hyperopt_linreg(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    params = {
        'penalty': params['penalty'],
        'solver': params['solver'],
        'C': float(params['C'])
    }

    model = linear_model.LogisticRegression(
        **params, random_state=random_state, max_iter=50
    )

    model.fit(X, y)
    y_pred = model.predict(X)
    score = cross_val_score(model, X, y, cv=cv, scoring="f1").mean()

    return -score


trials = Trials()

best_params = fmin(
    hyperopt_linreg,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    rstate=np.random.default_rng(random_state)
)

print(f'Наилучшие значения гиперпараметров {best_params}.')


model_logreg = linear_model.LogisticRegression(
    penalty=crutch['penalty'][best_params['penalty']],
    solver=crutch['solver'][best_params['solver']],
    C=float(best_params['C'])
)

model_logreg.fit(X_train, y_train)

y_test_pred = model_logreg.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(
    metrics.f1_score(y_test, y_test_pred)))

100%|██████████| 20/20 [01:41<00:00,  5.07s/trial, best loss: -0.7796600577461212]
Наилучшие значения гиперпараметров {'C': 0.16102410861748417, 'penalty': 1, 'solver': 1}.
f1_score на тестовом наборе: 0.81


In [46]:
# Random forest
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
    'max_depth': hp.quniform('max_depth', 15, 26, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 10, 1)
}

def hyperopt_rfc(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'min_samples_leaf': int(params['min_samples_leaf'])
        }

    model = ensemble.RandomForestClassifier(
        **params, random_state=random_state)

    model.fit(X, y)
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score

trials = Trials()  # используется для логирования результатов

best = fmin(hyperopt_rfc,  
            space=space,  
            algo=tpe.suggest,  
            max_evals=20, 
            trials=trials, 
            rstate=np.random.default_rng(random_state)
            )

print(f'Наилучшие значения гиперпараметров {best}.')

model = ensemble.RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf'])
)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

print('f1_score на тестовом наборе: {:.2f}'.format(
    metrics.f1_score(y_test, y_test_pred)))

100%|██████████| 20/20 [00:44<00:00,  2.23s/trial, best loss: -0.8054998529695212]
Наилучшие значения гиперпараметров {'max_depth': 19.0, 'min_samples_leaf': 3.0, 'n_estimators': 132.0}.
f1_score на тестовом наборе: 0.83


> Вывод: Отличия от baseline модели не существенны, но в целом можно сказать, что по паритету скорость/прирост метрики, среди оптимизационныъ подходов пока лидирует hyperopt.

---

### <center> Optuna

In [47]:
# Linear regression
def optuna_logreg(trial):
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    C = trial.suggest_float('C', 0.1, 2, step=0.2)

    model = linear_model.LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        random_state=random_state
    )
    model.fit(X_train, y_train)
    score = metrics.f1_score(y_train, model.predict(X_train))

    return score

study_logreg = optuna.create_study(study_name='LogisticRegression', direction='maximize')
study_logreg.optimize(optuna_logreg, n_trials=20)

print(f'Лучшие значения гиперпараметров: {study_logreg.best_params}')

model_logreg = linear_model.LogisticRegression(**study_logreg.best_params, random_state=random_state)
model_logreg.fit(X_train, y_train)

y_test_pred = model_logreg.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(
    metrics.f1_score(y_test, y_test_pred)))

[32m[I 2023-04-13 17:42:42,221][0m A new study created in memory with name: LogisticRegression[0m
[32m[I 2023-04-13 17:42:46,743][0m Trial 0 finished with value: 0.8682868286828683 and parameters: {'penalty': 'l1', 'solver': 'saga', 'C': 1.9}. Best is trial 0 with value: 0.8682868286828683.[0m
[32m[I 2023-04-13 17:42:49,334][0m Trial 1 finished with value: 0.864767616191904 and parameters: {'penalty': 'l2', 'solver': 'saga', 'C': 0.30000000000000004}. Best is trial 0 with value: 0.8682868286828683.[0m
[32m[I 2023-04-13 17:42:49,628][0m Trial 2 finished with value: 0.8904316329610625 and parameters: {'penalty': 'l2', 'solver': 'liblinear', 'C': 1.1}. Best is trial 2 with value: 0.8904316329610625.[0m
[32m[I 2023-04-13 17:42:49,818][0m Trial 3 finished with value: 0.8490284005979073 and parameters: {'penalty': 'l1', 'solver': 'liblinear', 'C': 0.5}. Best is trial 2 with value: 0.8904316329610625.[0m
[32m[I 2023-04-13 17:42:50,067][0m Trial 4 finished with value: 0.876539

Лучшие значения гиперпараметров: {'penalty': 'l2', 'solver': 'liblinear', 'C': 1.9}
f1_score на тестовом наборе: 0.80


In [48]:
# Random forest
def optuna_rfс(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 200, 1)
    max_depth = trial.suggest_int('max_depth', 10, 30, 1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10, 1)

    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state)
    
    model.fit(X_train, y_train)
    score = metrics.f1_score(y_train, model.predict(X_train))

    return score


study_rfc = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
study_rfc.optimize(optuna_rfс, n_trials=20)

print(f'Лучшие значения гиперпараметров: {study_rfc.best_params}')


model_rfc = ensemble.RandomForestClassifier(**study_rfc.best_params,random_state=random_state)
model_rfc.fit(X_train, y_train)

y_test_pred = model_rfc.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(
    metrics.f1_score(y_test, y_test_pred)))

[32m[I 2023-04-13 17:43:04,449][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2023-04-13 17:43:05,232][0m Trial 0 finished with value: 0.9037126471476002 and parameters: {'n_estimators': 125, 'max_depth': 11, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.9037126471476002.[0m
[32m[I 2023-04-13 17:43:06,025][0m Trial 1 finished with value: 0.9183115699969632 and parameters: {'n_estimators': 127, 'max_depth': 10, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.9183115699969632.[0m
[32m[I 2023-04-13 17:43:07,378][0m Trial 2 finished with value: 0.9760242792109256 and parameters: {'n_estimators': 174, 'max_depth': 19, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.9760242792109256.[0m
[32m[I 2023-04-13 17:43:08,540][0m Trial 3 finished with value: 0.9520631067961165 and parameters: {'n_estimators': 162, 'max_depth': 13, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.9760242792109256.[0m
[32m[I 2023-04-13 17:43:09,299

Лучшие значения гиперпараметров: {'n_estimators': 184, 'max_depth': 29, 'min_samples_leaf': 2}
f1_score на тестовом наборе: 0.83


> Вывод: Optuna не дала прироста в метрике.

### Общий вывод: Все вышеперечисленные методы не дали видимых результатов в приросте метрики. Возможно в данном конкретном случае предустановленные гиперпараметры моделей оказались оптимальными.