In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

%matplotlib inline
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and the old
# alias was removed in later releases; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [2]:
# Read the training table from the local CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/_train_sem09 (1).csv')

In [3]:
# Preview the first five rows (five is the default row count of head()).
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Seed shared by the train/test split, the estimators and the Hyperopt search below.
random_state = 42

In [5]:
# Split the table into the target column and the remaining feature columns.
y = data['Activity']
X = data.drop(columns=['Activity'])

# Hold out 20% of the rows for testing; stratify so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

## BaseLine для модели LogisticRegression

In [6]:
# Baseline logistic regression with default hyperparameters.
# max_iter is raised to 1000 and the notebook-wide seed is reused instead of a
# hard-coded literal so that every model shares one source of randomness.
log_reg_base = linear_model.LogisticRegression(
    max_iter=1000,
    random_state=random_state
)

log_reg_base.fit(X_train, y_train)

In [7]:
# Score the baseline model on the held-out test split.
y_test_pred = log_reg_base.predict(X_test)
test_f1 = metrics.f1_score(y_test, y_test_pred)
print('F1-score на тестовом выборке {:.2f}'.format(test_f1))

# Per-class precision / recall / F1 breakdown.
print(metrics.classification_report(y_test, y_test_pred))

F1-score на тестовом выборке 0.78
              precision    recall  f1-score   support

           0       0.74      0.71      0.72       344
           1       0.76      0.79      0.78       407

    accuracy                           0.75       751
   macro avg       0.75      0.75      0.75       751
weighted avg       0.75      0.75      0.75       751



Базовый показатель F1-score: 0.78

## GridSearchCV для модели LogisticRegression

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
param_grid = [
    {
        'penalty': ['l2', 'none'],
        'solver': ['lbfgs', 'saga']
    }
]
grid_search = GridSearchCV(
    estimator=linear_model.LogisticRegression(
        max_iter=1000,
        random_state=random_state
    ), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs=-1
)  
%time grid_search.fit(X_train, y_train) 
y_test_pred = grid_search.predict(X_test)
print('F1-score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print('Наилучшие значения гиперпараметров: {}'.format(grid_search.best_params_))

CPU times: total: 9.64 s
Wall time: 9min 20s
F1-score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'penalty': 'l2', 'solver': 'lbfgs'}


Улучшения целевой метрики добиться не удалось

## RandomizedSearchCV для модели LogisticRegression

In [10]:
from sklearn.model_selection import RandomizedSearchCV

In [11]:
param_grid_rnd = {
    'penalty': ['l2', 'none'] ,
    'solver': ['lbfgs', 'sag'],
    'C': list(np.linspace(0.01, 1, 10, dtype=float))
},
            
random_search = RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(
        max_iter=1000,
        random_state=random_state
    ), 
    param_distributions=param_grid_rnd, 
    cv=5, 
    n_iter=10, 
    n_jobs=-1
)  

%time random_search.fit(X_train, y_train)
y_test_pred = random_search.predict(X_test)
print('F1-score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print('Наилучшие значения гиперпараметров: {}'.format(random_search.best_params_))

CPU times: total: 7.44 s
Wall time: 26min 35s
F1-score на тестовом наборе: 0.78
Наилучшие значения гиперпараметров: {'solver': 'sag', 'penalty': 'l2', 'C': 0.01}


За счет того, что метод работает быстрее, чем GridSearchCV, можно задать более широкое пространство перебираемых параметров.

Однако данный метод, даже несмотря на более широкое пространство параметров, не дал улучшения целевой метрики относительно BaseLine.

## Hyperopt для модели LogisticRegression

In [12]:
from sklearn.model_selection import cross_val_score
import hyperopt
from hyperopt import hp, fmin, tpe, Trials

# Print the installed Hyperopt version.
print("Версия Hyperopt : {}".format(hyperopt.__version__))

Версия Hyperopt : 0.2.7


In [13]:
# Hyperopt search space for logistic regression: two categorical choices
# (penalty, solver) and a uniformly distributed regularization strength C.
space = dict(
    penalty=hp.choice('penalty', ['l2', 'none']),
    solver=hp.choice('solver', ['newton-cg', 'sag', 'saga', 'lbfgs']),
    C=hp.uniform('C', 0.01, 1),
)

In [14]:
def hyperopt_lr(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    """Hyperopt objective for LogisticRegression.

    Args:
        params: sampled hyperparameters with the keys 'penalty', 'solver' and 'C'.
        cv: cross-validation setting forwarded to ``cross_val_score``.
        X: feature matrix used for cross-validation.
        y: target values used for cross-validation.
        random_state: seed passed to the estimator.

    Returns:
        The negated mean cross-validated F1 score; negated because the caller
        minimizes the objective.
    """
    estimator = linear_model.LogisticRegression(
        penalty=params['penalty'],
        solver=params['solver'],
        C=float(params['C']),
        max_iter=1000,
        random_state=random_state
    )

    # One F1 value per fold; the objective is their mean with the sign flipped.
    fold_scores = cross_val_score(estimator, X, y, cv=cv, scoring="f1", n_jobs=-1)
    return -fold_scores.mean()

In [15]:
%%time

trials = Trials()

best=fmin(
    hyperopt_lr,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    rstate=np.random.default_rng(random_state)
)
print('Наилучшие значения гиперпараметров {}'.format(best))

100%|██████████| 20/20 [42:30<00:00, 127.53s/trial, best loss: -0.7902171664645037]
Наилучшие значения гиперпараметров {'C': 0.05084775379720359, 'penalty': 0, 'solver': 0}
CPU times: total: 1.64 s
Wall time: 42min 30s


In [16]:
# fmin reports hp.choice parameters as option indices (e.g. 'penalty': 0);
# space_eval maps them back to the actual values from the search space.
best_params = hyperopt.space_eval(space, best)
print(best_params)

{'C': 0.05084775379720359, 'penalty': 'l2', 'solver': 'newton-cg'}


In [17]:
# Refit logistic regression with the Hyperopt-selected hyperparameters and
# compute F1 on the test split.
model_lr = linear_model.LogisticRegression(
    max_iter=1000,
    random_state=random_state,
    C=float(best_params['C']),
    solver=best_params['solver'],
    penalty=best_params['penalty']
)
model_lr.fit(X_train, y_train)

y_test_pred = model_lr.predict(X_test)
test_f1 = metrics.f1_score(y_test, y_test_pred)
print('F1-score на тестовом выборке {:.2f}'.format(test_f1))

# Per-class precision / recall / F1 breakdown.
print(metrics.classification_report(y_test, y_test_pred))

F1-score на тестовом выборке 0.79
              precision    recall  f1-score   support

           0       0.76      0.69      0.73       344
           1       0.76      0.82      0.79       407

    accuracy                           0.76       751
   macro avg       0.76      0.76      0.76       751
weighted avg       0.76      0.76      0.76       751



Данный метод подбора гиперпараметров позволил улучшить целевую метрику с 0.78 до 0.79

## Optuna для модели LogisticRegression

In [18]:
import optuna

# Print the installed Optuna version.
print("Версия Optuna: {}".format(optuna.__version__))

Версия Optuna: 3.0.3


In [19]:
def optuna_lr(trial):
    """Optuna objective for LogisticRegression.

    Asks the trial for 'penalty', 'solver' and 'C', builds the model and returns
    its mean 5-fold cross-validated F1 score on the training split.
    """
    # Suggestions are requested in the same order as the search space is listed.
    suggested = {
        'penalty': trial.suggest_categorical('penalty', ['l2', 'none']),
        'solver': trial.suggest_categorical('solver', ['newton-cg', 'sag', 'saga', 'lbfgs']),
        'C': trial.suggest_float('C', 0.01, 1),
    }

    estimator = linear_model.LogisticRegression(
        **suggested,
        max_iter=1000,
        random_state=random_state
    )

    # One F1 value per fold; the objective is their mean.
    fold_scores = cross_val_score(estimator, X_train, y_train, cv=5, scoring="f1", n_jobs=-1)
    return fold_scores.mean()

In [20]:
%%time

# необходимо максимизировать метрику => direction="maximize"
study = optuna.create_study(study_name="LogisticRegression", direction="maximize")
study.optimize(optuna_lr, n_trials=20)

[32m[I 2022-10-19 08:09:23,543][0m A new study created in memory with name: LogisticRegression[0m
[32m[I 2022-10-19 08:12:17,913][0m Trial 0 finished with value: 0.7597800660015992 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.2778164748439764}. Best is trial 0 with value: 0.7597800660015992.[0m
[32m[I 2022-10-19 08:17:30,324][0m Trial 1 finished with value: 0.7626510635585347 and parameters: {'penalty': 'none', 'solver': 'saga', 'C': 0.17376873090670436}. Best is trial 1 with value: 0.7626510635585347.[0m
[32m[I 2022-10-19 08:21:23,706][0m Trial 2 finished with value: 0.7597800660015992 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.9476564950457328}. Best is trial 1 with value: 0.7626510635585347.[0m
[32m[I 2022-10-19 08:21:36,557][0m Trial 3 finished with value: 0.7754927645452376 and parameters: {'penalty': 'l2', 'solver': 'newton-cg', 'C': 0.9629668753862248}. Best is trial 3 with value: 0.7754927645452376.[0m
[32m[I 2022-10-19 08:24:01,0

CPU times: total: 1.7 s
Wall time: 27min 12s


In [22]:
# Report the best trial of the study.
# NOTE(review): study.best_value is the best objective value, and the objective
# (optuna_lr) returns the mean cross-validated F1 on the training split, so the
# second line shows a cross-validation score rather than a plain training-set score.
print('Наилучшие значения гиперпараметров {}'.format(study.best_params))
print('F1-score на обучающем наборе: {:.2f}'.format(study.best_value))

Наилучшие значения гиперпараметров {'penalty': 'l2', 'solver': 'newton-cg', 'C': 0.01965899746649856}
F1-score на обучающем наборе: 0.79


In [23]:
# Refit logistic regression with the Optuna-selected hyperparameters and
# compute F1 on the test split.
best_lr_params = study.best_params
model_lr = linear_model.LogisticRegression(
    max_iter=1000,
    random_state=random_state,
    **best_lr_params
)
model_lr.fit(X_train, y_train)

y_test_pred = model_lr.predict(X_test)
test_f1 = metrics.f1_score(y_test, y_test_pred)
print('F1-score на тестовом выборке {:.2f}'.format(test_f1))

# Per-class precision / recall / F1 breakdown.
print(metrics.classification_report(y_test, y_test_pred))

F1-score на тестовом выборке 0.78
              precision    recall  f1-score   support

           0       0.75      0.69      0.72       344
           1       0.75      0.81      0.78       407

    accuracy                           0.75       751
   macro avg       0.75      0.75      0.75       751
weighted avg       0.75      0.75      0.75       751



На тестовом наборе данный метод улучшения целевой метрики не дал

## BaseLine для модели RandomForestClassifier

In [24]:
# Baseline random forest with hand-picked hyperparameters.
rf_baseline_params = dict(n_estimators=100, max_depth=15, min_samples_leaf=5)
model_rf = ensemble.RandomForestClassifier(
    random_state=random_state,
    **rf_baseline_params
)

model_rf.fit(X_train, y_train)

In [25]:
# Score the baseline random forest on the held-out test split.
y_test_pred = model_rf.predict(X_test)
test_f1 = metrics.f1_score(y_test, y_test_pred)
print('F1-score на тестовом наборе: {:.2f}'.format(test_f1))

# Per-class precision / recall / F1 breakdown.
print(metrics.classification_report(y_test, y_test_pred))

F1-score на тестовом наборе: 0.80
              precision    recall  f1-score   support

           0       0.76      0.74      0.75       344
           1       0.79      0.81      0.80       407

    accuracy                           0.78       751
   macro avg       0.78      0.78      0.78       751
weighted avg       0.78      0.78      0.78       751



Значение целевой метрики, которую мы пытаемся улучшить подбором гиперпараметров, для модели RandomForestClassifier - 0.80

## GridSearchCV для модели RandomForestClassifier

In [26]:
param_grid = {
    'n_estimators': list(range(100, 300, 25)),
    'max_depth': list(np.linspace(10, 50, 5, dtype=int)),
    'min_samples_leaf': list(np.linspace(2, 8, 1, dtype=int))
}

grid_search = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=random_state), 
    param_grid=param_grid,
    cv=5, 
    n_jobs=-1
)

%time grid_search.fit(X_train, y_train) 
y_test_pred = grid_search.predict(X_test)
print('F1-score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print('Наилучшие значения гиперпараметров: {}'.format(grid_search.best_params_))

print(metrics.classification_report(y_test, y_test_pred))

CPU times: total: 7.64 s
Wall time: 5min 55s
F1-score на тестовом наборе: 0.80
Наилучшие значения гиперпараметров: {'max_depth': 30, 'min_samples_leaf': 2, 'n_estimators': 275}
              precision    recall  f1-score   support

           0       0.77      0.75      0.76       344
           1       0.79      0.81      0.80       407

    accuracy                           0.78       751
   macro avg       0.78      0.78      0.78       751
weighted avg       0.78      0.78      0.78       751



Данный метод не позволил улучшить целевую метрику

## RandomizedSearchCV для модели RandomForestClassifier

In [27]:
param_distributions = {
    'n_estimators': list(range(100, 300, 25)),
    'max_depth': list(np.linspace(10, 50, 5, dtype=int)),
    'min_samples_leaf': list(np.linspace(2, 8, 1, dtype=int))
}
            
random_search_forest = RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=random_state), 
    param_distributions=param_distributions, 
    cv=5,
    n_iter=10, 
    n_jobs=-1
)

%time random_search_forest.fit(X_train, y_train) 
y_test_pred = random_search_forest.predict(X_test)
print('F1-score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print('Наилучшие значения гиперпараметров: {}'.format(random_search_forest.best_params_))

print(metrics.classification_report(y_test, y_test_pred))

CPU times: total: 3.08 s
Wall time: 1min 2s
F1-score на тестовом наборе: 0.80
Наилучшие значения гиперпараметров: {'n_estimators': 125, 'min_samples_leaf': 2, 'max_depth': 40}
              precision    recall  f1-score   support

           0       0.76      0.75      0.76       344
           1       0.79      0.81      0.80       407

    accuracy                           0.78       751
   macro avg       0.78      0.78      0.78       751
weighted avg       0.78      0.78      0.78       751



Данный метод не позволил улучшить целевую метрику

## Hyperopt для модели RandomForestClassifier

In [28]:
# Hyperopt search space for the random forest: each parameter is drawn from a
# quantized uniform distribution (low, high, step).
space = dict(
    n_estimators=hp.quniform('n_estimators', 100, 300, 25),
    max_depth=hp.quniform('max_depth', 10, 50, 5),
    min_samples_leaf=hp.quniform('min_samples_leaf', 2, 8, 1),
)

In [29]:
def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    """Hyperopt objective for RandomForestClassifier.

    Args:
        params: sampled hyperparameters with the keys 'n_estimators',
            'max_depth' and 'min_samples_leaf'; each value is cast to int.
        cv: cross-validation setting forwarded to ``cross_val_score``.
        X: feature matrix used for cross-validation.
        y: target values used for cross-validation.
        random_state: seed passed to the estimator.

    Returns:
        The negated mean cross-validated F1 score; negated because the caller
        minimizes the objective.
    """
    estimator = ensemble.RandomForestClassifier(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_leaf=int(params['min_samples_leaf']),
        random_state=random_state
    )

    # One F1 value per fold; the objective is their mean with the sign flipped.
    fold_scores = cross_val_score(estimator, X, y, cv=cv, scoring="f1", n_jobs=-1)
    return -fold_scores.mean()

In [30]:
%%time

trials = Trials()

best=fmin(
    hyperopt_rf,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    rstate=np.random.default_rng(random_state)
)

print('Наилучшие значения гиперпараметров {}'.format(best))

100%|██████████| 20/20 [02:34<00:00,  7.74s/trial, best loss: -0.8182768218494241]
Наилучшие значения гиперпараметров {'max_depth': 35.0, 'min_samples_leaf': 2.0, 'n_estimators': 125.0}
CPU times: total: 1.28 s
Wall time: 2min 34s


In [31]:
# fmin returns the quantized values as floats, so cast each one to int before
# passing it to the forest.
rf_best_kwargs = {
    name: int(best[name])
    for name in ('n_estimators', 'max_depth', 'min_samples_leaf')
}
model = ensemble.RandomForestClassifier(random_state=random_state, **rf_best_kwargs)
model.fit(X_train, y_train)

# Score the refit model on the held-out test split.
y_test_pred = model.predict(X_test)
test_f1 = metrics.f1_score(y_test, y_test_pred)
print('F1-score на тестовом наборе: {:.2f}'.format(test_f1))
print(metrics.classification_report(y_test, y_test_pred))

F1-score на тестовом наборе: 0.80
              precision    recall  f1-score   support

           0       0.76      0.75      0.76       344
           1       0.79      0.81      0.80       407

    accuracy                           0.78       751
   macro avg       0.78      0.78      0.78       751
weighted avg       0.78      0.78      0.78       751



Данный метод не позволил улучшить целевую метрику

## Optuna для модели RandomForestClassifier

In [32]:
def optuna_rf(trial):
    """Optuna objective for RandomForestClassifier.

    Asks the trial for 'n_estimators', 'max_depth' and 'min_samples_leaf', builds
    the model and returns its mean 5-fold cross-validated F1 score on the
    training split.
    """
    # `step` is passed by keyword: newer Optuna releases deprecate passing it
    # positionally, while the keyword form works in old and new versions.
    n_estimators = trial.suggest_int('n_estimators', 100, 300, step=25)
    max_depth = trial.suggest_int('max_depth', 10, 50, step=5)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 8, step=1)

    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state
    )

    # Mean F1 over the 5 cross-validation folds is the value the study maximizes.
    score = cross_val_score(model, X_train, y_train, cv=5, scoring="f1", n_jobs=-1).mean()

    return score

In [33]:
%%time

# необходимо максимизировать метрику => direction="maximize"
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
study.optimize(optuna_rf, n_trials=20)

[32m[I 2022-10-19 09:38:30,476][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2022-10-19 09:38:33,823][0m Trial 0 finished with value: 0.8014827557840256 and parameters: {'n_estimators': 100, 'max_depth': 45, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.8014827557840256.[0m
[32m[I 2022-10-19 09:38:44,752][0m Trial 1 finished with value: 0.8055269772084582 and parameters: {'n_estimators': 300, 'max_depth': 35, 'min_samples_leaf': 8}. Best is trial 1 with value: 0.8055269772084582.[0m
[32m[I 2022-10-19 09:38:57,998][0m Trial 2 finished with value: 0.8149858995071237 and parameters: {'n_estimators': 200, 'max_depth': 45, 'min_samples_leaf': 2}. Best is trial 2 with value: 0.8149858995071237.[0m
[32m[I 2022-10-19 09:39:15,356][0m Trial 3 finished with value: 0.8121623022289184 and parameters: {'n_estimators': 250, 'max_depth': 30, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.8149858995071237.[0m
[32m[I 2022-10-19 09:39:31,363

CPU times: total: 1.67 s
Wall time: 4min 7s


In [34]:
# Refit the random forest with the Optuna-selected hyperparameters and compute
# F1 on the test split.
best_rf_params = study.best_params
model_rf = ensemble.RandomForestClassifier(
    random_state=random_state,
    **best_rf_params
)
model_rf.fit(X_train, y_train)

y_test_pred = model_rf.predict(X_test)
test_f1 = metrics.f1_score(y_test, y_test_pred)
print('F1-score на тестовом выборке {:.2f}'.format(test_f1))

# Per-class precision / recall / F1 breakdown.
print(metrics.classification_report(y_test, y_test_pred))

F1-score на тестовом выборке 0.80
              precision    recall  f1-score   support

           0       0.77      0.75      0.76       344
           1       0.80      0.81      0.80       407

    accuracy                           0.78       751
   macro avg       0.78      0.78      0.78       751
weighted avg       0.78      0.78      0.78       751



Данный метод не позволил улучшить целевую метрику

### Вывод

Модель RandomForestClassifier изначально показала более высокий показатель целевой метрики относительно модели LogisticRegression.

Путем подбора гиперпараметров для модели LogisticRegression удалось улучшить показатель целевой метрики (с 0.78 до 0.79) при помощи метода Hyperopt.