In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as skm

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_excel(r'data.xlsx')

In [3]:
df_x = df.loc[:, ['х1', 'х2', 'х3', 'х4', 'х5', 'х6', 'x7', 'x8', 'x9']]
x_matrix = df_x.to_numpy()
scaler = MinMaxScaler()
scaler.fit(x_matrix)
x_normalized = scaler.transform(x_matrix)
print(x_normalized)

[[0.11538462 0.72222222 0.39130435 0.82222222 0.52036199 0.18119266
  0.25641026 0.225      0.74492754]
 [0.17692308 0.49382716 0.56521739 0.8        0.87782805 0.25229358
  0.70769231 0.         0.43478261]
 [0.13076923 0.         0.26086957 0.51111111 0.46606335 0.1559633
  0.46666667 0.2765625  0.78086957]
 [0.12307692 0.09876543 0.08695652 0.46666667 0.52036199 0.24311927
  0.53333333 0.1703125  0.55884058]
 [0.08461538 0.41975309 0.7826087  1.         0.         0.22706422
  0.4974359  0.2        0.20869565]
 [0.         0.69753086 1.         0.71111111 0.35746606 0.33256881
  0.36410256 0.16875    0.6057971 ]
 [0.01538462 0.83950617 0.65217391 0.93333333 0.3800905  0.14220183
  0.33846154 0.3109375  0.82898551]
 [0.06923077 0.93209877 0.2173913  0.62222222 0.62895928 0.20642202
  0.22051282 0.2125     0.79362319]
 [0.05384615 1.         0.43478261 0.73333333 0.50678733 0.10091743
  0.05128205 0.5234375  0.83188406]
 [0.16923077 0.69753086 0.82608696 0.75555556 0.48868778 0.194954

In [4]:
df_y = df.loc[:, ['Ybin']]
y_matrix = df_y.to_numpy()
print(y_matrix)

[[1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]]


In [5]:
x_train, x_test, y_train, y_test = train_test_split(x_normalized, y_matrix, test_size=0.15)

## Метод kNN

In [6]:
knn_model = KNeighborsClassifier()
knn_model.fit(x_train, y_train.ravel())

model_params = knn_model.get_params()
tuned_params = {}
for k, v in model_params.items():
    tuned_params[k] = [v]
tuned_params['n_neighbors'] = range(1, y_train.size - 1)
clf = GridSearchCV(KNeighborsClassifier(), tuned_params)
clf.fit(x_train, y_train.ravel())
best_knn_params = clf.best_params_

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 706, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_classification.py", line 254, in predict
    probabilities = self.predict_proba(X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_classification.py", line 355, in predict_proba
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/neighbors/_base.py", line 808, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples,  but n_samples

In [7]:
knn_best_model = KNeighborsClassifier(**best_knn_params)
knn_best_model.fit(x_train, y_train.ravel())
y_knn_predict = knn_best_model.predict(x_test)

print('Параметры модели:', best_knn_params)
print('Тестовая выборка:', y_test.ravel())
print('Результат предсказания:', y_knn_predict)
print('Метрики:\n', skm.classification_report(y_test, y_knn_predict))

Параметры модели: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 2, 'p': 2, 'weights': 'uniform'}
Тестовая выборка: [0 1 0 1 0 0 1 0]
Результат предсказания: [1 0 0 0 0 0 1 0]
Метрики:
               precision    recall  f1-score   support

           0       0.67      0.80      0.73         5
           1       0.50      0.33      0.40         3

    accuracy                           0.62         8
   macro avg       0.58      0.57      0.56         8
weighted avg       0.60      0.62      0.60         8



## Логистическая регрессия

In [8]:
logit_model = LogisticRegression()
logit_model.fit(x_train, y_train.ravel())

model_params = logit_model.get_params()
tuned_params = {}
for k, v in model_params.items():
    tuned_params[k] = [v]
tuned_params['solver'] = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
clf = GridSearchCV(LogisticRegression(), tuned_params)
clf.fit(x_train, y_train.ravel())
best_logit_params = clf.best_params_

In [9]:
logit_best_model = LogisticRegression(**best_logit_params)
logit_best_model.fit(x_train, y_train.ravel())
y_logit_predict = logit_best_model.predict(x_test)

print('Параметры модели:', best_logit_params)
print('Тестовая выборка:', y_test.ravel())
print('Результат предсказания:', y_logit_predict)
print('Метрики:\n', skm.classification_report(y_test, y_logit_predict))

Параметры модели: {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Тестовая выборка: [0 1 0 1 0 0 1 0]
Результат предсказания: [0 0 0 0 0 0 0 0]
Метрики:
               precision    recall  f1-score   support

           0       0.62      1.00      0.77         5
           1       0.00      0.00      0.00         3

    accuracy                           0.62         8
   macro avg       0.31      0.50      0.38         8
weighted avg       0.39      0.62      0.48         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Дерево решений

In [10]:
tree_model = DecisionTreeClassifier()
tree_model.fit(x_train, y_train.ravel())

model_params = tree_model.get_params()
tuned_params = {}
for k, v in model_params.items():
    tuned_params[k] = [v]
tuned_params['max_depth'] = range(1, y_train.size)
clf = GridSearchCV(DecisionTreeClassifier(), tuned_params)
clf.fit(x_train, y_train.ravel())
best_tree_params = clf.best_params_

In [11]:
tree_best_model = DecisionTreeClassifier(**best_tree_params)
tree_best_model.fit(x_train, y_train.ravel())
y_tree_predict = tree_best_model.predict(x_test)

print('Параметры модели:', best_tree_params)
print('Тестовая выборка:', y_test.ravel())
print('Результат предсказания:', y_tree_predict)
print('Метрики:\n', skm.classification_report(y_test, y_tree_predict))

Параметры модели: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 25, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
Тестовая выборка: [0 1 0 1 0 0 1 0]
Результат предсказания: [0 0 0 0 0 0 0 0]
Метрики:
               precision    recall  f1-score   support

           0       0.62      1.00      0.77         5
           1       0.00      0.00      0.00         3

    accuracy                           0.62         8
   macro avg       0.31      0.50      0.38         8
weighted avg       0.39      0.62      0.48         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Случайный лес

In [12]:
forest_model = RandomForestClassifier()
forest_model.fit(x_train, y_train.ravel())

model_params = forest_model.get_params()
tuned_params = {}
for k, v in model_params.items():
    tuned_params[k] = [v]
tuned_params['max_depth'] = range(1, y_train.size)
clf = GridSearchCV(RandomForestClassifier(), tuned_params)
clf.fit(x_train, y_train.ravel())
best_forest_params = clf.best_params_

In [13]:
forest_best_model = RandomForestClassifier(**best_forest_params)
forest_best_model.fit(x_train, y_train.ravel())
y_forest_predict = forest_best_model.predict(x_test)

print('Параметры модели:', best_forest_params)
print('Тестовая выборка:', y_test.ravel())
print('Результат предсказания:', y_forest_predict)
print('Метрики:\n', skm.classification_report(y_test, y_forest_predict))

Параметры модели: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Тестовая выборка: [0 1 0 1 0 0 1 0]
Результат предсказания: [0 0 0 0 0 0 0 0]
Метрики:
               precision    recall  f1-score   support

           0       0.62      1.00      0.77         5
           1       0.00      0.00      0.00         3

    accuracy                           0.62         8
   macro avg       0.31      0.50      0.38         8
weighted avg       0.39      0.62      0.48         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Сравнение моделей на обучающей выборке

In [14]:
y_knn_learn_predict = knn_best_model.predict(x_train)
print('Метод kNN:')
print('Предсказания:', y_knn_learn_predict)
print('Тестовая выборка:', y_train.ravel())
print(skm.classification_report(y_train, y_knn_learn_predict))
print('ROC-AUC:', skm.roc_auc_score(y_train, y_knn_learn_predict))
print('\n--------------------------------------------------------\n')
y_logit_learn_predict = logit_best_model.predict(x_train)
print('Логистическая регрессия:')
print('Предсказания:', y_logit_learn_predict)
print('Тестовая выборка:', y_train.ravel())
print(skm.classification_report(y_train, y_logit_learn_predict))
print('ROC-AUC:', skm.roc_auc_score(y_train, y_logit_learn_predict))
print('\n--------------------------------------------------------\n')
y_tree_learn_predict = tree_best_model.predict(x_train)
print('Дерево решений:')
print('Предсказания:', y_tree_learn_predict)
print('Тестовая выборка:', y_train.ravel())
print(skm.classification_report(y_train, y_tree_learn_predict))
print('ROC-AUC:', skm.roc_auc_score(y_train, y_tree_learn_predict))
print('\n--------------------------------------------------------\n')
y_forest_learn_predict = forest_best_model.predict(x_train)
print('Случайный лес:')
print('Предсказания:', y_forest_learn_predict)
print('Тестовая выборка:', y_train.ravel())
print(skm.classification_report(y_train, y_forest_learn_predict))
print('ROC-AUC:', skm.roc_auc_score(y_train, y_forest_learn_predict))
print('\n--------------------------------------------------------\n')

Метод kNN:
Предсказания: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0]
Тестовая выборка: [1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0
 0 0]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        28
           1       1.00      0.09      0.17        11

    accuracy                           0.74        39
   macro avg       0.87      0.55      0.51        39
weighted avg       0.81      0.74      0.66        39

ROC-AUC: 0.5454545454545454

--------------------------------------------------------

Логистическая регрессия:
Предсказания: [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
Тестовая выборка: [1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0
 0 0]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        28
           1       1.00      0.09      0.17        11

## Сравнение моделей на тестовой выборке

In [15]:
print('Метод kNN:')
print('Предсказания:', y_knn_predict)
print('Тестовая выборка:', y_test.ravel())
print(skm.classification_report(y_test, y_knn_predict))
print('ROC-AUC:', skm.roc_auc_score(y_test, y_knn_predict))
print('\n--------------------------------------------------------\n')
print('Логистическая регрессия:')
print('Предсказания:', y_logit_predict)
print('Тестовая выборка:', y_test.ravel())
print(skm.classification_report(y_test, y_logit_predict))
print('ROC-AUC:', skm.roc_auc_score(y_test, y_logit_predict))
print('\n--------------------------------------------------------\n')
print('Дерево решений:')
print('Предсказания:', y_tree_predict)
print('Тестовая выборка:', y_test.ravel())
print(skm.classification_report(y_test, y_tree_predict))
print('ROC-AUC:', skm.roc_auc_score(y_test, y_tree_predict))
print('\n--------------------------------------------------------\n')
print('Случайный лес:')
print('Предсказания:', y_forest_predict)
print('Тестовая выборка:', y_test.ravel())
print(skm.classification_report(y_test, y_forest_predict))
print('ROC-AUC:', skm.roc_auc_score(y_test, y_forest_predict))
print('\n--------------------------------------------------------\n')

Метод kNN:
Предсказания: [1 0 0 0 0 0 1 0]
Тестовая выборка: [0 1 0 1 0 0 1 0]
              precision    recall  f1-score   support

           0       0.67      0.80      0.73         5
           1       0.50      0.33      0.40         3

    accuracy                           0.62         8
   macro avg       0.58      0.57      0.56         8
weighted avg       0.60      0.62      0.60         8

ROC-AUC: 0.5666666666666667

--------------------------------------------------------

Логистическая регрессия:
Предсказания: [0 0 0 0 0 0 0 0]
Тестовая выборка: [0 1 0 1 0 0 1 0]
              precision    recall  f1-score   support

           0       0.62      1.00      0.77         5
           1       0.00      0.00      0.00         3

    accuracy                           0.62         8
   macro avg       0.31      0.50      0.38         8
weighted avg       0.39      0.62      0.48         8

ROC-AUC: 0.5

--------------------------------------------------------

Дерево решений:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Значимость весов

In [17]:
print('Веса логистической регрессии:')
print(logit_best_model.coef_)
print('\n--------------------------------------------------------\n')
print('Веса дерева решений:')
print(tree_best_model.feature_importances_)
print('\n--------------------------------------------------------\n')
print('Веса случайного леса:')
print(forest_best_model.feature_importances_)
print('\n--------------------------------------------------------\n')

Веса логистической регрессии:
[[ 0.72105075 -0.24924942  0.23307497 -0.41226816  0.03642369 -0.28384862
  -0.48706138 -0.1306563   0.22588756]]

--------------------------------------------------------

Веса дерева решений:
[0.33054204 0.11870942 0.18993506 0.         0.11661125 0.08633412
 0.         0.         0.15786811]

--------------------------------------------------------

Веса случайного леса:
[0.12 0.07 0.15 0.13 0.13 0.06 0.12 0.08 0.14]

--------------------------------------------------------

