In [4]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold , RandomizedSearchCV,cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.metrics import classification_report,log_loss, roc_auc_score
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.metrics import precision_recall_curve 

%matplotlib inline

In [5]:
# Name of the binary target column (complications that arose during treatment);
# later cells use it to separate the label from the features.
global_target_col_name = 'Осложнения, возникшие на этапе лечения'

# Read the full dataset and show the column / dtype / non-null summary.
df = pd.read_csv("s_binary.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1558 entries, 0 to 1557
Data columns (total 55 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   ОАК_Гемоглобин                                1558 non-null   int64  
 1   ОАК_Гематокрит%                               1558 non-null   int64  
 2   ОАК_Тромбоциты                                1558 non-null   int64  
 3   ОАК_Эритроциты                                1558 non-null   float64
 4   ОАК_Лейкоциты                                 1558 non-null   float64
 5   ОАК_СОЭ                                       1558 non-null   float64
 6   ОАК_Цветовой показатель                       1558 non-null   float64
 7   ОАК_Ретикулоциты                              1558 non-null   float64
 8   ОАК_Тромбокрит %                              1558 non-null   int64  
 9   ОАК_Эозинофилы                                1558 non-null   i

## KNN

In [61]:
# KNN works on a small hand-picked subset: three predictors plus the target.
knn_columns = [
    'Общее_Болевой синдром',
    'УЗИ брюшной полости_диурез',
    'УЗИ брюшной полости_объем выпота',
    'Осложнения, возникшие на этапе лечения',
]
df_knn = df[knn_columns]

# Separate the label from the feature matrix.
y = df_knn[global_target_col_name]
X = df_knn.drop(columns=[global_target_col_name])


In [62]:
# 70/30 split, stratified on the target so both parts keep the class ratio;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    stratify=y,
    random_state=42,
    shuffle=True,
)

In [63]:
# Standardise the features, then classify with k-nearest neighbours.
# The step names ("scaler", "knn") are the prefixes used in the search grid.
knn_steps = [
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_jobs=-1)),
]
knn_pipe = Pipeline(steps=knn_steps)

In [64]:
# Search space for the KNN pipeline; keys use the "<step>__<param>" form
# so they reach the "knn" step of the pipeline.
knn_search_space = {
    'knn__n_neighbors': list(range(2, 20)),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan', 'cosine'],
}
param_grid = [knn_search_space]

In [65]:
# Hyper-parameter search for the KNN pipeline, selected by F1 with 5-fold CV.
#
# NOTE: the grid defined above holds 18 * 2 * 3 = 108 combinations and
# n_iter is also 108, so every combination is evaluated; the sampler only
# randomises the order in which they are tried.
#
# random_state pins that order (and therefore which of several equally
# scored candidates ends up reported as the best one), so a fresh
# Restart & Run All reproduces the same best_params_.
random_search = RandomizedSearchCV(
    estimator=knn_pipe,
    param_distributions=param_grid,
    n_iter=108,
    scoring='f1',
    refit='f1',
    n_jobs=-1,
    cv=5,
    verbose=5,
    random_state=42,
)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [66]:
# Parameter combination that achieved the best mean cross-validated score.
random_search.best_params_

{'knn__weights': 'distance', 'knn__n_neighbors': 13, 'knn__metric': 'cosine'}

In [67]:
# Mean cross-validated score of the best candidate (the search was scored with 'f1').
random_search.best_score_

0.9459460660559993

In [68]:
# Evaluate the refitted best pipeline on the held-out test split:
# hard labels for the classification report, probabilities for log-loss / ROC AUC.
best_knn = random_search.best_estimator_
probability_predictions = best_knn.predict_proba(X_test)
class_predictions = best_knn.predict(X_test)

In [69]:
# Log-loss on the test split; column 1 of predict_proba is the probability
# of the second class in classes_ (label 1 for this 0/1 target).
positive_class_proba = probability_predictions[:, 1]
log_loss_value = log_loss(y_true=y_test, y_pred=positive_class_proba)
print(f'Log Loss: {log_loss_value}')

Log Loss: 0.3784005445608583


In [70]:
# ROC AUC on the test split, computed from the probability of label 1.
positive_class_proba = probability_predictions[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=positive_class_proba)
print(f'ROC AUC: {roc_auc}')

ROC AUC: 0.9863433441558442


In [71]:
# Per-class precision / recall / F1 for the hard predictions on the test split.
class_report = classification_report(
    y_true=y_test,
    y_pred=class_predictions,
)
print(f'Classification Report:\n {class_report}')

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97       308
           1       0.97      0.91      0.94       160

    accuracy                           0.96       468
   macro avg       0.96      0.95      0.95       468
weighted avg       0.96      0.96      0.96       468



Возможно, KNN на данных с низкой вариативностью работает нестабильно: precision и recall скачут на ±7 в зависимости от random_seed. Возможно, увеличение датасета улучшило бы ситуацию.


## GBDT


In [6]:
# Wider feature subset for gradient boosting: the three KNN predictors and
# the target, plus age, peritonitis-type indicators and further ultrasound columns.
gbdt_columns = [
    'Общее_Болевой синдром',
    'УЗИ брюшной полости_диурез',
    'УЗИ брюшной полости_объем выпота',
    'Осложнения, возникшие на этапе лечения',
    'Общее_Возраст',
    'Общее_Характер перитонита_диффузный',
    'Общее_Характер перитонита_местный',
    'Общее_Характер перитонита_распространенный',
    'УЗИ брюшной полости_Размеры печени1',
    'УЗИ брюшной полости_Размеры печени2',
    'УЗИ брюшной полости_Узи почек1',
    'УЗИ брюшной полости_Узи почек2',
    'УЗИ брюшной полости_фибрин',
    'УЗИ брюшной полости_спайки',
]
df_gbdt = df[gbdt_columns]


In [7]:
# Separate the label from the GBDT feature matrix.
y = df_gbdt[global_target_col_name]
X = df_gbdt.drop(columns=[global_target_col_name])


In [8]:
# Same 70/30 stratified split settings as in the KNN section, applied to the
# wider GBDT feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    stratify=y,
    random_state=42,
    shuffle=True,
)

##### KPCA


In [9]:
# Dimensionality reduction for the GBDT features.
#
# The pain-syndrome column is set aside and passed to the model unchanged;
# every other feature is standardised and compressed to 5 kernel-PCA components.
#
# NOTE: this cell overwrites X_train / X_test (DataFrames) with NumPy arrays,
# so it cannot be re-run on its own -- re-run the train/test split cell first.
passthrough_col = 'Общее_Болевой синдром'
x_tr = X_train[passthrough_col]
x_te = X_test[passthrough_col]

X_train = X_train.drop(passthrough_col, axis=1)
X_test = X_test.drop(passthrough_col, axis=1)

# Fit the scaler on the training rows only, then apply it to the test rows,
# so no test-set statistics leak into the preprocessing.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# random_state makes the projection reproducible: with the default
# eigen_solver='auto' scikit-learn may select the ARPACK solver, whose
# starting vector is drawn at random when no seed is given.
kpca = KernelPCA(n_components=5, kernel='rbf', random_state=42)
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

# Put the untouched pain-syndrome values back as column 0, ahead of the
# 5 kernel-PCA components.
X_train = np.insert(X_train, 0, list(x_tr), axis=1)
X_test = np.insert(X_test, 0, list(x_te), axis=1)

In [10]:
# Gradient-boosted trees with a generous iteration cap; early stopping ends
# training once the internal validation score stops improving.
# random_state fixes the internal train/validation split used by early
# stopping, so refitting the same parameters gives the same model.
boost_model = HistGradientBoostingClassifier(
    max_iter=2000,
    early_stopping=True,
    random_state=42,
)

In [11]:
# Search space for HistGradientBoostingClassifier.
# Continuous parameters are sampled log-uniformly; integer parameters are
# sampled from explicit lists of candidate values.
gbdt_search_space = {
    "learning_rate": stats.loguniform(10**(-3), 10**1),
    "max_depth": list(range(2, 11)),
    'l2_regularization': stats.loguniform(10**(-2), 10**3),
    "max_leaf_nodes": list(range(2, 100)),
    "min_samples_leaf": list(range(1, 100)),
    'class_weight': [None, 'balanced'],
}
param_grid = [gbdt_search_space]

In [12]:
# Randomised hyper-parameter search for the boosting model, selected by
# ROC AUC with 5-fold CV (500 sampled candidates -> 2500 fits).
#
# random_state seeds the sampler: without it the 500 candidates drawn from
# the log-uniform distributions and value lists differ on every run, so the
# reported best_params_ / best_score_ could not be reproduced.
random_search = RandomizedSearchCV(
    estimator=boost_model,
    param_distributions=param_grid,
    n_iter=500,
    scoring='roc_auc',
    refit='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=5,
    random_state=42,
)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


In [13]:
# Sampled parameter combination that achieved the best mean cross-validated score.
random_search.best_params_

{'class_weight': 'balanced',
 'l2_regularization': 0.012776019949961456,
 'learning_rate': 0.09566536751847027,
 'max_depth': 6,
 'max_leaf_nodes': 39,
 'min_samples_leaf': 5}

In [14]:
# Mean cross-validated score of the best candidate (the search was scored with 'roc_auc').
random_search.best_score_

0.98707476007476

In [15]:
# Number of boosting iterations the refitted best model actually ran;
# with early_stopping=True this can be far below the max_iter=2000 cap.
random_search.best_estimator_.n_iter_

56

In [16]:
# Evaluate the refitted best booster on the held-out (KPCA-transformed) test split:
# hard labels for the classification report, probabilities for log-loss / ROC AUC.
best_gbdt = random_search.best_estimator_
probability_predictions = best_gbdt.predict_proba(X_test)
class_predictions = best_gbdt.predict(X_test)

In [17]:
# Log-loss on the test split; column 1 of predict_proba is the probability
# of the second class in classes_ (label 1 for this 0/1 target).
positive_class_proba = probability_predictions[:, 1]
log_loss_value = log_loss(y_true=y_test, y_pred=positive_class_proba)
print(f'Log Loss: {log_loss_value}')

Log Loss: 0.14158864154351314


In [18]:
# ROC AUC on the test split, computed from the probability of label 1.
positive_class_proba = probability_predictions[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=positive_class_proba)
print(f'ROC AUC: {roc_auc}')

ROC AUC: 0.9866578733766235


In [19]:
# Per-class precision / recall / F1 for the hard predictions on the test split.
class_report = classification_report(
    y_true=y_test,
    y_pred=class_predictions,
)
print(f'Classification Report:\n {class_report}')

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.97       308
           1       0.93      0.95      0.94       160

    accuracy                           0.96       468
   macro avg       0.95      0.96      0.95       468
weighted avg       0.96      0.96      0.96       468



После понижения размерности у GBDT проблем с вариативностью быть не должно.
