# Подготовка данных пациентов

In [144]:
# standard
import pandas as pd
import numpy as np
import re
import seaborn as sbs
import matplotlib.pyplot as plt
import matplotlib
from tqdm import notebook
sbs.set_style("darkgrid")

# text processing
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from pymystem3 import Mystem

# preprocessing/processing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# model selection
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, cross_validate,  StratifiedKFold
import imblearn
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# metrics
from sklearn.metrics import classification_report, confusion_matrix, make_scorer
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, roc_auc_score

# base models
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# model building
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


## Загрузка данных

In [2]:
file_path = 'chd_addmit_300.xlsx'

In [3]:
patient_data = pd.read_excel(file_path)

In [4]:
patient_data.head()

Unnamed: 0,admittion,discharge,sex,height,weight,BMI,BSA,birth,Операции (все в ИБ),Перенесенные опер. (из Анамн.),...,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41
0,2016-12-12,2017-01-10,m,76,9.7,111.27,0.46,02.01.2016,12.12.2016: (Откр./ИК) Перевязка ранее наложен...,,...,,,,,,,,,,
1,2017-01-13,2017-02-01,f,67,7.34,89.67,0.37,02.02.2016,18.01.2017: (Откр./ИК) Радикальная коррекция д...,15.08.2016г.: Транслюминальная балонная вальву...,...,,,,,,,,,,
2,2017-01-17,2017-02-09,m,74,8.9,103.46,0.43,21.02.2016,19.01.2017: (Откр./ИК) Перевязка ранее наложен...,29.02.2016 - подключично-легочный анастомоз сп...,...,,,,,,,,,,
3,2017-01-20,2017-02-21,f,67,6.97,85.15,0.36,02.03.2016,23.01.2017: (Откр./ИК) Наложение двустороннего...,,...,,,,,,,,,,
4,2017-02-13,2017-03-01,m,82,9.29,102.59,0.46,08.03.2016,15.02.2017: (Откр./ИК) Реконструкция путей отт...,13.04.2016. НАЗВАНИЕ ОПЕРАЦИИ: Транслюминальна...,...,,,,,,,,,,


In [5]:
patient_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 42 columns):
 #   Column                                           Non-Null Count  Dtype         
---  ------                                           --------------  -----         
 0   admittion                                        300 non-null    datetime64[ns]
 1   discharge                                        300 non-null    datetime64[ns]
 2   sex                                              300 non-null    object        
 3   height                                           300 non-null    int64         
 4   weight                                           300 non-null    float64       
 5   BMI                                              298 non-null    float64       
 6   BSA                                              298 non-null    float64       
 7   birth                                            300 non-null    object        
 8   Операции (все в ИБ)                     

In [6]:
patient_data.describe()

Unnamed: 0,height,weight,BMI,BSA,target,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,...,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41
count,300.0,300.0,298.0,298.0,300.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,64.89,6.29356,76.83443,0.336477,0.046667,,,,,,...,,,,,,,,,,
std,8.076012,1.951695,20.640196,0.073846,0.211276,,,,,,...,,,,,,,,,,
min,39.0,1.27,20.34,0.12,0.0,,,,,,...,,,,,,,,,,
25%,60.0,4.8685,62.0075,0.28,0.0,,,,,,...,,,,,,,,,,
50%,65.5,6.4425,78.43,0.35,0.0,,,,,,...,,,,,,,,,,
75%,71.0,7.65125,91.7975,0.39,0.0,,,,,,...,,,,,,,,,,
max,85.0,12.0,137.65,0.52,1.0,,,,,,...,,,,,,,,,,


Успешно создан датасет. Необходимо изменить тип данных некоторых признаков, убрать пропуски.

## Предобработка

### Main dataset

In [7]:
main_dataset = pd.DataFrame()

In [8]:
main_dataset[['sex',
              'height',
              'weight',
              'BMI','BSA',
              'operations',
              'target']] = patient_data[['sex',
                                         'height',
                                         'weight',
                                         'BMI',
                                         'BSA',
                                         'Операции (все в ИБ)',
                                         'target']]

### Уберем пропуски

In [9]:
main_dataset.isnull().sum()

sex           0
height        0
weight        0
BMI           2
BSA           2
operations    0
target        0
dtype: int64

In [10]:
main_dataset = main_dataset.dropna()

In [11]:
main_dataset.isnull().sum()

sex           0
height        0
weight        0
BMI           0
BSA           0
operations    0
target        0
dtype: int64

### Обработка текста

In [17]:
m = Mystem()

# Леммантизирует текст
def lemmatize(text):
    return "".join(m.lemmatize(text))

def clear_text(text):
    cleaned = re.sub(r'[^а-яА-Яa-zA-ZёЁ ]', ' ', text)
    cleaned = cleaned.split()
    return ' '.join(cleaned)

In [18]:
%%time

corpus = main_dataset['operations'].apply(lambda x: clear_text(lemmatize(x)))

CPU times: total: 422 ms
Wall time: 4min 21s


In [19]:
main_dataset['operations'] = corpus

In [20]:
main_dataset.sample(5)

Unnamed: 0,sex,height,weight,BMI,BSA,operations,target
251,m,51,3.1,43.41,0.21,откр ик ушивание аортолевожелудочкого тоннель ...,0
15,f,67,6.245,76.29,0.34,откр ик протезирование трикуспидальный клапан ...,0
64,f,65,5.5,68.22,0.32,откр ик пластика дефект межжелудочковый перего...,0
181,f,74,8.58,99.74,0.42,откр ик пластика дефект межжелудочковый перего...,0
38,m,70,6.75,80.68,0.37,откр ик перевязка ранее налагать анастомоз по ...,0


### Категории

In [21]:
main_dataset['sex'] = pd.get_dummies(main_dataset['sex'], drop_first=True)

In [22]:
main_dataset.sample(5)

Unnamed: 0,sex,height,weight,BMI,BSA,operations,target
182,0,60,4.374,56.47,0.27,закрывать операция Muller суживание легочный а...,0
206,1,52,3.4,47.15,0.22,закрывать резекция коарктация аорта с наложени...,0
115,0,64,6.29,78.62,0.34,откр ик радикальный коррекция открытый общий а...,0
172,0,58,6.55,86.01,0.33,откр ик устранение перерыв дуга аорта с помощь...,0
244,1,76,12.0,137.65,0.51,откр ик пластик дефект межпредсердный перегоро...,0


### Обучение

#### Text Features Pipeline

In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Стивен\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
stopwords = nltk_stopwords.words('russian')

In [25]:
operations_transformer = TfidfVectorizer(stop_words=stopwords)

In [136]:
num_features = ['height', 'weight', 'BMI', 'BSA']

In [137]:
num_transformer = StandardScaler()

#### Column Transformer

In [138]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', operations_transformer, 'operations'),
        ('num', num_transformer, num_features)
    ],
    remainder='passthrough'
)

In [139]:
preprocessor.fit_transform(main_dataset).shape

(298, 266)

In [28]:
X, y = main_dataset.drop('target', axis=1), main_dataset['target']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [30]:
X_train.shape

(208, 6)

In [31]:
X_test.shape

(90, 6)

In [32]:
y_train.value_counts()

0    198
1     10
Name: target, dtype: int64

In [33]:
y_test.value_counts()

0    86
1     4
Name: target, dtype: int64

### Выбор модели

Сделаем пайплайн на примере кэтбуст. Проверим на кросс валидации

### Pipeline + RandomSearch

In [34]:
def create_pipeline(clf):
    pipeline = imbpipeline(
        steps=[
            ('preprocess', preprocessor),
            ('undersampler', RandomUnderSampler(sampling_strategy=0.1)),
            ('smote', SMOTE()),
            ('clf', clf)
        ]
    )
    return pipeline

In [40]:
def random_search(clf, params, n_iter=10):
    clf = create_pipeline(clf)
    return RandomizedSearchCV(clf,
                              params,
                              scoring='neg_log_loss',
                              n_jobs=-1,
                              n_iter=n_iter,
                              cv=StratifiedKFold(n_splits=3, shuffle=True),
                              verbose=5
                             )

In [36]:
def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')

def thresholds_argmax(model, x, y):
    thresholds = np.arange(0, 0.9, 0.001)
    probs = model.predict_proba(x)[:, 1]
    scores = [roc_auc_score(y, to_labels(probs, t)) for t in thresholds]
    ix = np.argmax(scores)
    threshold = thresholds[ix]
    return (threshold)

In [226]:
def fit_data(model, show_feature_importance=False):
    clf_name = model.estimator['clf'].__class__.__name__
    print(f'Classifier name: {clf_name}')
    
    model = model.fit(X_train, y_train)
    print(f'Training finished!', '-'*100, sep='\n')
    print(f'Best score: = {model.best_score_}')
    print(f'Best parameters {model.best_params_}')
    
    if show_feature_importance:   
        try:
            if clf_name == 'CatBoostClassifier':
                feature_importance = model.best_estimator_['clf'].get_feature_importance()
            elif clf_name == 'XGBClassifier':
                feature_importance = model.best_estimator_['clf'].feature_importances_
            else:
                feature_importance = feature_importance = model.best_estimator_['clf'].feature_importance_
            main_features = pd.DataFrame(data = feature_importance[-5:], index=X.columns[:5]).sort_values(by=0)
            
            plt.figure(figsize=(10, 7))
            main_features.plot(kind='barh', ax=plt.gca())
            plt.title('Feature Importance without operations')
            plt.xlabel('Importance, %')
            plt.show()
        except:
            print('no feature importance')
    
    best_model = model.best_estimator_.fit(X_train, y_train)
    th = thresholds_argmax(best_model, X_test, y_test)
    print(f'Threshold: {th}')
    predictions = best_model.predict_proba(X_test)[:,1] > th
    print(f'Confusion Matrix:\n {confusion_matrix(y_test, predictions)}')
    print(classification_report(y_test, predictions))
    f1 = f1_score(y_test, predictions, labels=[1])
    
    return best_model, th, abs(model.best_score_), f1

### Перебор моделей

In [186]:
def look_through_models(models):
    names = []
    nest_models = []
    data_list = []
    for model in notebook.tqdm(models):
        data = fit_data(model)
        names.append(data[0]['clf'].__class__.__name__)
        nest_models.append(data[0])
        data_list.append(data[1:])
        print('_'*100)
    
    data_df = pd.DataFrame(data=data_list, index=names)
    display(data_df)
    data_df[0].sort_values(0).plot(kind='barh', ax=plt.gca())
    plt.xlabel('Threshold')
    plt.show()
    data_df[1].sort_values(0, ascending=False).plot(kind='barh', ax=plt.gca())
    plt.xlabel('Log loss score')
    plt.show()
    data_df[2].sort_values(0).plot(kind='barh', ax=plt.gca())
    plt.xlabel('f1 score')
    plt.show()

In [267]:
models = []

In [268]:
#Tree
parameters = {
    'clf__max_depth': range(3, 10),
    'preprocess__text__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'smote__sampling_strategy': np.arange(0.2, 0.8, 0.1)
}

svc = random_search(DecisionTreeClassifier(),
                          parameters,
                          3)
models.append(svc)

In [269]:
#SGD
parameters = {
    'preprocess__text__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'smote__sampling_strategy': np.arange(0.2, 0.8, 0.1)
}

svc = random_search(SGDClassifier(loss='log_loss'),
                          parameters,
                          3)
models.append(svc)

In [270]:
#KNN
parameters = {
    'clf__n_neighbors': range(3, 10),
    'preprocess__text__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'smote__sampling_strategy': np.arange(0.2, 0.8, 0.1)
}

svc = random_search(KNeighborsClassifier(),
                          parameters,
                          3)
models.append(svc)

In [271]:
#Forest
parameters = {
    'clf__max_depth': range(3, 10),
    'clf__n_estimators': range(100, 1000, 100),
    'preprocess__text__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'smote__sampling_strategy': np.arange(0.2, 0.8, 0.1)
}

svc = random_search(RandomForestClassifier(),
                          parameters,
                          3)
models.append(svc)

In [272]:
#Logistic
parameters = {
    'clf__C': [1, 4, 6],
    'preprocess__text__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'smote__sampling_strategy': np.arange(0.2, 0.8, 0.1)
}

svc = random_search(LogisticRegression(),
                          parameters,
                          3)
models.append(svc)

In [273]:
#SVC
parameters = {
    'clf__gamma': [.1,.5,1,2,5,10],
    'clf__C': [.1, 1, 10, 100, 1000],
    'preprocess__text__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'smote__sampling_strategy': np.arange(0.2, 0.8, 0.1)
}

svc = random_search(SVC(probability=True, kernel='rbf'),
                          parameters,
                          3)
models.append(svc)

In [274]:
#CatBoost
parameters = {
    'preprocess__text__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'smote__sampling_strategy': np.arange(0.2, 0.8, 0.1),
    'clf__depth': range(2, 11),
}

catboost_search = random_search(CatBoostClassifier(silent=True, iterations=750),
                                       parameters,
                                       3
                                       )
models.append(catboost_search)

In [275]:
#XGB
parameters = {
    'preprocess__text__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'smote__sampling_strategy': np.arange(0.2, 0.8, 0.1),
    'clf__max_depth': range(2, 11),
}

xgb_search = random_search(XGBClassifier(),
                          parameters,
                          3)
models.append(xgb_search)

In [None]:
look_through_models(models)

  0%|          | 0/8 [00:00<?, ?it/s]

Classifier name: DecisionTreeClassifier
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Training finished!
----------------------------------------------------------------------------------------------------
Best score: = -2.4742515775740936
Best parameters {'smote__sampling_strategy': 0.2, 'preprocess__text__ngram_range': (1, 3), 'clf__max_depth': 3}
Threshold: 0.04
Confusion Matrix:
 [[83  3]
 [ 3  1]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        86
           1       0.25      0.25      0.25         4

    accuracy                           0.93        90
   macro avg       0.61      0.61      0.61        90
weighted avg       0.93      0.93      0.93        90

____________________________________________________________________________________________________
Classifier name: SGDClassifier
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Training finished!
--------------------------------------------