# Подготовка данных пациентов

In [82]:
import pandas as pd
import numpy as np
import os
import re
import seaborn as sbs
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from pymystem3 import Mystem

from sklearn.preprocessing import OneHotEncoder
import transformers
from tqdm import notebook
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier

import imblearn
from imblearn.over_sampling import SMOTE

## Загрузка данных

In [2]:
file_path = 'chd_addmit_300.xlsx'

In [3]:
patient_data = pd.read_excel(file_path)

In [4]:
patient_data.head()

Unnamed: 0,admittion,discharge,sex,height,weight,BMI,BSA,birth,Операции (все в ИБ),Перенесенные опер. (из Анамн.),...,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41
0,2016-12-12,2017-01-10,m,76,9.7,111.27,0.46,02.01.2016,12.12.2016: (Откр./ИК) Перевязка ранее наложен...,,...,,,,,,,,,,
1,2017-01-13,2017-02-01,f,67,7.34,89.67,0.37,02.02.2016,18.01.2017: (Откр./ИК) Радикальная коррекция д...,15.08.2016г.: Транслюминальная балонная вальву...,...,,,,,,,,,,
2,2017-01-17,2017-02-09,m,74,8.9,103.46,0.43,21.02.2016,19.01.2017: (Откр./ИК) Перевязка ранее наложен...,29.02.2016 - подключично-легочный анастомоз сп...,...,,,,,,,,,,
3,2017-01-20,2017-02-21,f,67,6.97,85.15,0.36,02.03.2016,23.01.2017: (Откр./ИК) Наложение двустороннего...,,...,,,,,,,,,,
4,2017-02-13,2017-03-01,m,82,9.29,102.59,0.46,08.03.2016,15.02.2017: (Откр./ИК) Реконструкция путей отт...,13.04.2016. НАЗВАНИЕ ОПЕРАЦИИ: Транслюминальна...,...,,,,,,,,,,


In [5]:
patient_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 42 columns):
 #   Column                                           Non-Null Count  Dtype         
---  ------                                           --------------  -----         
 0   admittion                                        300 non-null    datetime64[ns]
 1   discharge                                        300 non-null    datetime64[ns]
 2   sex                                              300 non-null    object        
 3   height                                           300 non-null    int64         
 4   weight                                           300 non-null    float64       
 5   BMI                                              298 non-null    float64       
 6   BSA                                              298 non-null    float64       
 7   birth                                            300 non-null    object        
 8   Операции (все в ИБ)                     

In [6]:
patient_data.describe()

Unnamed: 0,height,weight,BMI,BSA,target,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,...,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41
count,300.0,300.0,298.0,298.0,300.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,64.89,6.29356,76.83443,0.336477,0.046667,,,,,,...,,,,,,,,,,
std,8.076012,1.951695,20.640196,0.073846,0.211276,,,,,,...,,,,,,,,,,
min,39.0,1.27,20.34,0.12,0.0,,,,,,...,,,,,,,,,,
25%,60.0,4.8685,62.0075,0.28,0.0,,,,,,...,,,,,,,,,,
50%,65.5,6.4425,78.43,0.35,0.0,,,,,,...,,,,,,,,,,
75%,71.0,7.65125,91.7975,0.39,0.0,,,,,,...,,,,,,,,,,
max,85.0,12.0,137.65,0.52,1.0,,,,,,...,,,,,,,,,,


Успешно создан датасет. Необходимо изменить тип данных некоторых признаков, убрать пропуски.

## Предобработка

### Main dataset

In [241]:
main_dataset = pd.DataFrame()

In [242]:
main_dataset[['sex','height','weight','BMI','BSA', 'operations', 'target']] = patient_data[['sex','height','weight','BMI','BSA', 'Операции (все в ИБ)', 'target']]

### Уберем пропуски

In [243]:
main_dataset.isnull().sum()

sex           0
height        0
weight        0
BMI           2
BSA           2
operations    0
target        0
dtype: int64

In [244]:
main_dataset = main_dataset.dropna()

In [245]:
main_dataset.isnull().sum()

sex           0
height        0
weight        0
BMI           0
BSA           0
operations    0
target        0
dtype: int64

## Обучение

### Train test split

In [246]:
X, y = main_dataset.drop('target', axis=1), main_dataset['target']

In [247]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, stratify=y)

In [248]:
X_train, y_train = X_train.reset_index(drop=True), y_train.reset_index(drop=True)

In [249]:
X_test, y_test = X_test.reset_index(drop=True), y_test.reset_index(drop=True), 

In [250]:
X_train.shape

(193, 6)

In [251]:
X_test.shape

(105, 6)

In [252]:
y_test.value_counts()

0    100
1      5
Name: target, dtype: int64

In [253]:
y_train.value_counts()

0    184
1      9
Name: target, dtype: int64

In [254]:
X_train

Unnamed: 0,sex,height,weight,BMI,BSA,operations
0,f,49,3.210,45.86,0.21,29.12.2016: (Эндоваск.) Транслюминальная балон...
1,f,60,5.500,71.00,0.31,"16.02.2017: (Закрыт.) Операция Muller, суживан..."
2,m,60,4.530,58.48,0.28,06.02.2017: (Эндоваск.) Транслюминальная балон...
3,m,64,7.000,87.50,0.36,19.05.2017: (Откр./ИК) Пластика дефекта межжел...
4,m,65,7.015,87.01,0.36,23.12.2016: (Откр./ИК) Наложение центрального ...
...,...,...,...,...,...,...
188,m,61,5.547,71.02,0.31,22.12.2016: (Закрыт.) Наложение системно-легоч...
189,m,69,7.740,93.18,0.39,03.11.2017: (Закрыт.) Наложение модифицированн...
190,f,76,7.910,90.73,0.41,11.10.2017: (Откр./ИК) Перевязка ранее наложен...
191,m,70,8.200,98.01,0.40,28.04.2017: (Откр./ИК) Перевязка ранее наложен...


### Обработка текста

In [255]:
m = Mystem()

# Леммантизирует текст
def lemmatize(text):
    return "".join(m.lemmatize(text))

def clear_text(text):
    cleaned = re.sub(r'[^а-яА-Яa-zA-ZёЁ ]', ' ', text)
    cleaned = cleaned.split()
    return ' '.join(cleaned)

In [256]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Стивен\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [257]:
stopwords = nltk_stopwords.words('russian')

In [258]:
%%time

corpus_train = X_train['operations'].apply(lambda x: clear_text(lemmatize(x)))
corpus_test = X_test['operations'].apply(lambda x: clear_text(lemmatize(x)))

CPU times: total: 484 ms
Wall time: 4min 28s


In [259]:
tf_idf = TfidfVectorizer(stop_words=stopwords).fit(corpus_train)

In [260]:
tf_idf_train = tf_idf.transform(corpus_train)
tf_idf_test = tf_idf.transform(corpus_test)

In [271]:
tf_idf_train.shape

(193, 233)

In [212]:
smote = SMOTE()

## OHE

In [272]:
X_train = pd.get_dummies(X_train, drop_first=True)

In [273]:
X_train

Unnamed: 0,height,weight,BMI,BSA,sex_m
0,49,3.210,45.86,0.21,0
1,60,5.500,71.00,0.31,0
2,60,4.530,58.48,0.28,1
3,64,7.000,87.50,0.36,1
4,65,7.015,87.01,0.36,1
...,...,...,...,...,...
188,61,5.547,71.02,0.31,1
189,69,7.740,93.18,0.39,1
190,76,7.910,90.73,0.41,0
191,70,8.200,98.01,0.40,1


In [274]:
X_test = pd.get_dummies(X_test, drop_first=True)

In [275]:
X_train_final = pd.concat([X_train.reset_index(drop=True), pd.DataFrame(tf_idf_train.toarray())], axis=1, join='inner')

In [276]:
X_test_final = pd.concat([X_test.reset_index(drop=True), pd.DataFrame(tf_idf_test.toarray())], axis=1, join='inner')

In [277]:
y_train.shape

(193,)

In [284]:
X_train

Unnamed: 0,height,weight,BMI,BSA,sex_m
0,49,3.210,45.86,0.21,0
1,60,5.500,71.00,0.31,0
2,60,4.530,58.48,0.28,1
3,64,7.000,87.50,0.36,1
4,65,7.015,87.01,0.36,1
...,...,...,...,...,...
188,61,5.547,71.02,0.31,1
189,69,7.740,93.18,0.39,1
190,76,7.910,90.73,0.41,0
191,70,8.200,98.01,0.40,1


In [303]:
X_train_final, y_train = smote.fit_resample(X_train_final.values, y_train)

### Catboost

In [304]:
model = CatBoostClassifier(verbose=100)

In [305]:
model.fit(X_train_final, y_train, eval_set=(X_test_final, y_test))

Learning rate set to 0.024756
0:	learn: 0.6559245	test: 0.6542785	best: 0.6542785 (0)	total: 4.58ms	remaining: 4.58s
100:	learn: 0.0340621	test: 0.1955811	best: 0.1898580 (70)	total: 418ms	remaining: 3.72s
200:	learn: 0.0141884	test: 0.2201026	best: 0.1898580 (70)	total: 857ms	remaining: 3.41s
300:	learn: 0.0080522	test: 0.2432631	best: 0.1898580 (70)	total: 1.3s	remaining: 3.01s
400:	learn: 0.0047656	test: 0.2707556	best: 0.1898580 (70)	total: 1.72s	remaining: 2.57s
500:	learn: 0.0034564	test: 0.2886181	best: 0.1898580 (70)	total: 2.15s	remaining: 2.15s
600:	learn: 0.0030346	test: 0.2973651	best: 0.1898580 (70)	total: 2.58s	remaining: 1.71s
700:	learn: 0.0025874	test: 0.3048476	best: 0.1898580 (70)	total: 3s	remaining: 1.28s
800:	learn: 0.0023280	test: 0.3074712	best: 0.1898580 (70)	total: 3.41s	remaining: 847ms
900:	learn: 0.0023206	test: 0.3076891	best: 0.1898580 (70)	total: 3.81s	remaining: 419ms
999:	learn: 0.0023116	test: 0.3077396	best: 0.1898580 (70)	total: 4.21s	remaining: 0us

<catboost.core.CatBoostClassifier at 0x1eaf4140eb0>

In [352]:
th = 0.1

In [353]:
predictions = model.predict_proba(X_test_final)[:,1] > th

In [354]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.97      0.92      0.94       100
           1       0.20      0.40      0.27         5

    accuracy                           0.90       105
   macro avg       0.58      0.66      0.61       105
weighted avg       0.93      0.90      0.91       105



In [355]:
confusion_matrix(y_test, predictions)

array([[92,  8],
       [ 3,  2]], dtype=int64)