# Подготовка данных пациентов

In [1]:
# standard
import pandas as pd
import numpy as np
import re
import seaborn as sbs
import matplotlib.pyplot as plt
import matplotlib
from tqdm import notebook
sbs.set_style("darkgrid")

# text processing
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from pymystem3 import Mystem

# preprocessing/processing
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# model selection
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, cross_validate,  StratifiedKFold
import imblearn
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# metrics
from sklearn.metrics import classification_report, confusion_matrix, make_scorer
from sklearn.metrics import recall_score, precision_score, accuracy_score, roc_auc_score
from sklearn.metrics import recall_score, f1_score, precision_score

# base models
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# model building
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


## Загрузка данных

In [2]:
file_path = 'chd_addmit_300.xlsx'

In [3]:
patient_data = pd.read_excel(file_path)

In [4]:
patient_data.head()

Unnamed: 0,admittion,discharge,sex,height,weight,BMI,BSA,birth,Операции (все в ИБ),Перенесенные опер. (из Анамн.),...,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41
0,2016-12-12,2017-01-10,m,76,9.7,111.27,0.46,02.01.2016,12.12.2016: (Откр./ИК) Перевязка ранее наложен...,,...,,,,,,,,,,
1,2017-01-13,2017-02-01,f,67,7.34,89.67,0.37,02.02.2016,18.01.2017: (Откр./ИК) Радикальная коррекция д...,15.08.2016г.: Транслюминальная балонная вальву...,...,,,,,,,,,,
2,2017-01-17,2017-02-09,m,74,8.9,103.46,0.43,21.02.2016,19.01.2017: (Откр./ИК) Перевязка ранее наложен...,29.02.2016 - подключично-легочный анастомоз сп...,...,,,,,,,,,,
3,2017-01-20,2017-02-21,f,67,6.97,85.15,0.36,02.03.2016,23.01.2017: (Откр./ИК) Наложение двустороннего...,,...,,,,,,,,,,
4,2017-02-13,2017-03-01,m,82,9.29,102.59,0.46,08.03.2016,15.02.2017: (Откр./ИК) Реконструкция путей отт...,13.04.2016. НАЗВАНИЕ ОПЕРАЦИИ: Транслюминальна...,...,,,,,,,,,,


In [5]:
patient_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 42 columns):
 #   Column                                           Non-Null Count  Dtype         
---  ------                                           --------------  -----         
 0   admittion                                        300 non-null    datetime64[ns]
 1   discharge                                        300 non-null    datetime64[ns]
 2   sex                                              300 non-null    object        
 3   height                                           300 non-null    int64         
 4   weight                                           300 non-null    float64       
 5   BMI                                              298 non-null    float64       
 6   BSA                                              298 non-null    float64       
 7   birth                                            300 non-null    object        
 8   Операции (все в ИБ)                     

In [6]:
patient_data.describe()

Unnamed: 0,height,weight,BMI,BSA,target,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,...,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41
count,300.0,300.0,298.0,298.0,300.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,64.89,6.29356,76.83443,0.336477,0.046667,,,,,,...,,,,,,,,,,
std,8.076012,1.951695,20.640196,0.073846,0.211276,,,,,,...,,,,,,,,,,
min,39.0,1.27,20.34,0.12,0.0,,,,,,...,,,,,,,,,,
25%,60.0,4.8685,62.0075,0.28,0.0,,,,,,...,,,,,,,,,,
50%,65.5,6.4425,78.43,0.35,0.0,,,,,,...,,,,,,,,,,
75%,71.0,7.65125,91.7975,0.39,0.0,,,,,,...,,,,,,,,,,
max,85.0,12.0,137.65,0.52,1.0,,,,,,...,,,,,,,,,,


Успешно создан датасет. Необходимо изменить тип данных некоторых признаков, убрать пропуски.

## Предобработка

### Main dataset

In [7]:
main_dataset = pd.DataFrame()

In [8]:
main_dataset[['sex',
              'height',
              'weight',
              'BMI',
              'BSA',
              'medication',
              'diagnosis',
              'icd',
              'target']] = patient_data[['sex',
                                         'height',
                                         'weight',
                                         'BMI',
                                         'BSA',
                                         'Медикам. леч. по поводу осн. забол. (из Анамн.)',
                                         'Диагноз',
                                         'МКБ',
                                         'target']]

In [9]:
patient_data['birth'] = pd.to_datetime(patient_data['birth'])
patient_data['Дата опер.'] = pd.to_datetime(patient_data['Дата опер.'])

In [10]:
main_dataset['age'] = (patient_data['Дата опер.'] - patient_data['birth']).dt.days.abs()

In [11]:
main_dataset

Unnamed: 0,sex,height,weight,BMI,BSA,medication,diagnosis,icd,target,age
0,m,76,9.700,111.27,0.46,"дигоксин, панангин, аспирин-кардио, верошпирон...",Атрезия легочной артерии 1 тип ; Перимембраноз...,Q25.5,0,315
1,f,67,7.340,89.67,0.37,"дигоксин, верошпирон коротким курсом в периоде...",двойное отхождение аорты и легочной артерии от...,"Q20.1, Q21.0, Q22.1",0,351
2,m,74,8.900,103.46,0.43,"дигоксин, панангин, аспирин","Атрезия легочной артерии, тип II. Дефект межже...",Q25.5,0,333
3,f,67,6.970,85.15,0.36,"бисептол, бифидум, гепарин, глюкоза, кальция г...",Двойное отхождение магистральных сосудов от пр...,Q20.4,0,355
4,m,82,9.290,102.59,0.46,"верошпирон, беталок",Комбинированный стеноз клапана легочной артери...,Q22.1,0,196
...,...,...,...,...,...,...,...,...,...,...
295,f,74,7.430,86.37,0.39,"верошпирон, аспаркам","Частичный открытй атриовентрикулярный канал, о...",Q21.2,0,380
296,f,48,2.334,33.69,0.18,,Дефект межжелудочковой перегородки подаортальн...,Q21.0,0,113
297,m,51,3.120,43.69,0.21,"Верошпирпон, Аспаркам, Анаприлин",Отхождение магистральных сосудов от правого же...,"Q20.1, Q25.6",0,308
298,m,65,6.800,84.34,0.35,,Двойное отхождение сосудов от правого желудочк...,Q20.1,0,296


### Уберем пропуски

In [12]:
main_dataset.isnull().sum()

sex            0
height         0
weight         0
BMI            2
BSA            2
medication    47
diagnosis      0
icd            0
target         0
age            0
dtype: int64

In [13]:
main_dataset['medication'] = main_dataset['medication'].fillna('')

In [14]:
main_dataset = main_dataset.dropna()

In [15]:
main_dataset.isnull().sum()

sex           0
height        0
weight        0
BMI           0
BSA           0
medication    0
diagnosis     0
icd           0
target        0
age           0
dtype: int64

### Обработка текста

In [16]:
m = Mystem()

# Леммантизирует текст
def lemmatize(text):
    return "".join(m.lemmatize(text))

def clear_text(text):
    cleaned = re.sub(r'[^а-яА-Яa-zA-ZёЁ ]', ' ', text)
    cleaned = cleaned.split()
    return ' '.join(cleaned)

In [17]:
%%time

main_dataset['diagnosis'] = main_dataset['diagnosis'].apply(lambda x: clear_text(lemmatize(x)))

CPU times: total: 469 ms
Wall time: 3min 51s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [18]:
%%time

main_dataset['medication'] = main_dataset['medication'].apply(lambda x: clear_text(lemmatize(x)))

CPU times: total: 609 ms
Wall time: 3min 27s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [19]:
main_dataset['icd'] = main_dataset['icd'].apply(lambda x: ' '.join(x.split(',')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_dataset['icd'] = main_dataset['icd'].apply(lambda x: ' '.join(x.split(',')))


In [20]:
main_dataset.sample(5)

Unnamed: 0,sex,height,weight,BMI,BSA,medication,diagnosis,icd,target,age
42,m,68,5.6,67.91,0.33,аспирин кардио дигоксин,атрезия легочный артерия тип подаортальный деф...,Q21.0 Q22.0,1,189
122,f,71,5.6,66.46,0.33,гентамицин ампициллин амписид дигоксин пананги...,атрезия легочный артерия тип подаортальный деф...,Q22.0,0,356
207,m,69,7.58,91.25,0.38,,комбинированный стеноз легочный артерия дефект...,Q22.1,0,565
151,f,65,4.9,60.78,0.3,верошпирон дигоксин аспаркам,дефект межжелудочковый перегородка открывать о...,Q21.0,0,169
197,f,69,7.655,92.16,0.39,верошпирон постоянно с мес капотный курс с мес,вторичный дефект межпредсердный перегородка НК...,Q21.1,0,222


### Категории

In [21]:
main_dataset['sex'] = pd.get_dummies(main_dataset['sex'], drop_first=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_dataset['sex'] = pd.get_dummies(main_dataset['sex'], drop_first=True)


In [22]:
main_dataset.sample(5)

Unnamed: 0,sex,height,weight,BMI,BSA,medication,diagnosis,icd,target,age
196,0,72,9.3,109.6,0.43,цефтриаксон аципол називин протаргол,выраженный клапанный стеноз легочный артерия м...,Q25.6,0,585
121,0,65,5.1,63.26,0.31,дигоксин панангин верошпирон,перимембранозный дефект межжелудочковый перего...,Q21.0 Q25.0,0,281
188,0,61,4.27,54.67,0.27,лазикс верошпирон аспаркам дигоксин,дефект межжелудочковый перегородка открывать о...,Q21.0,0,71
189,0,55,5.74,77.4,0.3,верошпирон элькар,клапанный стеноз легочный артерия открывать ов...,Q25.6,0,261
83,0,63,5.6,70.55,0.32,дигоксин верошпирон капотный аспаркам,общий открытый атриовентрикулярный канал тип а...,Q21.2,0,107


### Обучение

#### Column Transformer

In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Стивен\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
stopwords = nltk_stopwords.words('russian')

In [25]:
num_features = ['sex', 'height', 'weight', 'BMI', 'BSA', 'age']

In [26]:
preprocessor = ColumnTransformer(
    transformers=[
        ('diagnosis', TfidfVectorizer(stop_words=stopwords), 'diagnosis'),
        ('medication', TfidfVectorizer(stop_words=stopwords), 'medication'),
        ('num', StandardScaler(), num_features),
        ('icd', CountVectorizer(), 'icd')
    ],
    remainder='passthrough'
)

In [27]:
preprocessor.fit_transform(main_dataset).shape

(298, 1118)

In [28]:
X, y = main_dataset.drop(['target'], axis=1), main_dataset['target']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [30]:
X_train.shape

(208, 9)

In [31]:
X_test.shape

(90, 9)

In [32]:
y_train.value_counts()

0    198
1     10
Name: target, dtype: int64

In [33]:
y_test.value_counts()

0    86
1     4
Name: target, dtype: int64

### Выбор модели

Сделаем пайплайн на примере кэтбуст. Проверим на кросс валидации

### Pipeline + RandomSearch

In [34]:
def create_pipeline(clf):
    pipeline = imbpipeline(
        steps=[
            ('preprocess', preprocessor),
            ('smote', SMOTE()),
            ('clf', clf)
        ]
    )
    return pipeline

In [35]:
def random_search(clf, params, n_iter=10):
    clf = create_pipeline(clf)
    return RandomizedSearchCV(clf,
                              params,
                              scoring='neg_log_loss',
                              n_jobs=-1,
                              n_iter=n_iter,
                              cv=StratifiedKFold(n_splits=3, shuffle=True),
                              verbose=5
                             )

In [36]:
def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')

def thresholds_argmax(model, x, y):
    thresholds = np.arange(0, 0.9, 0.001)
    probs = model.predict_proba(x)[:, 1]
    scores = [roc_auc_score(y, to_labels(probs, t)) for t in thresholds]
    ix = np.argmax(scores)
    threshold = thresholds[ix]
    return (threshold)

In [37]:
def fit_data(model, show_feature_importance=False):
    clf_name = model.estimator['clf'].__class__.__name__
    print(f'Classifier name: {clf_name}')
    
    model = model.fit(X_train, y_train)
    print(f'Training finished!', '-'*100, sep='\n')
    print(f'Best score: = {model.best_score_}')
    print(f'Best parameters {model.best_params_}')
    
    if show_feature_importance:   
        try:
            if clf_name == 'CatBoostClassifier':
                feature_importance = model.best_estimator_['clf'].get_feature_importance()
            elif clf_name == 'XGBClassifier':
                feature_importance = model.best_estimator_['clf'].feature_importances_
            else:
                feature_importance = feature_importance = model.best_estimator_['clf'].feature_importance_
            main_features = pd.DataFrame(data = feature_importance[-5:], index=X.columns[:5]).sort_values(by=0)
            
            plt.figure(figsize=(10, 7))
            main_features.plot(kind='barh', ax=plt.gca())
            plt.title('Feature Importance without operations')
            plt.xlabel('Importance, %')
            plt.show()
        except:
            print('no feature importance')
    
    best_model = model.best_estimator_.fit(X_train, y_train)
    th = thresholds_argmax(best_model, X_test, y_test)
    print(f'Threshold: {th}')
    predictions = best_model.predict_proba(X_test)[:,1] > th
    print(f'Confusion Matrix:\n {confusion_matrix(y_test, predictions)}')
    print(classification_report(y_test, predictions))
    f1 = f1_score(y_test, predictions, labels=[1])
    precision = precision_score(y_test, predictions, labels=[1])
    recall = recall_score(y_test, predictions, labels=[1])
    
    return best_model, th, abs(model.best_score_), f1, precision, recall

### Перебор моделей

In [38]:
def look_through_models(models):
    names = []
    best_models = []
    data_list = []
    for model in notebook.tqdm(models):
        name = model.estimator['clf'].__class__.__name__
        try:
            data = fit_data(model)
            names.append(name)
            best_models.append(data[0])
            data_list.append(data[1:])
        except ValueError as e:
            print(f'Classifier {name} threw an exception: {e}')
        print('_'*100)
    
    data_df = pd.DataFrame(data=data_list, index=names)
    display(data_df)
    data_df[0].sort_values(0).plot(kind='barh', ax=plt.gca())
    plt.xlabel('Threshold')
    plt.show()
    data_df[1].sort_values(0, ascending=False).plot(kind='barh', ax=plt.gca())
    plt.xlabel('Log loss score')
    plt.show()
    data_df[2].sort_values(0).plot(kind='barh', ax=plt.gca())
    plt.xlabel('f1 score')
    plt.show()
    data_df[3].sort_values(0).plot(kind='barh', ax=plt.gca())
    plt.xlabel('precision score')
    plt.show()
    data_df[4].sort_values(0).plot(kind='barh', ax=plt.gca())
    plt.xlabel('recall score')
    plt.show()
    
    return best_models

#### Models

In [39]:
models = []

In [40]:
parameters = {
    'preprocess__diagnosis__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 2)),
    'preprocess__diagnosis__max_df': (0.25, 0.5, 0.75, 1.0),
    'preprocess__diagnosis__max_features': range(25, 201, 25),
    'preprocess__medication__max_df': (0.25, 0.5, 0.75, 1.0),
    'smote__sampling_strategy': np.arange(0.2, 0.8, 0.1),
}

def add_params(params):
    result = parameters.copy()
    for key, value in params.items():
        result[key] = value
    return result

In [41]:
#Tree
params = {
    'clf__max_depth': range(3, 10),
    'clf__class_weight': [None, 'balanced']
}

svc = random_search(DecisionTreeClassifier(),
                          add_params(params),
                          150)
models.append(svc)

In [42]:
#SGD
params = {
    'clf__class_weight': [None, 'balanced']
}

svc = random_search(SGDClassifier(loss='log_loss'),
                          add_params(params),
                          150)
models.append(svc)

In [43]:
#KNN
params = {
    'clf__n_neighbors': range(3, 10)
}

svc = random_search(KNeighborsClassifier(),
                          add_params(params),
                          150)
models.append(svc)

In [44]:
#Forest
params = {
    'clf__max_depth': range(3, 10),
    'clf__n_estimators': range(200, 2000, 200),
    'clf__class_weight': [None, 'balanced']
}

svc = random_search(RandomForestClassifier(),
                          add_params(params),
                          200)
models.append(svc)

In [45]:
#Logistic
params = {
    'clf__C': [.1, 1, 10, 100, 1000],
    'clf__class_weight': [None, 'balanced']
}

svc = random_search(LogisticRegression(),
                          add_params(params),
                          150)
models.append(svc)

In [46]:
#SVC
params = {
    'clf__gamma': [.1,.5,1,2,5,10],
    'clf__C': [.1, 1, 10, 100, 1000],
    'clf__class_weight': [None, 'balanced']
}

svc = random_search(SVC(probability=True, kernel='rbf'),
                          add_params(params),
                          150)
models.append(svc)

In [47]:
#LightGBM
params = {
    'clf__max_depth': range(2, 11),
    'clf__n_estimators': range(100, 1100, 100)
}

xgb_search = random_search(LGBMClassifier(),
                          add_params(params),
                          300)
models.append(xgb_search)

In [48]:
#XGB
params = {
    'clf__max_depth': range(2, 11),
    'clf__n_estimators': range(100, 1100, 100)
}

xgb_search = random_search(XGBClassifier(),
                          add_params(params),
                          300)
models.append(xgb_search)

In [49]:
#CatBoost
params = {
    'clf__depth': range(2, 11)
}

catboost_search = random_search(CatBoostClassifier(silent=True, iterations=750),
                                       add_params(params),
                                       100)
models.append(catboost_search)

#### Results

In [None]:
look_through_models(models);

In [None]:
a = 5