In [1]:
# Disabling warnings
# Отключение отображения предупреждений
import sys
import os
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

In [2]:
# Импорт библиотек
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime as dt
from datetime import time

from sklearn.linear_model import LogisticRegression

from copy import deepcopy


from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve

from imblearn.over_sampling import RandomOverSampler,  SMOTE
 
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier


In [3]:
# Настройка pandas для того, чтобы отображались все колонки
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Feature selection class to eliminate multicollinearity
# Выбор и удаление признаков скоррелированных между собой (борьба с мультиколлинеарносстью)
class MultiCollinearityEliminator():

    # Class initialisation
    # Инициализация класса
    def __init__(self, df, target, threshold):
        self.df = df
        self.target = target
        self.threshold = threshold

    # Method to create and return the feature correlation matrix dataframe
    # Метод создающий и возращающий корреляционную матрицу признаков датафрейма
    def createCorrMatrix(self, include_target=False):
        # Checking we should include the target in the correlation matrix
        if (include_target == False):
            df_temp = self.df.drop([self.target], axis=1)

            # Setting method to Pearson to prevent issues in case the default method for df.corr() gets changed
            # Setting min_period to 30 for the sample size to be statistically significant (normal) according to
            # central limit theorem
            corrMatrix = df_temp.corr(method='pearson', min_periods=30).abs()

        # Target is included for creating the series of feature to target correlation - Please refer the notes under the
        # print statement to understand why we create the series of feature to target correlation
        elif (include_target == True):
            corrMatrix = self.df.corr(method='pearson', min_periods=30).abs()

        return corrMatrix

    # Method to create and return the feature to target correlation matrix dataframe
    # Метод создающий и возвращающий корреляционную матрицу признаков с целевой переменной
    def createCorrMatrixWithTarget(self):

        # After obtaining the list of correlated features, this method will help to view which variables
        # (in the list of correlated features) are least correlated with the target
        # This way, out the list of correlated features, we can ensure to elimate the feature that is
        # least correlated with the target
        # This not only helps to sustain the predictive power of the model but also helps in reducing model complexity

        # Obtaining the correlation matrix of the dataframe (along with the target)
        corrMatrix = self.createCorrMatrix(include_target=True)

        # Creating the required dataframe, then dropping the target row
        # and sorting by the value of correlation with target (in asceding order)
        corrWithTarget = pd.DataFrame(corrMatrix.loc[:, self.target]).drop(
            [self.target], axis=0).sort_values(by=self.target)
        #print(corrWithTarget, '\n')
        return corrWithTarget

    # Method to create and return the list of correlated features
    # Метод создающий и вовзращающий лист скоррелированных признаков
    def createCorrelatedFeaturesList(self):
        # Obtaining the correlation matrix of the dataframe (without the target)
        corrMatrix = self.createCorrMatrix(include_target=False)
        colCorr = []
        # Iterating through the columns of the correlation matrix dataframe
        for column in corrMatrix.columns:
            # Iterating through the values (row wise) of the correlation matrix dataframe
            for idx, row in corrMatrix.iterrows():
                if(row[column] > self.threshold) and (row[column] < 1):
                    # Adding the features that are not already in the list of correlated features
                    if (idx not in colCorr):
                        colCorr.append(idx)
                    if (column not in colCorr):
                        colCorr.append(column)
        #print(colCorr, '\n')
        return colCorr

    # Method to eliminate the least important features from the list of correlated features
    # Метод удаляющий наименее важные признаки (наименее скоррелированные с целевым признаком) из двух скоррелированных признаков
    def deleteFeatures(self, colCorr):
        # Obtaining the feature to target correlation matrix dataframe
        corrWithTarget = self.createCorrMatrixWithTarget()
        for idx, row in corrWithTarget.iterrows():
            #print(idx, '\n')
            if (idx in colCorr):
                self.df = self.df.drop(idx, axis=1)
                break
        return self.df

    # Method to run automatically eliminate multicollinearity
    # Метод запускающий и удаляющий мультиколлинеарность
    def autoEliminateMulticollinearity(self):
        # Obtaining the list of correlated features
        colCorr = self.createCorrelatedFeaturesList()
        while colCorr != []:
            # Obtaining the dataframe after deleting the feature (from the list of correlated features)
            # that is least correlated with the taregt
            self.df = self.deleteFeatures(colCorr)
            # Obtaining the list of correlated features
            colCorr = self.createCorrelatedFeaturesList()
        return self.df

In [5]:
# Получение диаганальной матрицы корреляций
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

In [6]:
# Вывод списка пар с максимальной корреляцией между собой
def get_top_abs_correlations(df, n=10):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

In [7]:
# Функция считает наибольший f_score при различных порогах отнесения к нулевому и первому классам
def calc_f_score(y_valid, probs):
    precision, recall, threshold = precision_recall_curve(y_valid, probs)
    denom = precision + recall
    # в целях избежания ошибки, если знаменатель f-score равен нулю, то он конвертируется в -1 
    denom[denom == 0] = -1
    f_score = (2 * precision * recall / denom)[:-1]
    return f_score, threshold

In [8]:
# Функция возвращает оптимальный порог отнесения к классу с наивысшим показателем f-score
def calc_optimal_f_score(y_valid, probs):
    f_score, thresholds = calc_f_score(y_valid, probs)
    best_f_score = max(f_score)
    threshold = thresholds[np.argmax(f_score)]
    print(f'best_threshold = {round(threshold, 4)}')
    print(f'best_f_score = {round(best_f_score,4)}')
    return threshold

In [9]:
# Функция, возвращающая датафрейм, в котором исключены признаки имеющие корреляцию с целевым признаком 
# менее заданного значения (по умолчанию: 0.05)
def reduce_low_corrs_with_target(df, target, threshold=0.05, return_predicted=False, X_test=None):    
    mask = df.corr().abs()[target].sort_values() > threshold
    df = df[mask.index[mask]]
    if return_predicted == True:
        test_mask = mask.drop(target)
        X_test = X_test[test_mask.index[test_mask]]
        return df, X_test
    else:
        return df

In [10]:
# Функция возвращающая значения 1 или 0, в зависимости от выполнения условия
def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')

In [11]:
# Функция, возвращающая опттимальный порог отнесения к классу (в данной функции - по метрике recall_macro)
def find_opt_threshold(y_valid, probs):
    # keep probabilities for the positive outcome only
    probs = probs[:, 1]
    # define thresholds
    thresholds = np.arange(0, 1, 0.001)
    # evaluate each threshold
    scores = [recall_score(y_valid, to_labels(probs, t), average='macro') for t in thresholds]
    # get best threshold
    ix = np.argmax(scores)
    best_threshold = thresholds[ix]
    print('Threshold=%.3f, Recall_macro=%.5f' % (thresholds[ix], scores[ix]))
    return best_threshold

## Описание признаков и последующих замен
| Исходный признак | Замена | Описание |
| --- | --- | --- |
|'ID'|***'id'***| идентификатор опрошенного|
|'Пол' | ***'sex'*** | пол |   
|'Семья' | ***'family'*** | семейное положение|
|'Этнос' | ***'ethnos'*** | этнос  |
|'Национальность' | '***'nationality'*** | национальность  |
|'Религия' | ***'religion'*** | отношение к религиозной группе  |
|'Образование' | ***''education'***' | уровень образования  |
|'Профессия' | ***''profession'***' | профессия  |
|'Вы работаете?' | ***'job'*** | наличие работы в данный момент  |
|'Выход на пенсию' | ***'retire'*** | является ли опрошенный пенсионером  |
|'Прекращение работы по болезни | ***'stop_work_due_disease'*** | связано ли прекращение работы с болезнью  |
|'Сахарный диабет' | ***'diabetes'*** | наличие сахарного диабета (любого типа)  |
|'Гепатит' | ***'hepatitis'*** | наличие гепатита  |
|'Онкология' | ***'oncology'*** | наличие онкологических заболеваний  |
|'Хроническое заболевание легких' | ***'chronic_lung_disease'*** | наличие хронического заболевания легких  |
|'Бронжиальная астма' | ***'bronchial_asthma'*** | наличие бронхиальной астмы  |
|'Туберкулез легких ' | ***'tuberculosis'*** | наличие туберкулеза  |
|'ВИЧ/СПИД' |  ***'hiv/aids'*** | наличие ВИЧ/СПИД  |
|'Регулярный прим лекарственных средств' | ***'intake_medicines'*** | факт регулярного приема лекарственных средств | 
|'Травмы за год' | ***'trauma_last_year'*** | факт получения травм за последний год  |
|'Переломы' | ***'fractures'*** | наличие переломов  |
|'Статус Курения' | ***'smoking'*** | статус курения в данный момент  |
|'Возраст курения' | ***'smoking_duration'*** | длительность курения  |
|'Сигарет в день' | ***'ciggaretes_per_day'*** | количество сигарет в день  |
|'Пассивное курение' | ***'passive_smoking'*** | наличие курящих в окружении  |
|'Частота пасс кур' | ***'passive_smoking_frequency'*** | частота нахождения среди курящих  |
|'Алкоголь' | ***'alcohol'*** | отношение к приему алкоголя  |
|'Возраст алког' | ***'alcohol_duration'*** | длительность употребления алкоголя (годы)  |
|'Время засыпания' | ***'time_sleep_onset'*** | время засыпания  |
|'Время пробуждения' | ***'time_sleep_upset'*** | время пробуждения  |
|'Сон после обеда' | ***'midday_sleep'*** | сон в середине дня (после обеда)  |
|'Спорт, клубы' | ***'sport'*** | факт регулярного занятия спортом(фитнесом)  |
|'Религия, клубы' | ***'religion_clubs'*** | факт регулярного посещения религиозных мероприятий, групп  |
|'ID_y' | ***'id_y'*** | идентификатор (связь с таблицей диагнозов)  |
|'Артериальная гипертензия' | ***'arterial_hypertension'*** | наличие артериальной гипертензии  |
|'ОНМК' | ***'stroke'*** | наличие острого нарушения мозгового кровообращения (инсульта)  |
|'Стенокардия, ИБС, инфаркт миокарда' | ***'IHD'*** | наличие ишемической болезни сердца (все формы)  |
|'Сердечная недостаточность' | ***'heart_failure'*** | наличие сердечной недостаточности  |
|'Прочие заболевания сердца' | ***'other_cardio_diseases'*** | наличие других сердечных заболеваний  |

# Подготовка и очистка данных 

In [12]:
df_train = pd.read_csv('data/train.csv')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955 entries, 0 to 954
Data columns (total 39 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   ID                                     955 non-null    object 
 1   Пол                                    954 non-null    object 
 2   Семья                                  955 non-null    object 
 3   Этнос                                  955 non-null    object 
 4   Национальность                         955 non-null    object 
 5   Религия                                955 non-null    object 
 6   Образование                            955 non-null    object 
 7   Профессия                              955 non-null    object 
 8   Вы работаете?                          955 non-null    int64  
 9   Выход на пенсию                        955 non-null    int64  
 10  Прекращение работы по болезни          955 non-null    int64  
 11  Сахарн

In [13]:
df_test = pd.read_csv('data/test_dataset_test.csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638 entries, 0 to 637
Data columns (total 33 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   ID                                     638 non-null    object 
 1   Пол                                    638 non-null    object 
 2   Семья                                  638 non-null    object 
 3   Этнос                                  638 non-null    object 
 4   Национальность                         638 non-null    object 
 5   Религия                                638 non-null    object 
 6   Образование                            638 non-null    object 
 7   Профессия                              638 non-null    object 
 8   Вы работаете?                          638 non-null    int64  
 9   Выход на пенсию                        638 non-null    int64  
 10  Прекращение работы по болезни          638 non-null    int64  
 11  Сахарн

In [14]:
submit = pd.read_csv('data/sample_solution.csv')
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638 entries, 0 to 637
Data columns (total 6 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   ID                                  638 non-null    object
 1   Артериальная гипертензия            638 non-null    int64 
 2   ОНМК                                638 non-null    int64 
 3   Стенокардия, ИБС, инфаркт миокарда  638 non-null    int64 
 4   Сердечная недостаточность           638 non-null    int64 
 5   Прочие заболевания сердца           638 non-null    int64 
dtypes: int64(5), object(1)
memory usage: 30.0+ KB


In [15]:
# Для корректной обработки признаков объединяем тренировочный и тестовый набор данных в один датасет

# выделение целевых значений и датасета
target_train = df_train.iloc[:,-5:]
df_train = df_train.iloc[:,:-5]

# вспомогательный флаг тренировочного и тестовой частей общего набора данных
df_train['train'] = 1
df_test['train'] = 0


# объединение в один датасет
df = pd.concat([df_train, df_test]).reset_index(drop=True)

In [16]:
# Фунция, которая выводит корреляцию признака с целевыми переменными (содержатся в разных датафреймах)
def check_corrs_with_target(df, target_train=target_train, col = df.columns.to_list()):
    df_train = df[df.train == 1]
    df_for_check = pd.concat([df_train, target_train], axis=1)
    cols_target = target_train.columns.to_list()
    #col.extend(cols_target)
    print('corrs_matrix_between: ', col)
    corrs = df_for_check.corr()
    return corrs.loc[col, cols_target]

In [17]:
df.head(1)

Unnamed: 0,ID,Пол,Семья,Этнос,Национальность,Религия,Образование,Профессия,Вы работаете?,Выход на пенсию,Прекращение работы по болезни,Сахарный диабет,Гепатит,Онкология,Хроническое заболевание легких,Бронжиальная астма,Туберкулез легких,ВИЧ/СПИД,Регулярный прим лекарственных средств,Травмы за год,Переломы,Статус Курения,Возраст курения,Сигарет в день,Пассивное курение,Частота пасс кур,Алкоголь,Возраст алког,Время засыпания,Время пробуждения,Сон после обеда,"Спорт, клубы","Религия, клубы",ID_y,train
0,54-102-358-02,М,в браке в настоящее время,европейская,Русские,Христианство,3 - средняя школа / закон.среднее / выше среднего,низкоквалифицированные работники,1,0,0,0,0,0,0,0,0,0,0,0,0,Курит,15.0,20.0,0,,употребляю в настоящее время,18.0,22:00:00,06:00:00,0,0,0,54-102-358-02,1


In [18]:
feature_map = {
    'ID': 'id',
    'Пол': 'sex',
    'Семья': 'family',
    'Этнос': 'ethnos',
    'Национальность': 'nationality',
    'Религия': 'religion',
    'Образование': 'education',
    'Профессия': 'profession',
    'Вы работаете?': 'job',
    'Выход на пенсию': 'retired',
    'Прекращение работы по болезни': 'stop_work_due_disease',
    'Сахарный диабет': 'diabetes',
    'Гепатит': 'hepatitis',
    'Онкология': 'oncology',
    'Хроническое заболевание легких': 'chronic_lung_disease',
    'Бронжиальная астма': 'bronchial_asthma',
    'Туберкулез легких ': 'tuberculosis',
    'ВИЧ/СПИД': 'hiv/aids',
    'Регулярный прим лекарственных средств': 'intake_medicines',
    'Травмы за год': 'trauma_last_year',
    'Переломы': 'fractures',
    'Статус Курения': 'smoking',
    'Возраст курения': 'smoking_duration',
    'Сигарет в день': 'ciggaretes_per_day',
    'Пассивное курение': 'passive_smoking',
    'Частота пасс кур': 'passive_smoking_frequency',
    'Алкоголь': 'alcohol',
    'Возраст алког': 'alcohol_duration',
    'Время засыпания': 'time_sleep_onset',
    'Время пробуждения': 'time_sleep_upset',
    'Сон после обеда': 'midday_sleep',
    'Спорт, клубы': 'sport',
    'Религия, клубы': 'religion_clubs',
    'ID_y': 'id_y',
    'Артериальная гипертензия': 'arterial_hypertension',
    'ОНМК': 'stroke',
    'Стенокардия, ИБС, инфаркт миокарда': 'IHD',
    'Сердечная недостаточность': 'heart_failure',
    'Прочие заболевания сердца': 'other_cardio_diseases'}

In [19]:
df.rename(feature_map, axis=1, inplace=True)

In [20]:
df.columns

Index(['id', 'sex', 'family', 'ethnos', 'nationality', 'religion', 'education',
       'profession', 'job', 'retired', 'stop_work_due_disease', 'diabetes',
       'hepatitis', 'oncology', 'chronic_lung_disease', 'bronchial_asthma',
       'tuberculosis', 'hiv/aids', 'intake_medicines', 'trauma_last_year',
       'fractures', 'smoking', 'smoking_duration', 'ciggaretes_per_day',
       'passive_smoking', 'passive_smoking_frequency', 'alcohol',
       'alcohol_duration', 'time_sleep_onset', 'time_sleep_upset',
       'midday_sleep', 'sport', 'religion_clubs', 'id_y', 'train'],
      dtype='object')

In [21]:
target_train.rename(feature_map, axis=1, inplace=True)

In [22]:
target_train.columns

Index(['arterial_hypertension', 'stroke', 'IHD', 'heart_failure',
       'other_cardio_diseases'],
      dtype='object')

In [23]:
#Проверка на совпадение ID в train
print((df[df.train==1].id == df[df.train==1].id_y).sum() / len(df[df.train==1]))
#Проверка на совпадение ID в test и submit
print((df[df.train==0].id.reset_index(drop=True) == submit.ID).sum() / len(submit))

1.0
1.0


In [24]:
df.id[:10]

0    54-102-358-02
1    54-103-101-01
2    54-501-026-03
3    54-501-094-02
4    54-503-022-01
5    54-002-074-01
6    54-602-027-01
7    54-102-072-01
8    54-102-145-01
9    54-002-002-01
Name: id, dtype: object

# ADD FEATURE unknown_id_feature, id_1, id_2, id_3, id_4

Последние цифры id могут характеризовать все что угодно (возможно, не имеют ценности)
Предположительно это могут быть:
1. Место проведение опроса (улица, поликлиника, больница, общественное место) 
2. Источник опроса (опросил сотрудник, бумажный носитель, интернет-анкета и др. 
3. id сотрудника, проводившего опрос  

Место и источник могут влиять на результат опроса


In [25]:
# df['unknown_id_feature_1'] = df['id'].apply(lambda x: int(x[0:2]))
# df['unknown_id_feature_2'] = df['id'].apply(lambda x: int(x[3:6]))
# df['unknown_id_feature_3'] = df['id'].apply(lambda x: int(x[7:10]))


df['unknown_id_feature'] = df['id'].apply(lambda x: int(x[-1:]))


id_features = pd.get_dummies(df['unknown_id_feature'], prefix='id')

df = pd.concat([df, id_features], axis=1)

In [26]:
df.drop(['id', 'id_y'], axis=1, inplace=True)

In [27]:
# total amount of missing data
# Общее количество пропусков в данных
df.isna().sum()

sex                             1
family                          0
ethnos                          0
nationality                     0
religion                        0
education                       0
profession                      0
job                             0
retired                         0
stop_work_due_disease           0
diabetes                        0
hepatitis                       0
oncology                        0
chronic_lung_disease            0
bronchial_asthma                0
tuberculosis                    0
hiv/aids                        0
intake_medicines                0
trauma_last_year                0
fractures                       0
smoking                         0
smoking_duration              912
ciggaretes_per_day            919
passive_smoking                 0
passive_smoking_frequency    1216
alcohol                         0
alcohol_duration              304
time_sleep_onset                0
time_sleep_upset                0
midday_sleep  

### Колонка sex

In [28]:
df.sex.value_counts(dropna=False)

Ж      1117
М       475
NaN       1
Name: sex, dtype: int64

In [29]:
# Заполнение пропуска модой (в данном случае - Ж)
df.sex.fillna(df.sex.mode()[0], inplace=True)

In [30]:
df.sex.value_counts(dropna=False)

Ж    1118
М     475
Name: sex, dtype: int64

In [31]:
df.sex = np.where(df.sex == 'Ж', 0, 1)

In [32]:
# Оценка корреляции с целевыми признаками
check_corrs_with_target(df, col = ['sex'])

corrs_matrix_between:  ['sex']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
sex,-0.119174,0.075247,0.005842,-0.08241,-0.086526


### Колонка family

In [33]:
df.family.value_counts(dropna=False)

в браке в настоящее время                          937
вдовец / вдова                                     246
в разводе                                          201
гражданский брак / проживание с партнером          129
никогда не был(а) в браке                           77
раздельное проживание (официально не разведены)      3
Name: family, dtype: int64

In [34]:
# Создание отдельного датафрейма со статусами семейного положение
family = pd.get_dummies(df.family)
print('before replace: ', list(family.columns))

# Словарь для замены названий колонок
fam_cols_map = {'в браке в настоящее время': 'married', 
                'в разводе': 'divorced', 
                'вдовец / вдова': 'widowed',
                'гражданский брак / проживание с партнером': 'cohabited', 
                'никогда не был(а) в браке': 'single',
                'раздельное проживание (официально не разведены)': 'separated'}

# Замена названий колонок в датафрейме
family.rename(fam_cols_map, axis=1, inplace=True)
print('after replace: ', list(family.columns))

before replace:  ['в браке в настоящее время', 'в разводе', 'вдовец / вдова', 'гражданский брак / проживание с партнером', 'никогда не был(а) в браке', 'раздельное проживание (официально не разведены)']
after replace:  ['married', 'divorced', 'widowed', 'cohabited', 'single', 'separated']


In [35]:
# Добавление статусов о семейном положении в основной датафрейм
df = pd.concat([df, family], axis=1)
df.drop('family', axis=1, inplace=True)

In [36]:
# Матрица корреляций статусов семейного положения и целевых переменных
fam_cols = family.columns
check_corrs_with_target(df, col = fam_cols)

corrs_matrix_between:  Index(['married', 'divorced', 'widowed', 'cohabited', 'single', 'separated'], dtype='object')


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
married,-0.080766,0.01921,-0.025806,-0.046685,-0.02751
divorced,-0.014791,0.009703,-0.04085,-0.005838,0.029754
widowed,0.189502,0.012461,0.093806,0.132974,0.032011
cohabited,-0.03729,-0.026097,-0.01946,-0.049826,-0.001516
single,-0.051434,-0.045989,-0.004129,-0.039003,-0.033028
separated,-0.042882,-0.009703,-0.017117,-0.015315,-0.014411


### Колонка ethnos

In [37]:
df.ethnos.value_counts(dropna=False)

европейская                                                                                                      1559
другая азиатская (Корея, Малайзия, Таиланд, Вьетнам, Казахстан, Киргизия, Туркмения, Узбекистан, Таджикистан)      24
прочее (любая иная этно-расовая группа, не представленная выше)                                                    10
Name: ethnos, dtype: int64

In [38]:
ethnos = pd.get_dummies(df.ethnos)
print('before replace: ', list(ethnos.columns))

ethnos_col_map = {'другая азиатская (Корея, Малайзия, Таиланд, Вьетнам, Казахстан, Киргизия, Туркмения, Узбекистан, Таджикистан)': 'asian',
                  'европейская': 'european',
                  'прочее (любая иная этно-расовая группа, не представленная выше)': 'other'}

ethnos.rename(ethnos_col_map, axis=1, inplace=True)

print('after replace: ', list(ethnos.columns))

before replace:  ['другая азиатская (Корея, Малайзия, Таиланд, Вьетнам, Казахстан, Киргизия, Туркмения, Узбекистан, Таджикистан)', 'европейская', 'прочее (любая иная этно-расовая группа, не представленная выше)']
after replace:  ['asian', 'european', 'other']


In [39]:
df = pd.concat([df, ethnos], axis=1)
df.drop('ethnos', axis=1, inplace=True)

In [40]:
ethnos_col = ethnos.columns
check_corrs_with_target(df, col=ethnos_col)

corrs_matrix_between:  Index(['asian', 'european', 'other'], dtype='object')


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
asian,0.000964,-0.028513,0.022155,0.007666,-0.042351
european,0.002794,0.034006,-0.001218,0.009179,0.050509
other,-0.006621,-0.0182,-0.032108,-0.028727,-0.027032


### Колонка nationality

In [41]:
df.nationality.value_counts(dropna=False)

Русские                  1500
Татары                     33
Немцы                      10
Украинцы                   10
Чуваши                      9
Азербайджанцы               4
Белорусы                    3
Другие национальности       3
Казахи                      3
Армяне                      3
Удмурты                     2
Евреи                       2
Таджики                     2
Киргизы                     2
Мордва                      2
Башкиры                     1
Молдаване                   1
Буряты                      1
Эстонцы                     1
Лезгины                     1
Name: nationality, dtype: int64

In [42]:
# Добавление колонки is_russian, так как количество других национальностей слишком мало
df['is_russian'] = np.where(df['nationality'] == 'Русские', 1, 0)

In [43]:
# Удаление исходной колонки
df.drop('nationality', axis=1, inplace=True)

In [44]:
check_corrs_with_target(df, col=['is_russian'])

corrs_matrix_between:  ['is_russian']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
is_russian,-0.016501,0.008887,-0.056265,-0.035137,0.016238


### Колонка religion

In [45]:
df.religion.value_counts(dropna=False)

Христианство         1361
Атеист / агностик     127
Нет                    73
Ислам                  30
Другое                  1
Индуизм                 1
Name: religion, dtype: int64

In [46]:
religion = pd.get_dummies(df.religion)
print('before replace: ', list(religion.columns))

religion_cols_map = {'Атеист / агностик':'atheist', 
                     'Другое':'other', 
                     'Индуизм':'hinduist', 
                     'Ислам':'muslim', 
                     'Нет':'no', 
                     'Христианство':'christian'}


religion.rename(religion_cols_map, axis=1, inplace=True)
print('after replace: ', list(religion.columns))

before replace:  ['Атеист / агностик', 'Другое', 'Индуизм', 'Ислам', 'Нет', 'Христианство']
after replace:  ['atheist', 'other', 'hinduist', 'muslim', 'no', 'christian']


In [47]:
df = pd.concat([df, religion], axis=1)

In [48]:
religion_cols = religion.columns
check_corrs_with_target(df, col=religion_cols)

corrs_matrix_between:  Index(['atheist', 'other', 'hinduist', 'muslim', 'no', 'christian'], dtype='object')


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
atheist,-0.045189,-0.030536,-0.017138,-0.032177,-0.035071
other,-0.006621,-0.0182,-0.032108,-0.028727,-0.027032
other,,,,,
hinduist,,,,,
muslim,-0.016203,-0.034006,0.001218,-0.009179,-0.050509
no,-0.011883,-0.019373,0.015392,0.015093,-0.01249
christian,0.049008,0.049311,0.004372,0.020798,0.056125


In [49]:
#Ввиду отсутствия значимых корреляций, решено оставить только колонку christian.
df.drop(['atheist', 'hinduist','muslim','no','other'], axis=1, inplace=True)

In [50]:
df.drop('religion', axis=1, inplace=True)

In [51]:
check_corrs_with_target(df, col='christian')

corrs_matrix_between:  christian


arterial_hypertension    0.049008
stroke                   0.049311
IHD                      0.004372
heart_failure            0.020798
other_cardio_diseases    0.056125
Name: christian, dtype: float64

### Колонка education

In [52]:
df.education.value_counts(dropna=False)

4 - профессиональное училище                         757
5 - ВУЗ                                              547
3 - средняя школа / закон.среднее / выше среднего    279
2 - начальная школа                                   10
Name: education, dtype: int64

In [53]:
# Приведение к порядковому признаку начинающемуся с 1
df.education = df.education.apply(lambda x: int(x[0])-1)

In [54]:
check_corrs_with_target(df, col='education')

corrs_matrix_between:  education


arterial_hypertension   -0.106846
stroke                  -0.064249
IHD                     -0.050195
heart_failure           -0.080554
other_cardio_diseases    0.016931
Name: education, dtype: float64

### Колонка profession

In [55]:
df.profession.value_counts(dropna=False)

дипломированные специалисты                                                            382
низкоквалифицированные работники                                                       225
работники,  занятые в сфере обслуживания, торговые работники магазинов и рынков        205
ремесленники и представители других отраслей промышленности                            149
служащие                                                                               142
техники и младшие специалисты                                                          140
ведение домашнего хозяйства                                                            133
операторы и монтажники установок и машинного оборудования                              128
представители   законодат.   органов   власти,  высокопостав. долж.лица и менеджеры     55
квалифицированные работники сельского хозяйства и рыболовного                           25
вооруженные силы                                                                         9

In [56]:
profession = pd.get_dummies(df.profession)
profession.columns

Index(['ведение домашнего хозяйства', 'вооруженные силы',
       'дипломированные специалисты',
       'квалифицированные работники сельского хозяйства и рыболовного',
       'низкоквалифицированные работники',
       'операторы и монтажники установок и машинного оборудования',
       'представители   законодат.   органов   власти,  высокопостав. долж.лица и менеджеры',
       'работники,  занятые в сфере обслуживания, торговые работники магазинов и рынков',
       'ремесленники и представители других отраслей промышленности',
       'служащие', 'техники и младшие специалисты'],
      dtype='object')

In [57]:
profession_cols_map = {'ведение домашнего хозяйства': 'household',
                       'вооруженные силы': 'military',
                       'дипломированные специалисты': 'senior_specialist',
                       'квалифицированные работники сельского хозяйства и рыболовного': 'farmer/fisherman',
                       'низкоквалифицированные работники': 'low_qualified',
                       'операторы и монтажники установок и машинного оборудования': 'hard_industry',
                       'представители   законодат.   органов   власти,  высокопостав. долж.лица и менеджеры': 'top_management',
                       'работники,  занятые в сфере обслуживания, торговые работники магазинов и рынков': 'service',
                       'ремесленники и представители других отраслей промышленности': 'crafters_other',
                       'служащие': 'office',
                       'техники и младшие специалисты': 'junior_specialists'}

profession.rename(profession_cols_map, axis=1, inplace=True)

In [58]:
df = pd.concat([df, profession], axis=1)
df.drop('profession', axis=1, inplace=True)

In [59]:
profession_cols = profession.columns
check_corrs_with_target(df, col=profession_cols)

corrs_matrix_between:  Index(['household', 'military', 'senior_specialist', 'farmer/fisherman',
       'low_qualified', 'hard_industry', 'top_management', 'service',
       'crafters_other', 'office', 'junior_specialists'],
      dtype='object')


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
household,0.061559,0.009688,0.05819,0.098212,-0.016993
military,0.019338,-0.015365,-0.027108,-0.024253,0.027865
senior_specialist,-0.049845,-0.020203,-0.004255,-0.037893,0.049462
farmer/fisherman,0.064455,0.049622,0.07046,0.060338,0.012978
low_qualified,-0.008645,-0.028277,0.00993,-0.048256,-0.056464
hard_industry,-0.092412,0.017783,-0.022026,-0.016324,-0.089831
top_management,-0.01503,-0.04131,-0.055884,-0.046672,-0.041891
service,-0.009041,-0.019749,-0.057863,0.037795,0.031926
crafters_other,0.060903,0.109801,0.012044,-0.047276,0.012399
office,-0.021777,0.002407,-0.000286,0.023276,0.03625


### Колонка job

In [60]:
df.job.value_counts(dropna=False)

1    865
0    728
Name: job, dtype: int64

In [61]:
check_corrs_with_target(df, col='job')

corrs_matrix_between:  job


arterial_hypertension   -0.224856
stroke                  -0.077221
IHD                     -0.206260
heart_failure           -0.176134
other_cardio_diseases   -0.058784
Name: job, dtype: float64

### Колонка retired

In [62]:
df.retired.value_counts(dropna=False)

0    1089
1     504
Name: retired, dtype: int64

In [63]:
check_corrs_with_target(df, col='retired')

corrs_matrix_between:  retired


arterial_hypertension    0.255916
stroke                   0.057585
IHD                      0.221894
heart_failure            0.175824
other_cardio_diseases    0.032419
Name: retired, dtype: float64

### Колонка stop_work_due_disease

In [64]:
df.stop_work_due_disease.value_counts(dropna=False)

0    1509
1      84
Name: stop_work_due_disease, dtype: int64

In [65]:
check_corrs_with_target(df, col = 'stop_work_due_disease')

corrs_matrix_between:  stop_work_due_disease


arterial_hypertension    0.044279
stroke                   0.097109
IHD                      0.080011
heart_failure            0.054904
other_cardio_diseases    0.031735
Name: stop_work_due_disease, dtype: float64

### Колонка diabetes

In [66]:
df.diabetes.value_counts(dropna=False)

0    1441
1     152
Name: diabetes, dtype: int64

### Колонка hepatitis

In [67]:
df.hepatitis.value_counts(dropna=False)

0    1410
1     183
Name: hepatitis, dtype: int64

### Колонка oncology

In [68]:
df.oncology.value_counts(dropna=False)

0    1521
1      72
Name: oncology, dtype: int64

### Колонка chronic_lung_disease

In [69]:
df.chronic_lung_disease.value_counts(dropna=False)

0    1469
1     124
Name: chronic_lung_disease, dtype: int64

### Колонка bronchial_asthma

In [70]:
df.bronchial_asthma.value_counts(dropna=False)

0    1530
1      63
Name: bronchial_asthma, dtype: int64

### Колонка tuberculosis

In [71]:
df.tuberculosis.value_counts(dropna=False)

0    1565
1      28
Name: tuberculosis, dtype: int64

### Колонка hiv/aids

In [72]:
df['hiv/aids'].value_counts(dropna=False)

0    1591
1       2
Name: hiv/aids, dtype: int64

In [73]:
# Колонки связанные с коморбидностью
comorbid_cols = ['diabetes', 'hepatitis', 'oncology',
                 'chronic_lung_disease', 'bronchial_asthma', 'hiv/aids']

In [74]:
check_corrs_with_target(df, col=comorbid_cols)

corrs_matrix_between:  ['diabetes', 'hepatitis', 'oncology', 'chronic_lung_disease', 'bronchial_asthma', 'hiv/aids']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
diabetes,0.230891,0.044747,0.141505,0.100181,0.034551
hepatitis,-0.030354,-0.014875,-0.020499,0.004706,-0.004002
oncology,0.090267,0.003835,0.042075,0.044964,0.055175
chronic_lung_disease,0.066575,-0.001782,0.002166,0.049622,0.062571
bronchial_asthma,0.091658,-0.044858,0.046903,0.100989,0.023601
hiv/aids,-0.042882,-0.009703,-0.017117,-0.015315,-0.014411


# ADD FEATURE (comorbid_count & is_comorbid)

In [75]:
# Количество опрошенных с определенными заболеваниями
df[comorbid_cols].apply(lambda x: sum(x))

diabetes                152
hepatitis               183
oncology                 72
chronic_lung_disease    124
bronchial_asthma         63
hiv/aids                  2
dtype: int64

In [76]:
df['comorbid_count'] = df[comorbid_cols].sum(axis=1)
df['is_comorbid'] = np.where(df['comorbid_count']>0, 1, 0)

In [77]:
check_corrs_with_target(df, col = ['comorbid_count','is_comorbid'])

corrs_matrix_between:  ['comorbid_count', 'is_comorbid']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
comorbid_count,0.179487,-0.00046,0.086354,0.117311,0.065174
is_comorbid,0.178101,0.022106,0.095233,0.116283,0.084209


### Колонка intake_medicines

In [78]:
df.intake_medicines.value_counts(dropna=False)

1    995
0    598
Name: intake_medicines, dtype: int64

In [79]:
check_corrs_with_target(df, col='intake_medicines')

corrs_matrix_between:  intake_medicines


arterial_hypertension    0.452289
stroke                   0.114712
IHD                      0.218472
heart_failure            0.176454
other_cardio_diseases    0.119922
Name: intake_medicines, dtype: float64

### Колонка trauma_last_year

In [80]:
df.trauma_last_year.value_counts(dropna=False)

0    1508
1      85
Name: trauma_last_year, dtype: int64

In [81]:
check_corrs_with_target(df, col='trauma_last_year')

corrs_matrix_between:  trauma_last_year


arterial_hypertension    0.011036
stroke                  -0.004355
IHD                     -0.017729
heart_failure           -0.001962
other_cardio_diseases    0.039166
Name: trauma_last_year, dtype: float64

### Колонка fractures

In [82]:
df.fractures.value_counts(dropna=False)

0    1032
1     561
Name: fractures, dtype: int64

In [83]:
check_corrs_with_target(df, col='fractures')

corrs_matrix_between:  fractures


arterial_hypertension    0.097213
stroke                   0.070100
IHD                      0.077409
heart_failure            0.058426
other_cardio_diseases    0.034900
Name: fractures, dtype: float64

# ADD FEATURES (fracture_last_year, trauma_on_retire, fracture_on_retire)

In [84]:
# Признак о переломе кости за последний год (предположительно)
df['fracture_last_year'] = df['trauma_last_year'] * df['fractures']
# Признак о наличии травмы за последний год будучи в пенсионном возрасте
df['trauma_on_retire'] = df['trauma_last_year'] * df['retired']
# Признак о переломе кости за последний год (предположительном) будучи на пенсии
df['fracture_on_retire'] = df['fracture_last_year'] * df['retired']


In [85]:
trauma_features = ['fracture_last_year', 'trauma_on_retire', 'fracture_on_retire']
check_corrs_with_target(df, col=trauma_features)

corrs_matrix_between:  ['fracture_last_year', 'trauma_on_retire', 'fracture_on_retire']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
fracture_last_year,-0.00386,0.013674,-0.004894,-0.009606,0.016512
trauma_on_retire,0.032709,0.010554,0.046307,0.034002,0.040642
fracture_on_retire,0.006801,0.028961,0.055685,0.034031,0.039511


Примечание: генерирование вышеуказанных признаков связано с повышенными рисками развития заболеваний (не только сердечно-сосудистой системы) при переломе шейки бедра в пенсионном возрасте.

### Колонка smoking

In [86]:
df.smoking.value_counts(dropna=False)

Никогда не курил(а)    911
Курит                  360
Бросил(а)              321
Никогда не курил         1
Name: smoking, dtype: int64

In [87]:
#Исправление ошибки
df.smoking.replace('Никогда не курил', 'Никогда не курил(а)', inplace=True)

In [88]:
smoking = pd.get_dummies(df.smoking)
print('before replace: ', list(smoking.columns))

smoking_cols_map = {'Бросил(а)':'previous_smoke', 'Курит':'smoke', 'Никогда не курил(а)':'never_smoke'}

smoking.rename(smoking_cols_map, axis=1, inplace=True)

print('after replace: ', list(smoking.columns))

before replace:  ['Бросил(а)', 'Курит', 'Никогда не курил(а)']
after replace:  ['previous_smoke', 'smoke', 'never_smoke']


In [89]:
df = df.drop('smoking', axis=True)
df = pd.concat([df, smoking], axis=1)

# ADD FEATURE smoker_score

In [90]:
df['smoker_score'] = df['never_smoke']*0 + df['previous_smoke'] + df['smoke'] * 2

In [91]:
smoking_cols = ['previous_smoke', 'never_smoke', 'smoke', 'smoker_score']

In [92]:
check_corrs_with_target(df, col=smoking_cols)

corrs_matrix_between:  ['previous_smoke', 'never_smoke', 'smoke', 'smoker_score']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
previous_smoke,0.014692,-0.054241,0.020759,-0.04527,-0.001829
never_smoke,0.069542,-0.003255,0.015962,0.094327,0.008136
smoke,-0.095603,0.055267,-0.038434,-0.067838,-0.00782
smoker_score,-0.090244,0.030081,-0.029108,-0.090927,-0.008845


### Колонка smoking_duration

In [93]:
df.smoking_duration.value_counts(dropna=False)

NaN     912
20.0    111
18.0     94
16.0     72
17.0     66
15.0     51
19.0     33
25.0     31
30.0     23
14.0     18
12.0     15
21.0     15
22.0     14
7.0      13
24.0     11
40.0     11
23.0     10
13.0      9
10.0      9
35.0      9
6.0       5
27.0      5
45.0      5
26.0      5
8.0       4
28.0      4
29.0      4
33.0      3
36.0      3
41.0      3
50.0      3
31.0      2
39.0      2
47.0      2
52.0      1
44.0      1
38.0      1
32.0      1
56.0      1
42.0      1
55.0      1
58.0      1
11.0      1
51.0      1
34.0      1
37.0      1
53.0      1
5.0       1
54.0      1
43.0      1
Name: smoking_duration, dtype: int64

In [94]:
df['smoking_duration_score'] = df['smoking_duration'] // 5

In [95]:
df['smoking_duration_score'].value_counts(dropna=False)

NaN     912
3.0     316
4.0     161
2.0      52
5.0      49
6.0      30
1.0      23
8.0      17
7.0      16
10.0      7
9.0       7
11.0      3
Name: smoking_duration_score, dtype: int64

In [96]:
df['smoking_duration'].fillna(0, inplace=True)

In [97]:
df['smoking_duration_score'].fillna(0, inplace=True)

In [98]:
df['smoking_duration_score'] = np.where(df['smoking_duration_score']  > 9, 9, df['smoking_duration_score'])

In [99]:
check_corrs_with_target(df, col=['smoking_duration', 'smoking_duration_score'])

corrs_matrix_between:  ['smoking_duration', 'smoking_duration_score']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
smoking_duration,-0.036623,0.003147,-0.016511,-0.06434,0.020556
smoking_duration_score,-0.033445,3.3e-05,-0.017016,-0.059927,0.02171


### Колонка ciggaretes_per_day

In [100]:
df['ciggaretes_per_day_log'] = np.log(df.ciggaretes_per_day + 1)

In [101]:
df['ciggaretes_per_day_score'] = (df.ciggaretes_per_day // 10) + 1

In [102]:
df.ciggaretes_per_day_score = np.where(df.ciggaretes_per_day_score > 5, 5, df.ciggaretes_per_day_score)

In [103]:
ciggaretes_features = ['ciggaretes_per_day', 'ciggaretes_per_day_score',
                       'ciggaretes_per_day_log']

In [104]:
df['ciggaretes_per_day_score'].fillna(0, inplace=True)

In [105]:
for col in ciggaretes_features:
    df[col].fillna(0, inplace=True)

In [106]:
check_corrs_with_target(df, col=ciggaretes_features)

corrs_matrix_between:  ['ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_log']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
ciggaretes_per_day,-0.029327,0.007107,0.015671,-0.069713,-0.041501
ciggaretes_per_day_score,-0.045596,0.011395,0.013145,-0.078645,-0.042002
ciggaretes_per_day_log,-0.057114,0.01071,0.003519,-0.088335,-0.026594


# ADD FEATURE variants of smoking_score_int

In [107]:
df['smoking_score_int_log'] = (
    df['smoker_score'] * df['smoking_duration_score']) ** df['ciggaretes_per_day_log']

df['smoking_score_int_score'] = (
    df['smoker_score'] * df['smoking_duration_score']) ** df['ciggaretes_per_day_score']

df['smoking_score_int_score_2'] = (
    df['smoker_score'] * df['ciggaretes_per_day_score']) * df['smoking_duration_score']

df['smoking_score_int_score_3'] = (
    df['smoker_score'] * (df['ciggaretes_per_day_score'] ** df['smoking_duration_score']))

df['smoking_score_int_score_log'] = df['smoker_score'] ** df['ciggaretes_per_day_log']

In [108]:
smoke_features = ['smoker_score',
                  'ciggaretes_per_day_log', 'smoking_score_int_score',
                  'smoking_score_int_log', 'smoking_score_int_score_log', 
                  'smoking_score_int_score_2', 'smoking_score_int_score_3']

In [109]:
check_corrs_with_target(df, col=smoke_features)

corrs_matrix_between:  ['smoker_score', 'ciggaretes_per_day_log', 'smoking_score_int_score', 'smoking_score_int_log', 'smoking_score_int_score_log', 'smoking_score_int_score_2', 'smoking_score_int_score_3']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
smoker_score,-0.090244,0.030081,-0.029108,-0.090927,-0.008845
ciggaretes_per_day_log,-0.057114,0.01071,0.003519,-0.088335,-0.026594
smoking_score_int_score,0.043129,-0.010952,0.056822,-0.024884,-0.02294
smoking_score_int_log,0.011216,0.006253,-0.000963,-0.03812,-0.022421
smoking_score_int_score_log,-0.057367,0.037104,-0.029631,-0.07899,-0.040684
smoking_score_int_score_2,-0.04403,0.019476,-0.00796,-0.071419,-0.03118
smoking_score_int_score_3,0.034131,-0.006945,0.086329,-0.011101,-0.01044


In [110]:
# Проверка на наличие пропущенных значений
df[smoke_features].isna().sum()

smoker_score                   0
ciggaretes_per_day_log         0
smoking_score_int_score        0
smoking_score_int_log          0
smoking_score_int_score_log    0
smoking_score_int_score_2      0
smoking_score_int_score_3      0
dtype: int64

### Колонка passive_smoking

In [111]:
df.passive_smoking.value_counts(dropna=False)

0    1215
1     378
Name: passive_smoking, dtype: int64

In [112]:
check_corrs_with_target(df, col='passive_smoking')

corrs_matrix_between:  passive_smoking


arterial_hypertension    0.011831
stroke                  -0.044094
IHD                      0.034347
heart_failure            0.028622
other_cardio_diseases   -0.001483
Name: passive_smoking, dtype: float64

### Колонка passive_smoking_frequency

In [113]:
df.passive_smoking_frequency.value_counts(dropna=False)

NaN                       1216
1-2 раза в неделю          158
4 и более раз в день        75
2-3 раза в день             59
не менее 1 раза в день      47
3-6 раз в неделю            38
Name: passive_smoking_frequency, dtype: int64

In [114]:
df.passive_smoking_frequency.fillna(0, inplace=True)

In [115]:
# Словарь для замены текста на порядок:
passive_smoking_dict = {'1-2 раза в неделю': 1, 
                        '3-6 раз в неделю': 2,
                        'не менее 1 раза в день': 3,
                        '2-3 раза в день': 4,
                        '4 и более раз в день': 5}

In [116]:
df.passive_smoking_frequency.replace(passive_smoking_dict, inplace=True)

In [117]:
check_corrs_with_target(df, col = 'passive_smoking_frequency')

corrs_matrix_between:  passive_smoking_frequency


arterial_hypertension    0.003508
stroke                  -0.046057
IHD                      0.038805
heart_failure            0.052193
other_cardio_diseases    0.017315
Name: passive_smoking_frequency, dtype: float64

### Колонка alcohol

In [118]:
df.alcohol.value_counts(dropna=False)

употребляю в настоящее время    1046
никогда не употреблял            300
ранее употреблял                 247
Name: alcohol, dtype: int64

In [119]:
alcohol = pd.get_dummies(df.alcohol)
print('Before replace: ', list(alcohol.columns))

alcohol_col_map = {'никогда не употреблял':'never_drink_alco',
                   'ранее употреблял':'previous_drink_alco',
                   'употребляю в настоящее время':'drink_alco_now'}

alcohol.rename(alcohol_col_map, axis=1, inplace=True)
print('After replace: ', list(alcohol.columns))

Before replace:  ['никогда не употреблял', 'ранее употреблял', 'употребляю в настоящее время']
After replace:  ['never_drink_alco', 'previous_drink_alco', 'drink_alco_now']


In [120]:
df = pd.concat([df, alcohol], axis=1)
df.drop('alcohol', axis=1, inplace=True)

In [121]:
alcohol_col = alcohol.columns
check_corrs_with_target(df, col = alcohol_col)

corrs_matrix_between:  Index(['never_drink_alco', 'previous_drink_alco', 'drink_alco_now'], dtype='object')


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
never_drink_alco,0.049645,-0.042137,0.015082,0.059077,0.049747
previous_drink_alco,0.063277,0.060122,0.094059,0.097195,0.019313
drink_alco_now,-0.089252,-0.013329,-0.08568,-0.123345,-0.054927


In [122]:
df['alcohol_score'] = df['never_drink_alco']*0 + df['previous_drink_alco'] + df['drink_alco_now']*2

In [123]:
check_corrs_with_target(df, col = 'alcohol_score')

corrs_matrix_between:  alcohol_score


arterial_hypertension   -0.078958
stroke                   0.012484
IHD                     -0.059839
heart_failure           -0.104449
other_cardio_diseases   -0.057996
Name: alcohol_score, dtype: float64

### Колонка alcohol_duration

In [124]:
df.alcohol_duration.value_counts(dropna=False)

NaN     304
20.0    299
18.0    278
17.0    131
16.0    111
25.0     88
19.0     66
21.0     57
15.0     50
22.0     42
30.0     28
23.0     25
14.0     18
35.0     15
24.0     13
26.0      8
40.0      8
12.0      5
45.0      5
29.0      5
33.0      4
27.0      4
37.0      4
13.0      4
10.0      3
50.0      3
32.0      3
6.0       2
28.0      2
46.0      1
36.0      1
9.0       1
48.0      1
63.0      1
43.0      1
44.0      1
60.0      1
Name: alcohol_duration, dtype: int64

In [125]:
df['alcohol_duration_score'] = df['alcohol_duration'] // 5

In [126]:
df['alcohol_duration_log'] = np.log(df['alcohol_duration'] + 1)

In [127]:
df['alcohol_duration_score'] = np.where(df.alcohol_duration_score>  9, 9, df.alcohol_duration_score)

In [128]:
df.alcohol_duration.fillna(0, inplace=True)
df.alcohol_duration_score.fillna(0, inplace=True)
df.alcohol_duration_log.fillna(0, inplace=True)

In [129]:
df['alcohol_int_score_1'] = df['alcohol_score'] ** df['alcohol_duration_score']

df['alcohol_int_score_2'] = (df['alcohol_score'] * df['alcohol_duration_score'])

df['alcohol_int_score_3'] = df['alcohol_duration_log'] ** df['alcohol_score']


In [130]:
alco_features = ['alcohol_duration', 'alcohol_duration_score', 'alcohol_duration_log',
                 'alcohol_int_score_1', 'alcohol_int_score_2', 'alcohol_int_score_3']

In [131]:
check_corrs_with_target(df, col = alco_features)

corrs_matrix_between:  ['alcohol_duration', 'alcohol_duration_score', 'alcohol_duration_log', 'alcohol_int_score_1', 'alcohol_int_score_2', 'alcohol_int_score_3']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
alcohol_duration,0.006058,0.045679,0.005183,-0.013339,-0.033091
alcohol_duration_score,0.018031,0.047084,0.008138,-0.004915,-0.035852
alcohol_duration_log,-0.033025,0.046026,-0.007616,-0.043556,-0.045852
alcohol_int_score_1,0.019045,0.011645,-0.015414,-0.012578,-0.015989
alcohol_int_score_2,-0.017408,0.032432,-0.036184,-0.051155,-0.039593
alcohol_int_score_3,-0.062109,0.013989,-0.064165,-0.091707,-0.044619


### Колонка time_sleep_onset

In [132]:
df.time_sleep_onset = df.time_sleep_onset.apply(lambda x: dt.strptime(x, '%H:%M:%S'))

### ADD FEATURES about sleep onset

Примечание: время засыпания связано с развитием сердечно-сосудистых заболеваний о чем есть ряд научных публикаций, одна из них представлена ниже

Статья о влиянии времени засыпания на развитие сердечно-сосудистых заболеваний:
- https://doi.org/10.1093/ehjdh/ztab088

In [133]:
df['sleep_onset_early_22'] = np.where(df.time_sleep_onset.dt.time.between(time(20,0),time(21,59)), 1, 0)
df['sleep_onset_22'] = np.where(df.time_sleep_onset.dt.time.between(time(22,0),time(22,59)), 1, 0)
df['sleep_onset_23'] = np.where(df.time_sleep_onset.dt.time.between(time(23,0),time(23,59)), 1, 0)
df['sleep_onset_later_after_0'] = np.where(df.time_sleep_onset.dt.time.between(time(0,0),time(4,0)), 1, 0)

#additional:
df['early_onset'] = df['sleep_onset_early_22'] + df['sleep_onset_22']
df['lately_onset'] = df['sleep_onset_23'] + df['sleep_onset_later_after_0']

In [134]:
sleep_onset_cols = ['sleep_onset_early_22', 'sleep_onset_22',
                    'sleep_onset_23', 'sleep_onset_later_after_0', 'early_onset', 'lately_onset']

In [135]:
check_corrs_with_target(df, col=sleep_onset_cols)

corrs_matrix_between:  ['sleep_onset_early_22', 'sleep_onset_22', 'sleep_onset_23', 'sleep_onset_later_after_0', 'early_onset', 'lately_onset']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
sleep_onset_early_22,0.038944,0.038912,0.00707,0.010225,0.019608
sleep_onset_22,0.027138,0.064104,0.01271,-0.023735,0.012066
sleep_onset_23,-0.013236,-0.048355,0.004281,-0.002796,-0.034675
sleep_onset_later_after_0,-0.031067,-0.024444,-0.017935,0.019204,0.016151
early_onset,0.045152,0.079354,0.015401,-0.016712,0.021255
lately_onset,-0.047608,-0.078761,-0.014505,0.017477,-0.02049


### Колонка time_sleep_upset

In [136]:
df.time_sleep_upset = df.time_sleep_upset.apply(lambda x: dt.strptime(x, '%H:%M:%S')) 

In [137]:
df['sleep_upset_early_6'] = np.where(df.time_sleep_upset.dt.time.between(time(4,0),time(5,59)), 1, 0)
df['sleep_upset_6'] = np.where(df.time_sleep_upset.dt.time.between(time(6,0),time(6,59)), 1, 0)
df['sleep_upset_7'] = np.where(df.time_sleep_upset.dt.time.between(time(7,0),time(7,59)), 1, 0)
df['sleep_upset_8'] = np.where(df.time_sleep_upset.dt.time.between(time(8,0),time(8,59)), 1, 0)
df['sleep_upset_after_9'] = np.where(df.time_sleep_upset.dt.time.between(time(9,0),time(11,59)), 1, 0)

# additional:
# Ранний подъем:
df['early_upset'] = df['sleep_upset_6'] + df['sleep_upset_7']

# Поздний подъем:
df['lately_upset'] = df['sleep_upset_8'] + df['sleep_upset_after_9']

In [138]:
upset_features = ['sleep_upset_early_6', 'sleep_upset_6', 'sleep_upset_7',
                  'sleep_upset_8', 'sleep_upset_after_9', 'early_upset', 'lately_upset']

In [139]:
df.sleep_upset_early_6.value_counts()

0    1398
1     195
Name: sleep_upset_early_6, dtype: int64

In [140]:
check_corrs_with_target(df, col=upset_features)

corrs_matrix_between:  ['sleep_upset_early_6', 'sleep_upset_6', 'sleep_upset_7', 'sleep_upset_8', 'sleep_upset_after_9', 'early_upset', 'lately_upset']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
sleep_upset_early_6,0.059959,-0.029609,-0.008345,0.039253,0.031984
sleep_upset_6,-0.035852,0.004285,0.00789,-0.013799,-0.012014
sleep_upset_7,-0.036765,0.031326,-0.008729,0.035565,-0.038354
sleep_upset_8,0.028764,-0.007423,0.03822,0.006858,0.013119
sleep_upset_after_9,0.009271,-0.023065,-0.025812,-0.070504,0.021491
early_upset,-0.068202,0.032787,-0.000377,0.019227,-0.046669
lately_upset,0.030296,-0.022269,0.013601,-0.043607,0.025885


### ADD FEATURE sleeptime

In [141]:
df['sleeptime'] = (df.time_sleep_upset - df.time_sleep_onset).apply(lambda x: x.seconds//3600)

In [142]:
# Исправление дефекта при расчете времени сна из-за особенностей datetime
df['sleeptime'] = np.where(df['sleeptime'] > 14, df['sleeptime'] - 12, df['sleeptime'])

In [143]:
df['sleeptime'].value_counts()

7     457
8     453
6     230
9     214
10     93
5      67
11     32
4      22
12     11
3       8
0       4
13      2
Name: sleeptime, dtype: int64

### ADD FEATURE low_amout_sleep, high_amout_sleep

In [144]:
df['low_amount_sleep'] = np.where(df['sleeptime'] < 6, 1 ,0)
df['high_amout_sleep'] = np.where(df['sleeptime'] > 9, 1, 0)

In [145]:
sleeptime_features = ['sleeptime', 'low_amount_sleep', 'high_amout_sleep']

In [146]:
check_corrs_with_target(df, col=sleeptime_features)

corrs_matrix_between:  ['sleeptime', 'low_amount_sleep', 'high_amout_sleep']


Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
sleeptime,0.035149,0.037473,-0.006052,-0.061437,-0.010208
low_amount_sleep,-0.013547,0.010021,-0.016293,0.058856,0.056018
high_amout_sleep,0.02435,0.024505,-0.015854,-0.031118,0.042978


In [147]:
df.drop(['time_sleep_onset','time_sleep_upset'], axis=1, inplace=True)

### Колонка midday_sleep

In [148]:
df.midday_sleep.value_counts(dropna=False)

0    1232
1     361
Name: midday_sleep, dtype: int64

In [149]:
check_corrs_with_target(df, col='midday_sleep')

corrs_matrix_between:  midday_sleep


arterial_hypertension    0.075877
stroke                   0.070712
IHD                      0.049903
heart_failure            0.002388
other_cardio_diseases   -0.056408
Name: midday_sleep, dtype: float64

### Колонка sport

In [150]:
df.sport.value_counts(dropna=False)

0    1491
1     102
Name: sport, dtype: int64

In [151]:
check_corrs_with_target(df, col='sport')

corrs_matrix_between:  sport


arterial_hypertension   -0.011300
stroke                   0.024806
IHD                      0.000465
heart_failure            0.020270
other_cardio_diseases    0.031178
Name: sport, dtype: float64

### Колонка religion_clubs

In [152]:
df.religion_clubs.value_counts(dropna=False)

0    1557
1      36
Name: religion_clubs, dtype: int64

In [153]:
check_corrs_with_target(df, col = 'religion_clubs')

corrs_matrix_between:  religion_clubs


arterial_hypertension   -0.031819
stroke                  -0.032523
IHD                     -0.014801
heart_failure            0.018303
other_cardio_diseases    0.049226
Name: religion_clubs, dtype: float64

## Подготовка данных

In [154]:
# Проверка на наличие пропусков в итоговом наборе данных
df.isna().sum().sum()

0

In [155]:
df

Unnamed: 0,sex,education,job,retired,stop_work_due_disease,diabetes,hepatitis,oncology,chronic_lung_disease,bronchial_asthma,tuberculosis,hiv/aids,intake_medicines,trauma_last_year,fractures,smoking_duration,ciggaretes_per_day,passive_smoking,passive_smoking_frequency,alcohol_duration,midday_sleep,sport,religion_clubs,train,unknown_id_feature,id_1,id_2,id_3,id_4,married,divorced,widowed,cohabited,single,separated,asian,european,is_russian,christian,household,military,senior_specialist,farmer/fisherman,low_qualified,hard_industry,top_management,service,crafters_other,office,junior_specialists,comorbid_count,is_comorbid,fracture_last_year,trauma_on_retire,fracture_on_retire,previous_smoke,smoke,never_smoke,smoker_score,smoking_duration_score,ciggaretes_per_day_log,ciggaretes_per_day_score,smoking_score_int_log,smoking_score_int_score,smoking_score_int_score_2,smoking_score_int_score_3,smoking_score_int_score_log,never_drink_alco,previous_drink_alco,drink_alco_now,alcohol_score,alcohol_duration_score,alcohol_duration_log,alcohol_int_score_1,alcohol_int_score_2,alcohol_int_score_3,sleep_onset_early_22,sleep_onset_22,sleep_onset_23,sleep_onset_later_after_0,early_onset,lately_onset,sleep_upset_early_6,sleep_upset_6,sleep_upset_7,sleep_upset_8,sleep_upset_after_9,early_upset,lately_upset,sleeptime,low_amount_sleep,high_amout_sleep
0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,15.0,20.0,0,0,18.0,0,0,0,1,2,0,1,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,3.0,3.044522,3.0,233.937014,216.0,18.0,54.0,8.250734,0,0,1,2,3.0,2.944439,8.0,6.0,8.669721,0,1,0,0,1,0,0,1,0,0,0,1,0,8,0,0
1,0,4,0,0,0,1,0,0,0,0,0,0,1,0,1,0.0,0.0,0,0,0.0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1,0,0,0,0.0,0.0,1.0,0.0,1.0,0,0,0,1,0,1,1,0,0,0,0,0,0,4,1,0
2,0,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,1,1,17.0,0,0,0,1,3,0,0,1,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0,0,1,2,3.0,2.890372,8.0,6.0,8.354249,0,0,1,0,0,1,0,0,1,0,0,1,0,8,0,0
3,1,2,1,0,0,0,0,1,0,0,0,0,1,0,0,12.0,10.0,1,2,13.0,0,0,0,1,2,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,2.0,2.397895,2.0,5.270337,4.0,4.0,4.0,1.0,0,0,1,2,2.0,2.639057,4.0,4.0,6.964624,0,0,1,0,0,1,0,0,1,0,0,1,0,8,0,0
4,0,2,0,0,1,1,1,0,0,0,0,0,1,0,1,0.0,0.0,1,3,16.0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,2,1,0,0,0,0,0,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0,0,1,2,3.0,2.833213,8.0,6.0,8.027098,0,0,1,0,0,1,0,1,0,0,0,1,0,7,0,0
5,0,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0,0,35.0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0,0,1,2,7.0,3.583519,128.0,14.0,12.841608,0,0,1,0,0,1,0,1,0,0,0,1,0,7,0,0
6,0,1,1,0,0,0,1,0,0,0,0,0,1,0,1,0.0,0.0,0,0,0.0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1,0,0,0,0.0,0.0,1.0,0.0,1.0,0,0,1,0,0,1,1,0,0,0,0,0,0,6,0,0
7,1,3,0,1,1,0,0,0,0,0,0,0,1,0,0,18.0,1.0,1,1,12.0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,3.0,0.693147,1.0,2.141486,3.0,3.0,1.0,1.0,0,0,1,2,2.0,2.564949,4.0,4.0,6.578965,0,0,1,0,0,1,0,1,0,0,0,1,0,7,0,0
8,0,3,1,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0,0,17.0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0,0,1,2,3.0,2.890372,8.0,6.0,8.354249,0,0,1,0,0,1,0,0,1,0,0,1,0,8,0,0
9,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,18.0,0,1,0,1,1,1,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0,0,1,2,3.0,2.944439,8.0,6.0,8.669721,0,0,1,0,0,1,0,0,0,1,0,0,1,9,0,0


### ADD FEATURE cardio_score

Примечание: существует довольно много разных шкал риска, но не все они легко применимы на данном датасете. В данном случае применен измененный Jakarta Cardiovascular Score (посчитан из того, что было в датасете)

https://www.researchgate.net/publication/269624976/figure/tbl4/AS:669948162629639@1536739497178/The-Framingham-Score-and-the-Jakarta-Cardiovascular-Score.png

In [156]:
df['cardio_score'] = df['sex'] + df['smoke'] * 2 + df['previous_smoke'] * 1 + \
        df['diabetes'] * 2 - df['sport'] * 3 + \
        df['retired'] * 3 + df['passive_smoking']

In [157]:
check_corrs_with_target(df, col = 'cardio_score')

corrs_matrix_between:  cardio_score


arterial_hypertension    0.200888
stroke                   0.067760
IHD                      0.203711
heart_failure            0.100453
other_cardio_diseases   -0.002081
Name: cardio_score, dtype: float64

In [158]:
df.corr()

Unnamed: 0,sex,education,job,retired,stop_work_due_disease,diabetes,hepatitis,oncology,chronic_lung_disease,bronchial_asthma,tuberculosis,hiv/aids,intake_medicines,trauma_last_year,fractures,smoking_duration,ciggaretes_per_day,passive_smoking,passive_smoking_frequency,alcohol_duration,midday_sleep,sport,religion_clubs,train,unknown_id_feature,id_1,id_2,id_3,id_4,married,divorced,widowed,cohabited,single,separated,asian,european,is_russian,christian,household,military,senior_specialist,farmer/fisherman,low_qualified,hard_industry,top_management,service,crafters_other,office,junior_specialists,comorbid_count,is_comorbid,fracture_last_year,trauma_on_retire,fracture_on_retire,previous_smoke,smoke,never_smoke,smoker_score,smoking_duration_score,ciggaretes_per_day_log,ciggaretes_per_day_score,smoking_score_int_log,smoking_score_int_score,smoking_score_int_score_2,smoking_score_int_score_3,smoking_score_int_score_log,never_drink_alco,previous_drink_alco,drink_alco_now,alcohol_score,alcohol_duration_score,alcohol_duration_log,alcohol_int_score_1,alcohol_int_score_2,alcohol_int_score_3,sleep_onset_early_22,sleep_onset_22,sleep_onset_23,sleep_onset_later_after_0,early_onset,lately_onset,sleep_upset_early_6,sleep_upset_6,sleep_upset_7,sleep_upset_8,sleep_upset_after_9,early_upset,lately_upset,sleeptime,low_amount_sleep,high_amout_sleep,cardio_score
sex,1.0,-0.091298,0.132431,-0.130662,0.030411,-0.043548,-0.02826,-0.055943,-0.010112,-0.068898,0.038126,0.015642,-0.16065,0.003998,0.131355,0.254779,0.479741,-0.008747,-0.046456,0.078465,0.056894,-0.013533,-0.034481,0.006269,0.53344,-0.566081,0.572391,-0.007482,0.015642,0.288875,-0.12784,-0.232984,0.007721,-0.076521,-0.028313,0.099624,-0.074647,-0.077662,-0.19382,-0.072714,0.024101,-0.128253,0.028105,0.042986,0.221272,0.079671,-0.094775,0.205339,-0.073882,-0.0957,-0.078749,-0.057228,0.010227,-0.053695,-0.051894,0.274652,0.235106,-0.421452,0.372516,0.241871,0.500839,0.524611,0.084114,0.059318,0.340758,-0.016807,0.306573,-0.184105,0.092314,0.081219,0.139758,0.065578,0.151828,-0.043811,0.04276,0.062221,-0.039064,-0.015037,0.074347,-0.047275,-0.033976,0.030527,0.049634,0.017062,0.004007,-0.067864,-0.013824,0.020284,-0.065039,-0.05137,-0.000654,-0.034875,0.295923
education,-0.091298,1.0,0.124734,-0.103879,-0.082145,-0.082017,0.006857,-0.009262,-0.043254,-0.048303,-0.028894,-0.056855,-0.042997,0.0146,-0.064441,-0.084878,-0.129003,-0.036542,-0.043267,-0.010456,-0.114852,0.135635,-0.009404,-0.054529,-0.070363,0.070862,-0.068295,-0.015347,-0.007659,0.032081,0.049075,-0.099546,-0.022615,0.048786,-0.009383,-0.055319,0.080116,0.064933,0.032393,-0.058901,0.041823,0.547824,-0.041293,-0.3152,-0.169593,0.149983,-0.090821,-0.13221,-0.012554,-0.051671,-0.075278,-0.056373,0.022207,-0.021616,0.004447,-0.036858,-0.100062,0.114478,-0.119567,-0.078674,-0.1418,-0.144747,-0.029819,-0.045368,-0.110651,-0.005745,-0.114525,-0.039443,-0.068472,0.084671,0.070326,-0.014515,0.027039,-0.072664,0.014689,0.05257,-0.018415,-0.054707,-0.000615,0.058446,-0.0602,0.060982,-0.104595,0.015477,0.034404,0.00179,0.030757,0.046792,0.023224,0.008942,-0.027608,-0.001499,-0.235916
job,0.132431,0.124734,1.0,-0.741556,-0.25718,-0.130979,0.010397,-0.127971,-0.043893,-0.078942,-0.040315,-0.038648,-0.229758,-0.000869,-0.012198,0.051843,0.069567,0.067377,0.053416,0.020377,-0.159606,0.028898,0.003832,-0.047746,0.054003,-0.068323,0.079818,-0.059614,-0.003061,0.026138,0.052588,-0.151971,0.087549,0.006986,-0.018282,0.030703,-0.039568,-0.013442,-0.003658,-0.301664,-0.048539,0.05482,-0.026108,0.082588,0.071835,0.090659,0.043974,-0.077498,0.021643,0.013263,-0.145035,-0.121559,0.026444,-0.143129,-0.109796,-0.013519,0.110037,-0.082064,0.105163,0.050636,0.082951,0.080467,-0.013571,-0.011838,0.072851,-0.02756,0.091025,-0.057695,-0.108349,0.130097,0.106618,0.016508,0.048841,-0.034929,0.053754,0.094363,-0.012768,0.06298,0.018018,-0.068555,0.051893,-0.052827,0.100406,0.13971,0.012259,-0.187289,-0.115798,0.147133,-0.234443,-0.133863,0.03701,-0.102743,-0.503324
retired,-0.130662,-0.103879,-0.741556,1.0,-0.088034,0.164985,-0.046133,0.124887,0.029061,0.055876,0.001451,-0.02412,0.226344,0.000645,0.026871,-0.084222,-0.126424,-0.04313,-0.031741,0.000652,0.105708,0.015046,-0.003541,0.04918,-0.098962,0.102642,-0.103249,0.008954,-0.02412,-0.058835,-0.010542,0.187402,-0.102983,-0.040037,0.032717,-0.061977,0.063108,0.048496,0.009188,0.199678,0.038765,-0.028006,0.011842,-0.074361,-0.047158,-0.08429,-0.055865,0.087422,-0.013863,0.012902,0.124568,0.114032,-0.01906,0.193012,0.148062,-0.038898,-0.125541,0.137671,-0.146435,-0.07919,-0.142216,-0.13814,-0.045889,-0.024841,-0.109351,-0.017865,-0.12526,0.058984,0.029286,-0.07089,-0.071712,0.009297,-0.039628,0.067761,-0.001411,-0.034871,0.022579,-0.035714,-0.024269,0.047987,-0.021564,0.024373,-0.085227,-0.108215,0.009654,0.16151,0.052065,-0.096295,0.16835,0.110526,-0.044063,0.068807,0.674486
stop_work_due_disease,0.030411,-0.082145,-0.25718,-0.088034,1.0,0.028537,0.055936,0.056833,0.046771,0.053006,0.01119,0.070955,0.119102,-0.006025,0.014219,-0.022864,0.044506,-0.006155,-0.01652,-0.02144,0.046725,-0.050237,0.020821,-0.02498,0.023987,-0.02988,0.033892,-0.016762,-0.008365,-0.030867,-0.005066,0.023538,0.012332,0.025403,-0.010248,-0.02918,-0.004026,0.010829,-0.029991,0.030328,-0.017784,-0.033834,-0.029791,0.017223,0.002588,-0.029233,-0.006793,0.039965,-0.004808,0.006128,0.101872,0.069015,-0.016525,-0.03098,-0.023765,0.028523,-0.026749,-0.000513,-0.013274,-0.025474,0.019509,0.031766,0.053556,0.005409,0.007094,-0.00392,0.002955,-0.005885,0.12397,-0.089653,-0.050911,-0.024635,-0.005978,-0.044038,-0.066832,-0.084107,0.015984,-0.01126,0.000261,-0.003127,-0.002255,-0.003016,-0.01956,-0.031588,-0.021692,0.061313,0.03787,-0.050725,0.076722,0.042092,-0.003755,0.027192,-0.036188
diabetes,-0.043548,-0.082017,-0.130979,0.164985,0.028537,1.0,-0.016493,0.083622,-0.030559,0.032766,0.005338,-0.011515,0.220896,-0.029572,0.002106,-0.052646,-0.029773,0.009705,0.020312,0.025833,0.104907,0.011062,-0.006254,0.043067,-0.025184,0.020789,-0.015963,-0.023074,-0.011515,0.00692,-0.046196,0.079987,-0.041581,-0.033346,0.035176,-0.022628,0.018394,0.017077,0.000829,0.071909,0.004027,-0.037281,0.010565,0.003258,-0.017398,-0.049714,0.009185,0.013082,0.00338,-0.002705,0.487973,0.489444,-0.018438,-0.042646,-0.032714,-0.024658,-0.068206,0.077654,-0.081274,-0.050302,-0.065889,-0.050485,0.034123,-0.005599,-0.035421,-0.006722,-0.041656,0.012978,0.037969,-0.039629,-0.030205,0.032869,0.000766,0.024934,0.014843,-0.010769,0.060545,-0.019416,-0.047634,0.028782,0.013019,-0.021148,-0.023511,0.021519,-0.026005,0.004642,0.02066,-0.003076,0.018401,0.016396,0.003181,0.036707,0.382336
hepatitis,-0.02826,0.006857,0.010397,-0.046133,0.055936,-0.016493,1.0,0.02586,0.03494,0.007704,0.086642,0.098416,0.027226,0.002062,0.059984,0.055496,-0.000302,-0.006588,-0.01198,0.015836,0.007191,0.002272,-0.028288,0.021261,-0.012998,0.012,-0.011408,0.002255,-0.012773,-0.026561,0.029108,-0.055895,0.059035,0.028954,0.029758,-0.028396,0.039581,0.00574,-0.007525,-0.044684,0.025375,-0.027126,-0.013811,-0.010443,0.023867,0.028916,0.026162,-0.014311,0.032384,0.006377,0.536265,0.542912,0.002316,-0.016802,-0.016545,0.039872,0.017153,-0.04683,0.036837,0.051738,0.026067,0.02018,0.018538,-0.012914,0.026381,-0.009429,-0.003934,-0.032544,0.036034,-0.000671,0.015683,0.01893,0.026788,-0.025535,0.007171,-0.001383,-0.027636,-0.031568,0.054272,-0.013089,-0.043457,0.044878,-0.002409,-0.023693,0.003859,0.023585,0.00449,-0.019473,0.022381,0.003467,-0.021026,-0.019968,-0.032144
oncology,-0.055943,-0.009262,-0.127971,0.124887,0.056833,0.083622,0.02586,1.0,-0.018097,0.01787,-0.006106,-0.007714,0.087544,-0.011318,-0.065517,-0.054048,-0.031214,-0.014808,-0.039059,-0.004258,0.026589,-0.032219,0.048246,-0.00101,-0.032645,0.030708,-0.027665,-0.015457,-0.007714,-0.05127,0.008329,0.065907,-0.009199,-0.006766,0.060248,-0.002102,-0.009686,0.028398,-0.055804,0.054497,-0.0164,-0.008957,-0.003159,-0.0275,-0.030962,-0.008042,-0.002396,-0.028378,0.027381,0.039195,0.383882,0.32788,-0.026668,-0.028569,-0.021915,0.011236,-0.074212,0.053628,-0.069893,-0.054283,-0.049714,-0.043505,-0.035335,-0.015926,-0.06696,-0.005801,-0.073244,0.003406,0.040375,-0.033581,-0.021843,-0.000697,-0.004482,0.025854,-0.011541,-0.023331,-0.013455,-0.029074,0.034919,-0.002884,-0.033874,0.034716,-0.03516,-0.027303,0.012709,0.046767,0.01384,-0.014817,0.047873,0.046329,-0.019406,-0.002549,0.082883
chronic_lung_disease,-0.010112,-0.043254,-0.043893,0.029061,0.046771,-0.030559,0.03494,-0.018097,1.0,0.253617,0.139441,0.055866,0.080075,0.056125,0.045774,0.032273,0.043,-0.046394,-0.042675,-0.016742,0.038615,0.010148,0.018883,-0.011178,-0.037889,0.034937,-0.030805,-0.020641,-0.010301,-0.00922,0.009555,0.018487,-0.008945,-0.010856,-0.01262,0.041004,-0.054365,-0.017596,-0.00625,-0.019929,-0.0219,-0.03696,0.019869,-0.003459,-0.034163,-0.054942,0.02129,0.003233,0.016007,0.100153,0.510529,0.437838,0.042275,0.034457,0.01773,-0.03497,0.078305,-0.037844,0.062491,0.030875,0.046645,0.05227,0.045551,0.042798,0.078847,-0.007521,0.100975,0.021861,0.037373,-0.046488,-0.038713,-0.026162,-0.019701,0.001993,-0.039081,-0.039977,0.000736,0.002505,-0.039436,0.03408,0.002702,-0.006696,0.034464,0.005946,-0.019003,-0.007538,-0.008217,-0.011755,-0.011953,-0.052277,0.078248,0.002149,0.022221
bronchial_asthma,-0.068898,-0.048303,-0.078942,0.055876,0.053006,0.032766,0.007704,0.01787,0.253617,1.0,0.02188,-0.007195,0.104098,0.009149,0.01223,-0.061893,-0.072214,-0.007186,1.3e-05,0.01383,0.005564,-0.000446,-0.030855,0.021243,-0.020232,0.017772,-0.014812,-0.014416,-0.007195,-0.026548,0.010193,0.020244,-0.001201,0.014339,-0.008814,0.027786,-0.014606,-0.031899,0.01073,0.078481,-0.015296,-0.030985,0.026208,-0.026805,-0.012585,-0.020731,0.018205,-0.04306,0.015646,0.028023,0.447307,0.3058,0.01137,0.023261,0.011862,-0.029669,-0.055737,0.071176,-0.071052,-0.060327,-0.079244,-0.078296,-0.025422,-0.014939,-0.064167,-0.005391,-0.055931,-0.007121,0.002061,0.004292,0.006097,0.013955,0.009962,0.026804,0.0155,0.010529,0.034783,-0.03099,0.000225,0.010913,-0.010925,0.011754,0.002832,-0.042499,0.010058,0.016273,0.024706,-0.032035,0.030734,0.018954,0.02651,0.052013,0.002335


In [159]:
train = df[df['train'] == 1].drop('train', axis = 1)
X_test = df[df['train'] == 0].drop('train', axis = 1)

In [160]:
#scaler = StandardScaler()

#for col in train.columns:
#        train[col] = scaler.fit_transform(train[col].values.reshape(-1, 1))
#        X_test[col] = scaler.transform(X_test[col].values.reshape(-1,1))

# В ходе экспериментов произошел сознательный отказ от использования StandardScaler (и других скейлеров) в пользу интерпретируемости модели

In [161]:
corr_matrix = check_corrs_with_target(df, col = df.columns)
corr_matrix.abs().sum()

corrs_matrix_between:  Index(['sex', 'education', 'job', 'retired', 'stop_work_due_disease',
       'diabetes', 'hepatitis', 'oncology', 'chronic_lung_disease',
       'bronchial_asthma', 'tuberculosis', 'hiv/aids', 'intake_medicines',
       'trauma_last_year', 'fractures', 'smoking_duration',
       'ciggaretes_per_day', 'passive_smoking', 'passive_smoking_frequency',
       'alcohol_duration', 'midday_sleep', 'sport', 'religion_clubs', 'train',
       'unknown_id_feature', 'id_1', 'id_2', 'id_3', 'id_4', 'married',
       'divorced', 'widowed', 'cohabited', 'single', 'separated', 'asian',
       'european', 'is_russian', 'christian', 'household', 'military',
       'senior_specialist', 'farmer/fisherman', 'low_qualified',
       'hard_industry', 'top_management', 'service', 'crafters_other',
       'office', 'junior_specialists', 'comorbid_count', 'is_comorbid',
       'fracture_last_year', 'trauma_on_retire', 'fracture_on_retire',
       'previous_smoke', 'smoke', 'never_smoke', 's

arterial_hypertension    5.571909
stroke                   3.106042
IHD                      3.502097
heart_failure            4.696558
other_cardio_diseases    3.027031
dtype: float64

### Так как данные имеют большую суммарную связь с артериальной гипертнезией, то сосредоточимся на предсказании этого показателя. 
#### В дальнейшем обучив модель и "подставив" предсказания в тестовый датасет продолжим предсказывать другие

In [162]:
target_train.corr()

Unnamed: 0,arterial_hypertension,stroke,IHD,heart_failure,other_cardio_diseases
arterial_hypertension,1.0,0.12272,0.296755,0.266396,0.086785
stroke,0.12272,1.0,0.015392,-0.002087,-0.030536
IHD,0.296755,0.015392,1.0,0.522954,0.049801
heart_failure,0.266396,-0.002087,0.522954,1.0,0.138133
other_cardio_diseases,0.086785,-0.030536,0.049801,0.138133,1.0


#### Исходя из корреляций между АГ и другими болезнями у метода хорошие перспективы

# Поиск лучшей метрики с помощью логистической регрессии

In [163]:
# Гиперпараметры логистической регрессии для перебора с помощью GridsearchCV
C = [1, 0.1, 0.01, 0.001]
iters = [50, 100, 200, 300]
epsilon_stop = [0.0001]

param_grid_logreg = [
    {'C': C,
     'penalty': ['l1'],
     'solver': ['saga', 'liblinear'],
     'class_weight':[None, 'balanced'],
     'multi_class': ['auto', 'ovr'],
     'max_iter': iters,
     'tol': epsilon_stop},
    {'C': C,
     'penalty': ['l2'],
     'solver': ['newton-cg', 'liblinear', 'lbfgs', 'saga'],
     'class_weight':[None, 'balanced'],
     'multi_class': ['auto', 'ovr'],
     'max_iter': iters,
     'tol':epsilon_stop},
    {'penalty': ['none'],
     'solver': ['saga'],
     'class_weight':[None, 'balanced'],
     'multi_class': ['auto', 'ovr'],
     'max_iter': iters,
     'tol':epsilon_stop}]

# Параметры случайного леса
param_grid_random_forest = { 
    'n_estimators': [100, 200, 250, 300, 350, 400],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy'],
}

# Гиперпараметры svc
loss = ['hinge', 'squared_hinge']

param_grid_svc = [
    {'C': C,
     'penalty': ['l1'],
     'loss' : loss,
     'class_weight':[None, 'balanced'],
     'multi_class': ['auto', 'ovr'],
     'max_iter': iters,
     'tol': epsilon_stop},
    {'C': C,
     'penalty': ['l2'],
     'class_weight':[None, 'balanced'],
     'multi_class': ['auto', 'ovr'],
     'max_iter': iters,
     'tol':epsilon_stop}]

In [164]:
def score_model(model, params=None, oversampler=None, cv=None, X_train=None, y_train=None, n_splits=5, random_state=24):
    """
    Creates folds manually, and upsamples within each fold.
    Returns an array of validation (recall) scores
    """
    if cv is None:
        cv = StratifiedKFold(n_splits=n_splits)
    
    thresholds = []
    recall_macro_scores = []
    f1_scores = []
    models = {}
    
    for train_fold_index, val_fold_index in cv.split(X_train, y_train):
        # Get the training data
        X_train_fold, y_train_fold = X_train.iloc[train_fold_index], y_train[train_fold_index]
        # Get the validation data
        X_val_fold, y_val_fold = X_train.iloc[val_fold_index], y_train[val_fold_index]

        # Upsample only the data in the training section
        X_train_fold_upsample, y_train_fold_upsample = oversampler.fit_resample(X_train_fold,
                                                                           y_train_fold)
        # Fit the model on the upsampled training data
        model_obj = model.fit(X_train_fold_upsample, y_train_fold_upsample)
        
        
        # Score the model on the (non-upsampled) validation data
        probs = model_obj.predict_proba(X_val_fold)
        threshold = find_opt_threshold(y_val_fold, probs)
        thresholds.append(threshold)
        y_pred = np.where(probs[:, 1] > threshold, 1, 0)
        
        recall_macro_score = recall_score(y_val_fold, y_pred, average='macro')
        recall_macro_scores.append(recall_macro_score)
        
        f1score = f1_score(y_val_fold, y_pred)
        f1_scores.append(f1score)
    
    
    print('recall_macro_scores: ', recall_macro_scores)
    print('recall_macro_mean: ', np.mean(recall_macro_scores))
    print()
    print('f1_scores: ', f1_scores)
    print('f1_mean: ', np.mean(f1_scores))
    print()
    print('mean_threshold: ', np.mean(thresholds))
    return np.mean(thresholds)

In [165]:
# Модель логистической регрессии с балансировкой выборки.
def fit_model(df, target, model='logreg', scoring='f1',
              split=True,
              reduce_corrs=False, threshold_reduce=0.05,
              grid=False, oversampling=False,  param_grid=None,
              oversampler='random', resample_strategy='auto',
              undersampler=None, undersample_strategy=None,
              return_predicted=False, X_test=None,
              random_state=None, n_splits=5):

    cv_grid = StratifiedKFold(random_state=random_state)

    if reduce_corrs == True:
        df, X_test = reduce_low_corrs_with_target(df=df, target=target,
                                                  threshold=threshold_reduce, return_predicted=return_predicted,
                                                  X_test=X_test)

    used_features = df.columns
    print('Used features: ', list(df.columns))
    print()

    X = df.drop(target, axis=1)
    y = df[target]

    if split == True:
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.20, stratify=y, random_state=random_state)
    else:
        X_train = X.copy()
        y_train = y.copy()

    if model == 'logreg':
        # Логистическая регрессия
        clf = LogisticRegression(random_state=random_state, n_jobs=-1)

        # Поиск гиперпараметров GridSearchCV при помощи перебора параметров по сетке param_grid:
        if grid == True:
            grid_search = GridSearchCV(
                clf, param_grid, scoring=scoring, n_jobs=-1, cv=cv_grid)
            grid_search.fit(X, y)

        # Печатаем параметры развернуто:
            best_model = grid_search.best_estimator_
            best_parameters = best_model.get_params()
            for param_name in sorted(best_parameters.keys()):
                print('\t%s: %r' % (param_name, best_parameters[param_name]))

        clf = LogisticRegression(**best_parameters)

    if model == 'svc':
        # SVC
        clf = LinearSVC(random_state=random_state)

        if grid == True:
            grid_search = GridSearchCV(
                clf, param_grid, scoring=scoring, cv=cv_grid, n_jobs=-1)
            grid_search.fit(X, y)

            best_model = grid_search.best_estimator_
            best_parameters = best_model.get_params()
            for param_name in sorted(best_parameters.keys()):
                print('\t%s: %r' % (param_name, best_parameters[param_name]))

            clf = LinearSVC(**best_parameters)

    print()

    if model == 'kneigh':
        clf = KNeighborsClassifier(
            n_neighbors=5, weights='distance', n_jobs=-1)

    if model == 'svc2':
        clf = SVC(probability=True)

    if model == 'gradboost':
        clf = GradientBoostingClassifier()

    if model == 'randomforest':
        clf = RandomForestClassifier()
        if grid == True:
            grid_search = GridSearchCV(
                clf, param_grid, scoring=scoring, cv=5, n_jobs=-1)
            grid_search.fit(X, y)

            best_model = grid_search.best_estimator_
            best_parameters = best_model.get_params()
            for param_name in sorted(best_parameters.keys()):
                print('\t%s: %r' % (param_name, best_parameters[param_name]))
                print()

    # Балансировка выборки
    if oversampling == True:
        if oversampler == 'random':
            oversampler = RandomOverSampler(
                sampling_strategy=resample_strategy, random_state=random_state)

        if oversampler == 'smote':
            oversampler = SMOTE(
                sampling_strategy=resample_strategy, random_state=random_state)

        threshold = score_model(model=clf, params=best_parameters, oversampler=oversampler,
                                cv=None, X_train=X_train, y_train=y_train, n_splits=n_splits, random_state=random_state)

        X_train, y_train = oversampler.fit_resample(X_train, y_train)

    clf.fit(X_train, y_train)

    probs = clf.predict_proba(X)
    threshold = find_opt_threshold(y, probs)
    print('Self threshold: ', threshold)

    
    coefs_df = pd.DataFrame(data = clf.coef_[0], 
             index = X_train.columns, 
             columns = ['coef'])
    
    print('Coefficients of LogisticRegression:')
    print(coefs_df.sort_values(by='coef', ascending=False))
    print()
    
    print('Odds:')
    odds = np.exp(clf.coef_[0])
    odds_df = pd.DataFrame(odds, 
             X.columns, 
             columns=['coef'])\
            .sort_values(by='coef', ascending=False)
    print(odds_df)
    
    
    if split == True:
        probs_valid = clf.predict_proba(X_valid)
        y_pred = np.where(probs_train[:, 1] > threshold, 1, 0)
        print()
        print('valid_recall_score: ', recall_score(
            y_valid, y_pred, average='macro'))
    else:
        y_pred = np.where(probs[:, 1] > threshold, 1, 0)
        print()
        print('self_recall_score: ', recall_score(y, y_pred, average='macro'))

    if return_predicted == True:
        probs_train = clf.predict_proba(X)
        predicted_train = np.where(probs_train[:, 1] > threshold, 1, 0)
        probs_test = clf.predict_proba(X_test)[:, 1]
        predicted_test = np.where(probs_test > threshold, 1, 0)
        return probs_test, predicted_test
    else:
        return clf

In [166]:
df_ah = pd.concat([train, target_train['arterial_hypertension']], axis=1)

In [167]:
ah = MultiCollinearityEliminator(df_ah, 'arterial_hypertension', 0.6).autoEliminateMulticollinearity()

In [168]:
SEED=None

In [169]:
X_pred = pd.DataFrame()

In [170]:
ah.arterial_hypertension.value_counts()

0    509
1    446
Name: arterial_hypertension, dtype: int64

In [171]:
ah_probs, ah_pred = fit_model(ah, 'arterial_hypertension',
            split=False,
            reduce_corrs=True, threshold_reduce=0.06, 
            model='logreg', grid=True, param_grid=param_grid_logreg, scoring='f1',
            oversampling=True, oversampler='random', resample_strategy='auto',
            return_predicted=True, X_test=X_test, 
            random_state=SEED, n_splits=5)

Used features:  ['crafters_other', 'household', 'farmer/fisherman', 'chronic_lung_disease', 'early_upset', 'midday_sleep', 'married', 'drink_alco_now', 'oncology', 'bronchial_asthma', 'hard_industry', 'smoke', 'fractures', 'unknown_id_feature', 'education', 'sex', 'comorbid_count', 'widowed', 'diabetes', 'retired', 'intake_medicines', 'arterial_hypertension']

	C: 0.1
	class_weight: None
	dual: False
	fit_intercept: True
	intercept_scaling: 1
	l1_ratio: None
	max_iter: 50
	multi_class: 'auto'
	n_jobs: -1
	penalty: 'l1'
	random_state: None
	solver: 'saga'
	tol: 0.0001
	verbose: 0
	warm_start: False

Threshold=0.638, Recall_macro=0.71095
Threshold=0.335, Recall_macro=0.73017
Threshold=0.551, Recall_macro=0.75050
Threshold=0.560, Recall_macro=0.75182
Threshold=0.521, Recall_macro=0.76188
recall_macro_scores:  [0.7109495483586693, 0.7301718440185063, 0.7504957038995373, 0.7518175809649703, 0.7618811881188119]
recall_macro_mean:  0.741063173072099

f1_scores:  [0.6707317073170731, 0.7389162

In [172]:
print('len of predicted: ', sum(ah_pred))
X_pred['arterial_hypertension'] = ah_pred
X_test['arterial_hypertension'] = ah_pred

len of predicted:  303


In [173]:
df_ihd = pd.concat([train, target_train[['arterial_hypertension','IHD']]], axis=1)


In [174]:
ihd = MultiCollinearityEliminator(df_ihd, 'IHD', 0.6).autoEliminateMulticollinearity()

In [175]:
ihd['IHD'].value_counts()

0    838
1    117
Name: IHD, dtype: int64

In [176]:
ihd_probs, ihd_pred = fit_model(ihd, 'IHD', split=False,
                            reduce_corrs=True, threshold_reduce=0.04,
                            model='logreg', grid=True, param_grid=param_grid_logreg, scoring='f1',
                            oversampling=True, oversampler='random', resample_strategy='auto',
                            return_predicted=True, X_test=X_test, random_state=SEED, n_splits=3)


Used features:  ['divorced', 'oncology', 'bronchial_asthma', 'midday_sleep', 'education', 'fracture_on_retire', 'top_management', 'is_russian', 'service', 'household', 'farmer/fisherman', 'fractures', 'tuberculosis', 'stop_work_due_disease', 'smoking_score_int_score_3', 'widowed', 'previous_drink_alco', 'is_comorbid', 'diabetes', 'intake_medicines', 'retired', 'arterial_hypertension', 'IHD']

	C: 1
	class_weight: 'balanced'
	dual: False
	fit_intercept: True
	intercept_scaling: 1
	l1_ratio: None
	max_iter: 50
	multi_class: 'ovr'
	n_jobs: -1
	penalty: 'l1'
	random_state: None
	solver: 'liblinear'
	tol: 0.0001
	verbose: 0
	warm_start: False

Threshold=0.441, Recall_macro=0.74533
Threshold=0.406, Recall_macro=0.75159
Threshold=0.288, Recall_macro=0.75931
recall_macro_scores:  [0.7453296703296703, 0.7515853322304935, 0.7593052109181142]
recall_macro_mean:  0.7520734044927594

f1_scores:  [0.40816326530612246, 0.3804347826086957, 0.3775510204081633]
f1_mean:  0.3887163561076605

mean_thresho

In [177]:
X_pred['IHD'] = ihd_pred
X_test['IHD'] = ihd_pred
#target_train['IHD'] = ihd_train
ihd_pred.sum()

297

In [178]:
df_hf = pd.concat([train, target_train[['arterial_hypertension','IHD','heart_failure']]], axis=1)


In [179]:
hf = MultiCollinearityEliminator(df_hf, 'heart_failure', 0.6).autoEliminateMulticollinearity()

In [180]:
hf['heart_failure'].value_counts()

0    859
1     96
Name: heart_failure, dtype: int64

In [181]:
hf_probs, hf_pred = fit_model(hf, 'heart_failure', 
                            reduce_corrs=True, threshold_reduce=0.04, split=False,
                            model = 'logreg', scoring='f1', grid=True, param_grid=param_grid_logreg,
                            oversampling=True, oversampler='random', resample_strategy='auto',
                            return_predicted=True, X_test=X_test, n_splits=3,
                            random_state=SEED)


Used features:  ['lately_upset', 'oncology', 'previous_smoke', 'top_management', 'married', 'crafters_other', 'low_qualified', 'tuberculosis', 'chronic_lung_disease', 'cohabited', 'passive_smoking_frequency', 'stop_work_due_disease', 'id_2', 'fractures', 'low_amount_sleep', 'farmer/fisherman', 'id_4', 'sleeptime', 'sleep_upset_after_9', 'education', 'sex', 'never_smoke', 'household', 'diabetes', 'bronchial_asthma', 'comorbid_count', 'drink_alco_now', 'widowed', 'job', 'intake_medicines', 'arterial_hypertension', 'IHD', 'heart_failure']

	C: 0.01
	class_weight: 'balanced'
	dual: False
	fit_intercept: True
	intercept_scaling: 1
	l1_ratio: None
	max_iter: 50
	multi_class: 'auto'
	n_jobs: -1
	penalty: 'l1'
	random_state: None
	solver: 'saga'
	tol: 0.0001
	verbose: 0
	warm_start: False

Threshold=0.466, Recall_macro=0.84881
Threshold=0.401, Recall_macro=0.76541
Threshold=0.413, Recall_macro=0.74279
recall_macro_scores:  [0.8488131533101045, 0.7654064685314685, 0.7427884615384616]
recall_mac

In [182]:
X_pred['heart_failure'] = hf_pred
X_test['heart_failure'] = hf_pred
#target_train['heart_failure'] = hf_train 
sum(hf_pred)

299

# Предсказание ACD

In [183]:
df_stroke = pd.concat([train, target_train[['arterial_hypertension','IHD','heart_failure', 'stroke']]], axis=1)


In [184]:
stroke = MultiCollinearityEliminator(df_stroke, 'stroke', 0.6).autoEliminateMulticollinearity()

In [185]:
stroke['stroke'].value_counts()

0    914
1     41
Name: stroke, dtype: int64

In [186]:
stroke_probs, stroke_pred = fit_model(stroke, 'stroke', split=False,
                            reduce_corrs=True, threshold_reduce=0.03,
                            model='logreg', scoring='f1', grid=True, param_grid=param_grid_logreg, 
                            oversampling=True, oversampler='random', resample_strategy='auto',
                            return_predicted=True, X_test=X_test, random_state=SEED, n_splits=3)


Used features:  ['tuberculosis', 'sleep_upset_7', 'religion_clubs', 'early_upset', 'european', 'sleeptime', 'sleep_onset_early_22', 'top_management', 'id_3', 'diabetes', 'bronchial_asthma', 'junior_specialists', 'single', 'passive_smoking_frequency', 'alcohol_duration_score', 'sleep_onset_23', 'christian', 'farmer/fisherman', 'previous_smoke', 'smoke', 'previous_drink_alco', 'education', 'cardio_score', 'fractures', 'midday_sleep', 'sex', 'job', 'early_onset', 'id_1', 'stop_work_due_disease', 'crafters_other', 'intake_medicines', 'arterial_hypertension', 'stroke']

	C: 0.01
	class_weight: 'balanced'
	dual: False
	fit_intercept: True
	intercept_scaling: 1
	l1_ratio: None
	max_iter: 50
	multi_class: 'ovr'
	n_jobs: -1
	penalty: 'l2'
	random_state: None
	solver: 'saga'
	tol: 0.0001
	verbose: 0
	warm_start: False

Threshold=0.496, Recall_macro=0.71452
Threshold=0.384, Recall_macro=0.66898
Threshold=0.376, Recall_macro=0.74013
recall_macro_scores:  [0.7145199063231851, 0.6689785624211854, 0.

In [187]:
X_pred['stroke'] = stroke_pred
X_test['stroke'] = stroke_pred
sum(stroke_pred)

211

## Предсказание other

In [188]:
df_other = pd.concat([train, target_train[['arterial_hypertension',
                   'IHD', 'heart_failure', 'stroke', 'other_cardio_diseases']]], axis=1)

In [189]:
other = MultiCollinearityEliminator(df_other, 'other_cardio_diseases', 0.6).autoEliminateMulticollinearity()

In [190]:
other['other_cardio_diseases'].value_counts()

0    869
1     86
Name: other_cardio_diseases, dtype: int64

In [191]:
other_probs, other_pred = fit_model(other, 'other_cardio_diseases', split=False,
                            reduce_corrs=True, threshold_reduce=0.03,
                            model='logreg', scoring='f1', grid=True, param_grid=param_grid_logreg,
                            oversampling=True, oversampler='random', resample_strategy='auto',
                            return_predicted=True, X_test=X_test, random_state=SEED, n_splits=2)


Used features:  ['stroke', 'sport', 'stop_work_due_disease', 'service', 'sleep_upset_early_6', 'widowed', 'single', 'diabetes', 'sleep_onset_23', 'fractures', 'office', 'sleep_upset_7', 'trauma_last_year', 'trauma_on_retire', 'top_management', 'ciggaretes_per_day_score', 'high_amout_sleep', 'early_upset', 'religion_clubs', 'senior_specialist', 'IHD', 'european', 'id_2', 'oncology', 'low_amount_sleep', 'christian', 'midday_sleep', 'low_qualified', 'alcohol_score', 'job', 'chronic_lung_disease', 'is_comorbid', 'sex', 'arterial_hypertension', 'hard_industry', 'intake_medicines', 'heart_failure', 'other_cardio_diseases']

	C: 0.1
	class_weight: 'balanced'
	dual: False
	fit_intercept: True
	intercept_scaling: 1
	l1_ratio: None
	max_iter: 50
	multi_class: 'auto'
	n_jobs: -1
	penalty: 'l2'
	random_state: None
	solver: 'liblinear'
	tol: 0.0001
	verbose: 0
	warm_start: False

Threshold=0.393, Recall_macro=0.67837
Threshold=0.417, Recall_macro=0.67469
recall_macro_scores:  [0.6783747661053194, 0

In [192]:
X_pred['other_cardio_diseases'] = other_pred
X_test['other_cardio_diseases'] = other_pred
sum(other_pred)

317

In [193]:
result = X_pred[['arterial_hypertension','stroke', 'IHD', 'heart_failure', 'other_cardio_diseases']].copy()

In [194]:
submit = pd.read_csv('data/sample_solution.csv')

In [195]:
submit.iloc[:,1:] = result.values

In [196]:
submit.iloc[:,1:].sum()

Артериальная гипертензия              303
ОНМК                                  211
Стенокардия, ИБС, инфаркт миокарда    297
Сердечная недостаточность             299
Прочие заболевания сердца             317
dtype: int64

In [197]:
#submit.to_csv('submits/submit_N.csv', index=False)