In [1120]:
# Disable warnings
import sys, os, warnings

# Only install the blanket "ignore" filter when sys.warnoptions is empty,
# so warning options that were set explicitly are left alone.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

In [1121]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime as dt
from datetime import time

from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve

from imblearn.over_sampling import RandomOverSampler,  SMOTE
from imblearn.under_sampling import RandomUnderSampler

from xgboost import XGBClassifier

from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier


from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score



In [1122]:
#Feature selection class to eliminate multicollinearity
class MultiCollinearityEliminator():
    """Iteratively drop features until no pair of features is too strongly correlated.

    Two features count as correlated when the absolute Pearson correlation
    between them is strictly greater than ``threshold``. On every iteration the
    correlated feature that is least correlated (in absolute value) with the
    target is removed from the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and the target column.
    target : label
        Name of the target column inside ``df``.
    threshold : float
        Absolute correlation above which two features are considered collinear.
    """

    #Class Constructor
    def __init__(self, df, target, threshold):
        self.df = df
        self.target = target
        self.threshold = threshold

    #Method to create and return the feature correlation matrix dataframe
    def createCorrMatrix(self, include_target = False):
        """Return the absolute Pearson correlation matrix of ``self.df``.

        The target column is left out unless ``include_target`` is truthy.
        The method is pinned to Pearson so the result does not depend on the
        default of ``DataFrame.corr``. ``min_periods=30`` makes pandas return
        NaN for column pairs with fewer than 30 paired observations.
        """
        if include_target:
            frame = self.df
        else:
            frame = self.df.drop([self.target], axis=1)
        return frame.corr(method='pearson', min_periods=30).abs()

    #Method to create and return the feature to target correlation matrix dataframe
    def createCorrMatrixWithTarget(self):
        """Return (and print) each feature's absolute correlation with the target.

        The result is a one-column DataFrame indexed by feature name, without
        the target's own row, sorted in ascending order. The first rows are
        therefore the features that carry the least linear information about
        the target, which is what ``deleteFeatures`` removes first.
        """
        corrMatrix = self.createCorrMatrix(include_target = True)
        corrWithTarget = (
            pd.DataFrame(corrMatrix.loc[:, self.target])
            .drop([self.target], axis=0)
            .sort_values(by=self.target)
        )
        print(corrWithTarget, '\n')
        return corrWithTarget

    #Method to create and return the list of correlated features
    def createCorrelatedFeaturesList(self):
        """Return (and print) the features taking part in a too-correlated pair.

        Only a feature's correlation with itself (the diagonal) is ignored.
        Distinct features with an absolute correlation of exactly 1 are reported
        as well; the previous ``value < 1`` filter silently skipped them.
        """
        corrMatrix = self.createCorrMatrix(include_target = False)
        colCorr = []
        for column in corrMatrix.columns:
            column_corr = corrMatrix[column]
            # NaN entries compare False and are therefore never selected
            for idx in column_corr.index[column_corr > self.threshold]:
                if idx == column:
                    # diagonal entry: a feature is always correlated with itself
                    continue
                #Adding the features that are not already in the list of correlated features
                if idx not in colCorr:
                    colCorr.append(idx)
                if column not in colCorr:
                    colCorr.append(column)
        print(colCorr, '\n')
        return colCorr

    #Method to eliminate the least important features from the list of correlated features
    def deleteFeatures(self, colCorr):
        """Drop one feature from ``colCorr`` and return the updated frame.

        Features are visited in ascending order of absolute correlation with
        the target; the first one that appears in ``colCorr`` is dropped.
        Every visited feature name is printed.
        """
        corrWithTarget = self.createCorrMatrixWithTarget()
        for idx in corrWithTarget.index:
            print(idx, '\n')
            if idx in colCorr:
                self.df = self.df.drop(idx, axis=1)
                break
        return self.df

    #Method to run automatically eliminate multicollinearity
    def autoEliminateMulticollinearity(self):
        """Drop features one at a time until no correlated pair is left.

        Returns the reduced DataFrame, which is also stored in ``self.df``.
        """
        colCorr = self.createCorrelatedFeaturesList()
        while colCorr:
            # remove the correlated feature that is least correlated with the target
            self.df = self.deleteFeatures(colCorr)
            # recompute: dropping a feature changes which pairs remain
            colCorr = self.createCorrelatedFeaturesList()
        return self.df

In [1123]:
# Collect the diagonal and lower-triangle label pairs of a correlation matrix
def get_redundant_pairs(df):
    '''Return the set of (column_i, column_j) label pairs of ``df`` with j <= i.

    These are the diagonal and lower-triangular positions of a square matrix
    indexed by ``df.columns`` on both axes.
    '''
    labels = df.columns
    return {
        (row_label, col_label)
        for row_pos, row_label in enumerate(labels)
        for col_label in labels[:row_pos + 1]
    }

In [1124]:
# Compute the F-score at every decision threshold of the precision-recall curve
def calc_f_score(y_valid, probs):
    """Return ``(f_score, threshold)`` arrays for the given labels and scores.

    ``f_score[i]`` is 2 * precision * recall / (precision + recall) at
    ``threshold[i]``, both taken from ``precision_recall_curve``.
    """
    precision, recall, threshold = precision_recall_curve(y_valid, probs)
    pr_sum = precision + recall
    # To avoid a division error, a zero denominator is replaced by -1
    # (the numerator is 0 there as well, so the score comes out as zero).
    safe_sum = np.where(pr_sum == 0, -1, pr_sum)
    scores = 2 * precision * recall / safe_sum
    # The curve has one more point than it has thresholds; drop the last point
    # so both returned arrays have the same length.
    return scores[:-1], threshold

In [1125]:
# Find the decision threshold that gives the highest F-score
def calc_optimal_f_score(y_valid, probs):
    """Print the best F-score and its threshold, and return that threshold.

    The candidates come from ``calc_f_score``; on ties the first maximum wins.
    """
    scores, cutoffs = calc_f_score(y_valid, probs)
    best_position = np.argmax(scores)
    threshold = cutoffs[best_position]
    best_f_score = max(scores)
    print(f'best_threshold = {round(threshold, 4)}')
    print(f'best_f_score = {round(best_f_score,4)}')
    return threshold

In [1126]:
# List the column pairs with the highest absolute correlation with each other
def get_top_abs_correlations(df, n=10):
    """Return the ``n`` most strongly correlated distinct column pairs of ``df``.

    The result is a Series indexed by (column, column) pairs, holding absolute
    correlations sorted in descending order. Self-pairs and mirrored duplicates
    are removed, so each unordered pair appears once.
    """
    corr_matrix = df.corr().abs()
    au_corr = corr_matrix.unstack()
    # Build the redundant pairs from the correlation matrix rather than from df:
    # corr() may leave out columns of df, and dropping labels that are not in
    # the index raises a KeyError.
    labels_to_drop = get_redundant_pairs(corr_matrix)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr.iloc[0:n]

In [1127]:
# Configure pandas so that all columns are displayed
pd.set_option('display.max_columns', None)

# Подготовка и очистка данных 

In [1128]:
df_train = pd.read_csv('train.csv')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955 entries, 0 to 954
Data columns (total 39 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   ID                                     955 non-null    object 
 1   Пол                                    954 non-null    object 
 2   Семья                                  955 non-null    object 
 3   Этнос                                  955 non-null    object 
 4   Национальность                         955 non-null    object 
 5   Религия                                955 non-null    object 
 6   Образование                            955 non-null    object 
 7   Профессия                              955 non-null    object 
 8   Вы работаете?                          955 non-null    int64  
 9   Выход на пенсию                        955 non-null    int64  
 10  Прекращение работы по болезни          955 non-null    int64  
 11  Сахарн

In [1129]:
df_test = pd.read_csv('test_dataset_test.csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638 entries, 0 to 637
Data columns (total 33 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   ID                                     638 non-null    object 
 1   Пол                                    638 non-null    object 
 2   Семья                                  638 non-null    object 
 3   Этнос                                  638 non-null    object 
 4   Национальность                         638 non-null    object 
 5   Религия                                638 non-null    object 
 6   Образование                            638 non-null    object 
 7   Профессия                              638 non-null    object 
 8   Вы работаете?                          638 non-null    int64  
 9   Выход на пенсию                        638 non-null    int64  
 10  Прекращение работы по болезни          638 non-null    int64  
 11  Сахарн

In [1130]:
# To process the features consistently, the train and test sets are merged into one dataset.

# Split the target values (the last five columns) off the training frame.
# .copy() detaches both pieces from the frame that was read from disk, so the
# `train` flag assigned below is written to an independent frame, not a slice.
# NOTE: this cell is not idempotent - re-running it without re-reading
# train.csv strips five more columns from df_train.
target_train = df_train.iloc[:, -5:].copy()
df_train = df_train.iloc[:, :-5].copy()

# Helper flag marking the train and test parts of the combined dataset
df_train['train'] = 1
df_test['train'] = 0

# Merge into a single dataset
df = pd.concat([df_train, df_test]).reset_index(drop=True)

## df.head(1)

In [1131]:
df.columns

Index(['ID', 'Пол', 'Семья', 'Этнос', 'Национальность', 'Религия',
       'Образование', 'Профессия', 'Вы работаете?', 'Выход на пенсию',
       'Прекращение работы по болезни', 'Сахарный диабет', 'Гепатит',
       'Онкология', 'Хроническое заболевание легких', 'Бронжиальная астма',
       'Туберкулез легких ', 'ВИЧ/СПИД',
       'Регулярный прим лекарственных средств', 'Травмы за год', 'Переломы',
       'Статус Курения', 'Возраст курения', 'Сигарет в день',
       'Пассивное курение', 'Частота пасс кур', 'Алкоголь', 'Возраст алког',
       'Время засыпания', 'Время пробуждения', 'Сон после обеда',
       'Спорт, клубы', 'Религия, клубы', 'ID_y', 'train'],
      dtype='object')

In [1132]:
# Replace the Russian column headers with short English identifiers.
# The assignment is positional: the list needs one name per column, in the current column order.
# NOTE(review): the source headers behind `smoking_duration` and `alcohol_duration`
# ('Возраст курения', 'Возраст алког') read as "age" rather than "duration" - confirm the intended meaning.
df.columns = ['id', 'sex', 'family', 'ethnos', 'nationality', 'religion', 'education', 'profession',
              'job', 'retired', 'stop_work_due_disease', 'diabetes', 'hepatitis', 'oncology',
              'chronic_lung_disease', 'bronchial_asthma', 'tuberculosis', 'hiv/aids', 'intake_medicines',
              'trauma_last_year', 'fractures', 'smoking', 'smoking_duration', 'ciggaretes_per_day',
              'passive_smoking', 'passive_smoking_frequency', 'alcohol', 'alcohol_duration', 'time_fall_asleep',
              'time_awakening', 'midday_sleep', 'sport_clubs', 'religion_clubs', 'id_y', 'train']

In [1133]:
target_cols = ['arterial_hypertension', 'ACD',
                        'IHD', 'heart_failure', 'other_cardio_diseases']

In [1134]:
target_train.columns = target_cols

In [1135]:
# Проверка на совпадение ID в train
#print((df[df.train==1].id == df[df.train==1].id_y).sum() / len(df[df.train==1]))
# Проверка на совпадение ID в test и submit
#print((df[df.train==0].id.reset_index(drop=True) == submit.ID).sum() / len(submit))

In [1136]:
# Keep the last character of every ID, parsed as an integer, as a feature
df['unknown_id_feature'] = df['id'].map(lambda raw_id: int(raw_id[-1:]))

# One-hot encode that value into id_<value> indicator columns
id_features = pd.get_dummies(df['unknown_id_feature'], prefix='id')

# Attach the indicators and remove the raw identifier columns
df = pd.concat([df, id_features], axis=1).drop(columns=['id', 'id_y'])

In [1137]:
# Общее количество пропусков в данных
df.isna().sum()

sex                             1
family                          0
ethnos                          0
nationality                     0
religion                        0
education                       0
profession                      0
job                             0
retired                         0
stop_work_due_disease           0
diabetes                        0
hepatitis                       0
oncology                        0
chronic_lung_disease            0
bronchial_asthma                0
tuberculosis                    0
hiv/aids                        0
intake_medicines                0
trauma_last_year                0
fractures                       0
smoking                         0
smoking_duration              912
ciggaretes_per_day            919
passive_smoking                 0
passive_smoking_frequency    1216
alcohol                         0
alcohol_duration              304
time_fall_asleep                0
time_awakening                  0
midday_sleep  

In [1138]:
x = 'sex'

In [1139]:
x = ['sex']

In [1140]:
y = df.columns.to_list()

In [1141]:
x.extend(y)

In [1142]:
x

['sex',
 'sex',
 'family',
 'ethnos',
 'nationality',
 'religion',
 'education',
 'profession',
 'job',
 'retired',
 'stop_work_due_disease',
 'diabetes',
 'hepatitis',
 'oncology',
 'chronic_lung_disease',
 'bronchial_asthma',
 'tuberculosis',
 'hiv/aids',
 'intake_medicines',
 'trauma_last_year',
 'fractures',
 'smoking',
 'smoking_duration',
 'ciggaretes_per_day',
 'passive_smoking',
 'passive_smoking_frequency',
 'alcohol',
 'alcohol_duration',
 'time_fall_asleep',
 'time_awakening',
 'midday_sleep',
 'sport_clubs',
 'religion_clubs',
 'train',
 'unknown_id_feature',
 'id_1',
 'id_2',
 'id_3',
 'id_4']

In [1143]:
df.columns.to_list()

['sex',
 'family',
 'ethnos',
 'nationality',
 'religion',
 'education',
 'profession',
 'job',
 'retired',
 'stop_work_due_disease',
 'diabetes',
 'hepatitis',
 'oncology',
 'chronic_lung_disease',
 'bronchial_asthma',
 'tuberculosis',
 'hiv/aids',
 'intake_medicines',
 'trauma_last_year',
 'fractures',
 'smoking',
 'smoking_duration',
 'ciggaretes_per_day',
 'passive_smoking',
 'passive_smoking_frequency',
 'alcohol',
 'alcohol_duration',
 'time_fall_asleep',
 'time_awakening',
 'midday_sleep',
 'sport_clubs',
 'religion_clubs',
 'train',
 'unknown_id_feature',
 'id_1',
 'id_2',
 'id_3',
 'id_4']

In [1144]:
def check_corrs_with_target(df, target_train=target_train, col=None):
    """Show how the selected features correlate with each target on the training rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined dataset; rows with ``train == 1`` are used.
    target_train : pandas.DataFrame
        Target columns, aligned with the training rows by index.
    col : label or list of labels, optional
        Feature(s) to report. ``None`` (the default) reports every numeric
        feature column. The previous default was the column list captured when
        the function was defined, which went stale as soon as ``df`` changed.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Correlations with the targets: a Series for a single label, a DataFrame
        (features x targets) for a list of labels.
    """
    train_rows = df[df.train == 1]
    df_for_check = pd.concat([train_rows, target_train], axis=1)
    cols_target = target_train.columns.to_list()
    # Restrict to numeric/boolean columns explicitly: text columns cannot be
    # correlated and make DataFrame.corr raise on recent pandas versions.
    corrs = df_for_check.select_dtypes(include=['number', 'bool']).corr()
    if col is None:
        col = [name for name in corrs.index if name not in cols_target]
    print('corrs_matrix_between: ', col)
    return corrs.loc[col, cols_target]

### Колонка sex

In [1145]:
df.sex.value_counts(dropna=False)

Ж      1117
М       475
NaN       1
Name: sex, dtype: int64

In [1146]:
# Fill the missing value with the mode (here: 'Ж').
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df.sex: that attribute access can hand back a
# temporary Series, in which case the in-place fill never reaches df.
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])

In [1147]:
df.sex.value_counts(dropna=False)

Ж    1118
М     475
Name: sex, dtype: int64

In [1148]:
df.sex = np.where(df.sex == 'Ж', 0, 1)

In [1149]:
check_corrs_with_target(df, col = ['sex'])

corrs_matrix_between:  ['sex']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
sex,-0.119174,0.075247,0.005842,-0.08241,-0.086526


### Колонка family

In [1150]:
df.family.value_counts(dropna=False)

в браке в настоящее время                          937
вдовец / вдова                                     246
в разводе                                          201
гражданский брак / проживание с партнером          129
никогда не был(а) в браке                           77
раздельное проживание (официально не разведены)      3
Name: family, dtype: int64

In [1151]:
# Explicit category -> English name mapping. Renaming by label (not by position)
# keeps the names correct even if the set or order of categories changes.
family_names = {
    'в браке в настоящее время': 'married',
    'в разводе': 'divorced',
    'вдовец / вдова': 'widowed',
    'гражданский брак / проживание с партнером': 'cohabited',
    'никогда не был(а) в браке': 'single',
    'раздельное проживание (официально не разведены)': 'separated',
}
fam_cols = list(family_names.values())

family = pd.get_dummies(df.family)
print('before replace: ', list(family.columns))
# Fail loudly on a category that has no English name instead of mislabelling it
unknown_family = set(family.columns) - set(family_names)
if unknown_family:
    raise ValueError(f'Unexpected family categories: {sorted(unknown_family)}')
# reindex fixes the column order and adds an all-zero column for an absent category
family = family.rename(columns=family_names).reindex(columns=fam_cols, fill_value=0)
print('after replace: ', list(family.columns))

before replace:  ['в браке в настоящее время', 'в разводе', 'вдовец / вдова', 'гражданский брак / проживание с партнером', 'никогда не был(а) в браке', 'раздельное проживание (официально не разведены)']
after replace:  ['married', 'divorced', 'widowed', 'cohabited', 'single', 'separated']


In [1152]:
df = pd.concat([df, family], axis=1)
df.drop('family', axis=1, inplace=True)

In [1153]:
check_corrs_with_target(df, col = fam_cols)

corrs_matrix_between:  ['married', 'divorced', 'widowed', 'cohabited', 'single', 'separated']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
married,-0.080766,0.01921,-0.025806,-0.046685,-0.02751
divorced,-0.014791,0.009703,-0.04085,-0.005838,0.029754
widowed,0.189502,0.012461,0.093806,0.132974,0.032011
cohabited,-0.03729,-0.026097,-0.01946,-0.049826,-0.001516
single,-0.051434,-0.045989,-0.004129,-0.039003,-0.033028
separated,-0.042882,-0.009703,-0.017117,-0.015315,-0.014411


### Колонка ethnos

In [1154]:
df.ethnos.value_counts(dropna=False)

европейская                                                                                                      1559
другая азиатская (Корея, Малайзия, Таиланд, Вьетнам, Казахстан, Киргизия, Туркмения, Узбекистан, Таджикистан)      24
прочее (любая иная этно-расовая группа, не представленная выше)                                                    10
Name: ethnos, dtype: int64

In [1155]:
# Explicit category -> English name mapping. Renaming by label (not by position)
# keeps the names correct even if the set or order of categories changes.
ethnos_names = {
    'другая азиатская (Корея, Малайзия, Таиланд, Вьетнам, Казахстан, Киргизия, Туркмения, Узбекистан, Таджикистан)': 'asian',
    'европейская': 'european',
    'прочее (любая иная этно-расовая группа, не представленная выше)': 'other',
}
ethnos_col = list(ethnos_names.values())

ethnos = pd.get_dummies(df.ethnos)
print('before replace: ', list(ethnos.columns))
# Fail loudly on a category that has no English name instead of mislabelling it
unknown_ethnos = set(ethnos.columns) - set(ethnos_names)
if unknown_ethnos:
    raise ValueError(f'Unexpected ethnos categories: {sorted(unknown_ethnos)}')
# reindex fixes the column order and adds an all-zero column for an absent category
ethnos = ethnos.rename(columns=ethnos_names).reindex(columns=ethnos_col, fill_value=0)
print('after replace: ', list(ethnos.columns))

before replace:  ['другая азиатская (Корея, Малайзия, Таиланд, Вьетнам, Казахстан, Киргизия, Туркмения, Узбекистан, Таджикистан)', 'европейская', 'прочее (любая иная этно-расовая группа, не представленная выше)']
after replace:  ['asian', 'european', 'other']


In [1156]:
df = pd.concat([df, ethnos], axis=1)

In [1157]:
df.drop('ethnos', axis=1, inplace=True)

In [1158]:
check_corrs_with_target(df, col=ethnos_col)

corrs_matrix_between:  ['asian', 'european', 'other']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
asian,0.000964,-0.028513,0.022155,0.007666,-0.042351
european,0.002794,0.034006,-0.001218,0.009179,0.050509
other,-0.006621,-0.0182,-0.032108,-0.028727,-0.027032


### Колонка nationality

In [1159]:
df.nationality.value_counts(dropna=False)

Русские                  1500
Татары                     33
Немцы                      10
Украинцы                   10
Чуваши                      9
Азербайджанцы               4
Белорусы                    3
Другие национальности       3
Казахи                      3
Армяне                      3
Удмурты                     2
Евреи                       2
Таджики                     2
Киргизы                     2
Мордва                      2
Башкиры                     1
Молдаване                   1
Буряты                      1
Эстонцы                     1
Лезгины                     1
Name: nationality, dtype: int64

In [1160]:
# Add an is_russian flag, since the counts of the other nationalities are too small to be representative
df['is_russian'] = np.where(df['nationality'] == 'Русские', 1, 0)

In [1161]:
# Удаление исходной колонки
df.drop('nationality', axis=1, inplace=True)

In [1162]:
check_corrs_with_target(df, col=['is_russian'])

corrs_matrix_between:  ['is_russian']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
is_russian,-0.016501,0.008887,-0.056265,-0.035137,0.016238


### Колонка religion

In [1163]:
df.religion.value_counts(dropna=False)

Христианство         1361
Атеист / агностик     127
Нет                    73
Ислам                  30
Другое                  1
Индуизм                 1
Name: religion, dtype: int64

In [1164]:
# Explicit category -> English name mapping. Renaming by label (not by position)
# keeps the names correct even if the set or order of categories changes.
religion_names = {
    'Атеист / агностик': 'atheist',
    'Другое': 'other',
    'Индуизм': 'hinduist',
    'Ислам': 'muslim',
    'Нет': 'no',
    'Христианство': 'christian',
}
religion_cols = list(religion_names.values())

religion = pd.get_dummies(df.religion)
print('before replace: ', list(religion.columns))
# Fail loudly on a category that has no English name instead of mislabelling it
unknown_religion = set(religion.columns) - set(religion_names)
if unknown_religion:
    raise ValueError(f'Unexpected religion categories: {sorted(unknown_religion)}')
# reindex fixes the column order and adds an all-zero column for an absent category
religion = religion.rename(columns=religion_names).reindex(columns=religion_cols, fill_value=0)
print('after replace: ', list(religion.columns))

before replace:  ['Атеист / агностик', 'Другое', 'Индуизм', 'Ислам', 'Нет', 'Христианство']
after replace:  ['atheist', 'other', 'hinduist', 'muslim', 'no', 'christian']


In [1165]:
religion.drop(['atheist', 'hinduist','muslim','no','other'], axis=1, inplace=True)

In [1166]:
df = pd.concat([df, religion], axis=1)

In [1167]:
df.drop('religion', axis=1, inplace=True)

In [1168]:
check_corrs_with_target(df, col='christian')

corrs_matrix_between:  christian


arterial_hypertension    0.049008
ACD                      0.049311
IHD                      0.004372
heart_failure            0.020798
other_cardio_diseases    0.056125
Name: christian, dtype: float64

### Колонка education

In [1169]:
df.education.value_counts(dropna=False)

4 - профессиональное училище                         757
5 - ВУЗ                                              547
3 - средняя школа / закон.среднее / выше среднего    279
2 - начальная школа                                   10
Name: education, dtype: int64

In [1170]:
# Convert to an ordinal feature: the leading digit of the label, minus one
# Subtracting one is not strictly necessary, since StandardScaler is applied later anyway
df.education = df.education.apply(lambda x: int(x[0])-1)

In [1171]:
check_corrs_with_target(df, col='education')

corrs_matrix_between:  education


arterial_hypertension   -0.106846
ACD                     -0.064249
IHD                     -0.050195
heart_failure           -0.080554
other_cardio_diseases    0.016931
Name: education, dtype: float64

### Колонка profession

In [1172]:
df.profession.value_counts(dropna=False)

дипломированные специалисты                                                            382
низкоквалифицированные работники                                                       225
работники,  занятые в сфере обслуживания, торговые работники магазинов и рынков        205
ремесленники и представители других отраслей промышленности                            149
служащие                                                                               142
техники и младшие специалисты                                                          140
ведение домашнего хозяйства                                                            133
операторы и монтажники установок и машинного оборудования                              128
представители   законодат.   органов   власти,  высокопостав. долж.лица и менеджеры     55
квалифицированные работники сельского хозяйства и рыболовного                           25
вооруженные силы                                                                         9

In [1173]:
profession = pd.get_dummies(df.profession)
profession.columns

Index(['ведение домашнего хозяйства', 'вооруженные силы',
       'дипломированные специалисты',
       'квалифицированные работники сельского хозяйства и рыболовного',
       'низкоквалифицированные работники',
       'операторы и монтажники установок и машинного оборудования',
       'представители   законодат.   органов   власти,  высокопостав. долж.лица и менеджеры',
       'работники,  занятые в сфере обслуживания, торговые работники магазинов и рынков',
       'ремесленники и представители других отраслей промышленности',
       'служащие', 'техники и младшие специалисты'],
      dtype='object')

In [1174]:
# English names for the profession indicator columns.
# They are assigned by position, so the order of this list must match the
# current column order of `profession`.
profession_cols = ['household','military','senior_specialist','farmer/fisherman','low_qualified',
                      'hard_industry','top_management','service','crafters_other', 'office', 'junior_specialists']

profession.columns = profession_cols 

In [1175]:
df = pd.concat([df, profession], axis=1)
df.drop('profession', axis=1, inplace=True)

In [1176]:
check_corrs_with_target(df, col=profession_cols)

corrs_matrix_between:  ['household', 'military', 'senior_specialist', 'farmer/fisherman', 'low_qualified', 'hard_industry', 'top_management', 'service', 'crafters_other', 'office', 'junior_specialists']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
household,0.061559,0.009688,0.05819,0.098212,-0.016993
military,0.019338,-0.015365,-0.027108,-0.024253,0.027865
senior_specialist,-0.049845,-0.020203,-0.004255,-0.037893,0.049462
farmer/fisherman,0.064455,0.049622,0.07046,0.060338,0.012978
low_qualified,-0.008645,-0.028277,0.00993,-0.048256,-0.056464
hard_industry,-0.092412,0.017783,-0.022026,-0.016324,-0.089831
top_management,-0.01503,-0.04131,-0.055884,-0.046672,-0.041891
service,-0.009041,-0.019749,-0.057863,0.037795,0.031926
crafters_other,0.060903,0.109801,0.012044,-0.047276,0.012399
office,-0.021777,0.002407,-0.000286,0.023276,0.03625


### Колонка job

In [1177]:
df.job.value_counts(dropna=False)

1    865
0    728
Name: job, dtype: int64

In [1178]:
check_corrs_with_target(df, col='job')

corrs_matrix_between:  job


arterial_hypertension   -0.224856
ACD                     -0.077221
IHD                     -0.206260
heart_failure           -0.176134
other_cardio_diseases   -0.058784
Name: job, dtype: float64

### Колонка retired

In [1179]:
df.retired.value_counts(dropna=False)

0    1089
1     504
Name: retired, dtype: int64

In [1180]:
check_corrs_with_target(df, col='retired')

corrs_matrix_between:  retired


arterial_hypertension    0.255916
ACD                      0.057585
IHD                      0.221894
heart_failure            0.175824
other_cardio_diseases    0.032419
Name: retired, dtype: float64

### Колонка stop_work_due_disease

In [1181]:
df.stop_work_due_disease.value_counts(dropna=False)

0    1509
1      84
Name: stop_work_due_disease, dtype: int64

In [1182]:
check_corrs_with_target(df, col = 'stop_work_due_disease')

corrs_matrix_between:  stop_work_due_disease


arterial_hypertension    0.044279
ACD                      0.097109
IHD                      0.080011
heart_failure            0.054904
other_cardio_diseases    0.031735
Name: stop_work_due_disease, dtype: float64

### Колонка diabetes

In [1183]:
df.diabetes.value_counts(dropna=False)

0    1441
1     152
Name: diabetes, dtype: int64

### Колонка hepatitis

In [1184]:
df.hepatitis.value_counts(dropna=False)

0    1410
1     183
Name: hepatitis, dtype: int64

### Колонка oncology

In [1185]:
df.oncology.value_counts(dropna=False)

0    1521
1      72
Name: oncology, dtype: int64

### Колонка chronic_lung_disease

In [1186]:
df.chronic_lung_disease.value_counts(dropna=False)

0    1469
1     124
Name: chronic_lung_disease, dtype: int64

### Колонка bronchial_asthma

In [1187]:
df.bronchial_asthma.value_counts(dropna=False)

0    1530
1      63
Name: bronchial_asthma, dtype: int64

### Колонка tuberculosis

In [1188]:
df.tuberculosis.value_counts(dropna=False)

0    1565
1      28
Name: tuberculosis, dtype: int64

### Колонка hiv/aids

In [1189]:
df['hiv/aids'].value_counts(dropna=False)

0    1591
1       2
Name: hiv/aids, dtype: int64

In [1190]:
check_corrs_with_target(df, col=['diabetes', 'hepatitis', 'oncology',
                        'chronic_lung_disease', 'bronchial_asthma', 'hiv/aids'])

corrs_matrix_between:  ['diabetes', 'hepatitis', 'oncology', 'chronic_lung_disease', 'bronchial_asthma', 'hiv/aids']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
diabetes,0.230891,0.044747,0.141505,0.100181,0.034551
hepatitis,-0.030354,-0.014875,-0.020499,0.004706,-0.004002
oncology,0.090267,0.003835,0.042075,0.044964,0.055175
chronic_lung_disease,0.066575,-0.001782,0.002166,0.049622,0.062571
bronchial_asthma,0.091658,-0.044858,0.046903,0.100989,0.023601
hiv/aids,-0.042882,-0.009703,-0.017117,-0.015315,-0.014411


# ADD FEATURE

In [1191]:
# Columns related to comorbidity
# NOTE(review): df also has a 'tuberculosis' disease flag that is not listed here -
# confirm whether leaving it out is intentional.
comorbid_cols = ['diabetes', 'hepatitis', 'oncology',
                        'chronic_lung_disease', 'bronchial_asthma', 'hiv/aids']

In [1192]:
df[comorbid_cols].apply(lambda x: sum(x))

diabetes                152
hepatitis               183
oncology                 72
chronic_lung_disease    124
bronchial_asthma         63
hiv/aids                  2
dtype: int64

In [1193]:
df['comorbid_count'] = df[comorbid_cols].sum(axis=1)
df['is_comorbid'] = np.where(df['comorbid_count']>0, 1, 0)

In [1194]:
check_corrs_with_target(df, col = ['comorbid_count','is_comorbid'])

corrs_matrix_between:  ['comorbid_count', 'is_comorbid']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
comorbid_count,0.179487,-0.00046,0.086354,0.117311,0.065174
is_comorbid,0.178101,0.022106,0.095233,0.116283,0.084209


### Колонка intake_medicines

In [1195]:
df.intake_medicines.value_counts(dropna=False)

1    995
0    598
Name: intake_medicines, dtype: int64

In [1196]:
check_corrs_with_target(df, col='intake_medicines')

corrs_matrix_between:  intake_medicines


arterial_hypertension    0.452289
ACD                      0.114712
IHD                      0.218472
heart_failure            0.176454
other_cardio_diseases    0.119922
Name: intake_medicines, dtype: float64

### Колонка trauma_last_year

In [1197]:
df.trauma_last_year.value_counts(dropna=False)

0    1508
1      85
Name: trauma_last_year, dtype: int64

In [1198]:
check_corrs_with_target(df, col='trauma_last_year')

corrs_matrix_between:  trauma_last_year


arterial_hypertension    0.011036
ACD                     -0.004355
IHD                     -0.017729
heart_failure           -0.001962
other_cardio_diseases    0.039166
Name: trauma_last_year, dtype: float64

### Колонка fractures

In [1199]:
df.fractures.value_counts(dropna=False)

0    1032
1     561
Name: fractures, dtype: int64

In [1200]:
check_corrs_with_target(df, col='fractures')

corrs_matrix_between:  fractures


arterial_hypertension    0.097213
ACD                      0.070100
IHD                      0.077409
heart_failure            0.058426
other_cardio_diseases    0.034900
Name: fractures, dtype: float64

# ADD FEATURE

In [1201]:
trauma_features = ['fracture_last_year', 'trauma_on_retire', 'fracture_on_retire']
df['fracture_last_year'] = df['trauma_last_year'] * df['fractures']
df['trauma_on_retire'] = df['trauma_last_year'] * df['retired']
df['fracture_on_retire'] = df['fracture_last_year'] * df['retired']


In [1202]:
check_corrs_with_target(df, col=trauma_features)

corrs_matrix_between:  ['fracture_last_year', 'trauma_on_retire', 'fracture_on_retire']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
fracture_last_year,-0.00386,0.013674,-0.004894,-0.009606,0.016512
trauma_on_retire,0.032709,0.010554,0.046307,0.034002,0.040642
fracture_on_retire,0.006801,0.028961,0.055685,0.034031,0.039511


### Колонка smoking

In [1203]:
df.smoking.value_counts(dropna=False)

Никогда не курил(а)    911
Курит                  360
Бросил(а)              321
Никогда не курил         1
Name: smoking, dtype: int64

In [1204]:
# Fix a data-entry inconsistency: one row uses 'Никогда не курил' instead of
# the canonical 'Никогда не курил(а)'.
# Assign the result back instead of calling replace(inplace=True) on the
# column selection: under pandas Copy-on-Write the in-place call operates on a
# temporary object and leaves `df` unchanged.
df['smoking'] = df['smoking'].replace('Никогда не курил', 'Никогда не курил(а)')

In [1205]:
# One-hot encode the smoking status and give the indicators English names.
# The rename uses an explicit category -> name mapping rather than assigning
# a list positionally, so a changed category order or an unexpected category
# cannot silently mislabel the columns.
smoking_name_map = {
    'Бросил(а)': 'previous_smoke',
    'Курит': 'smoking',
    'Никогда не курил(а)': 'never_smoke',
}
smoking_cols = ['previous_smoke', 'smoking', 'never_smoke']

smoking = pd.get_dummies(df.smoking)
print('before replace: ', list(smoking.columns))

# Fail loudly if the data contains a category the mapping does not know about.
unexpected = set(smoking.columns) - set(smoking_name_map)
if unexpected:
    raise ValueError(f'Unexpected smoking categories: {sorted(unexpected)}')

smoking = smoking.rename(columns=smoking_name_map)[smoking_cols]
print('after replace: ', list(smoking.columns))

before replace:  ['Бросил(а)', 'Курит', 'Никогда не курил(а)']
after replace:  ['previous_smoke', 'smoking', 'never_smoke']


In [1206]:
# Replace the raw text column with its indicator columns. The text column must
# be dropped before concatenating because one of the indicators is also named
# 'smoking'. `columns=` is used instead of `axis=True`, which only worked
# because True is coerced to axis 1.
df = pd.concat([df.drop(columns='smoking'), smoking], axis=1)

# ADD FEATURE

In [1207]:
# Ordinal smoking status built from the three indicator columns:
# 0 = never smoked, 1 = former smoker, 2 = current smoker.
df['smoker_score'] = df['never_smoke']*0 + df['previous_smoke'] + df['smoking'] * 2

In [1208]:
smoking_cols = ['previous_smoke', 'never_smoke', 'smoking', 'smoker_score']

In [1209]:
check_corrs_with_target(df, col=smoking_cols)

corrs_matrix_between:  ['previous_smoke', 'never_smoke', 'smoking', 'smoker_score']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
previous_smoke,0.014692,-0.054241,0.020759,-0.04527,-0.001829
never_smoke,0.069542,-0.003255,0.015962,0.094327,0.008136
smoking,-0.095603,0.055267,-0.038434,-0.067838,-0.00782
smoker_score,-0.090244,0.030081,-0.029108,-0.090927,-0.008845


### Колонка smoking_duration

In [1210]:
df.smoking_duration.value_counts(dropna=False)

NaN     912
20.0    111
18.0     94
16.0     72
17.0     66
15.0     51
19.0     33
25.0     31
30.0     23
14.0     18
12.0     15
21.0     15
22.0     14
7.0      13
24.0     11
40.0     11
23.0     10
13.0      9
10.0      9
35.0      9
6.0       5
27.0      5
45.0      5
26.0      5
8.0       4
28.0      4
29.0      4
33.0      3
36.0      3
41.0      3
50.0      3
31.0      2
39.0      2
47.0      2
52.0      1
44.0      1
38.0      1
32.0      1
56.0      1
42.0      1
55.0      1
58.0      1
11.0      1
51.0      1
34.0      1
37.0      1
53.0      1
5.0       1
54.0      1
43.0      1
Name: smoking_duration, dtype: int64

In [1211]:
df['smoking_duration_score'] = df['smoking_duration'] // 5

In [1212]:
df['smoking_duration_score'].value_counts(dropna=False)

NaN     912
3.0     316
4.0     161
2.0      52
5.0      49
6.0      30
1.0      23
8.0      17
7.0      16
10.0      7
9.0       7
11.0      3
Name: smoking_duration_score, dtype: int64

In [1213]:
# Missing duration is filled with 0. Assign back rather than using
# fillna(inplace=True) on a column selection, which does not update `df`
# under pandas Copy-on-Write.
df['smoking_duration'] = df['smoking_duration'].fillna(0)

In [1214]:
# Missing duration score is filled with 0. Assign back rather than using
# fillna(inplace=True) on a column selection, which does not update `df`
# under pandas Copy-on-Write.
df['smoking_duration_score'] = df['smoking_duration_score'].fillna(0)

In [1215]:
# Cap the 5-year duration bucket at 9; values of 9 or less are left as they are.
df['smoking_duration_score'] = df['smoking_duration_score'].clip(upper=9)

In [1216]:
check_corrs_with_target(df, col=['smoking_duration', 'smoking_duration_score'])

corrs_matrix_between:  ['smoking_duration', 'smoking_duration_score']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
smoking_duration,-0.036623,0.003147,-0.016511,-0.06434,0.020556
smoking_duration_score,-0.033445,3.3e-05,-0.017016,-0.059927,0.02171


### Колонка ciggaretes_per_day

In [1217]:
#df.ciggaretes_per_day = df.ciggaretes_per_day.replace(180,18)

In [1218]:
df['ciggaretes_per_day_log'] = np.log(df.ciggaretes_per_day + 1)

In [1219]:
df['ciggaretes_per_day_score'] = (df.ciggaretes_per_day // 10)

In [1220]:
df['ciggaretes_per_day_score_log'] = np.log(df.ciggaretes_per_day_score + 1)

In [1221]:
df.ciggaretes_per_day_score# = np.where(df.ciggaretes_per_day_score > 4, 4, df.ciggaretes_per_day_score)

0       2.0
1       NaN
2       NaN
3       1.0
4       NaN
       ... 
1588    NaN
1589    NaN
1590    NaN
1591    NaN
1592    NaN
Name: ciggaretes_per_day_score, Length: 1593, dtype: float64

In [1222]:
ciggaretes_features = ['ciggaretes_per_day', 'ciggaretes_per_day_score',
                       'ciggaretes_per_day_log', 'ciggaretes_per_day_score_log']

In [1223]:
check_corrs_with_target(df, col=ciggaretes_features)

corrs_matrix_between:  ['ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day_score_log']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
ciggaretes_per_day,0.052032,0.025677,0.058002,-0.006836,-0.072132
ciggaretes_per_day_score,0.049794,0.041378,0.061849,-0.007198,-0.088777
ciggaretes_per_day_log,0.041191,0.067571,0.06788,-0.012832,-0.05917
ciggaretes_per_day_score_log,0.034831,0.068718,0.076364,-0.012018,-0.095889


In [1224]:
# Missing score is filled with 0. Assign back rather than using
# fillna(inplace=True) on a column selection, which does not update `df`
# under pandas Copy-on-Write.
df['ciggaretes_per_day_score'] = df['ciggaretes_per_day_score'].fillna(0)

In [1225]:
# Fill missing values in every cigarette-derived column with 0 in a single
# assignment. The per-column fillna(inplace=True) loop acted on column
# selections, which does not update `df` under pandas Copy-on-Write.
df[ciggaretes_features] = df[ciggaretes_features].fillna(0)

In [1226]:
check_corrs_with_target(df, col=ciggaretes_features)

corrs_matrix_between:  ['ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day_score_log']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
ciggaretes_per_day,-0.029327,0.007107,0.015671,-0.069713,-0.041501
ciggaretes_per_day_score,-0.021092,0.016827,0.021948,-0.061864,-0.05238
ciggaretes_per_day_log,-0.057114,0.01071,0.003519,-0.088335,-0.026594
ciggaretes_per_day_score_log,-0.035174,0.027113,0.02498,-0.070049,-0.052868


# ADD FEATURE

In [1227]:
# Candidate interaction features combining smoking status, duration bucket and
# daily cigarette count.
# NOTE: several of them use exponentiation, and 0 ** 0 evaluates to 1 here, so
# rows where every input is 0 get the value 1.0 rather than 0 (visible in the
# frame printed further down). A row with a zero base and a positive exponent
# gets 0, so these features are not monotonic in smoking intensity.

# (status * duration bucket) raised to log(cigarettes per day + 1)
df['smoking_score_int_log'] = (
    df['smoker_score'] * df['smoking_duration_score']) ** df['ciggaretes_per_day_log']

# (status * duration bucket) raised to the cigarettes-per-day bucket
df['smoking_score_int_score'] = (
    df['smoker_score'] * df['smoking_duration_score']) ** df['ciggaretes_per_day_score']

# Plain product of status, cigarettes-per-day bucket and duration bucket
df['smoking_score_int_score_2'] = (
    df['smoker_score'] * df['ciggaretes_per_day_score']) * df['smoking_duration_score']

# status * (cigarettes-per-day bucket raised to the duration bucket)
df['smoking_score_int_score_3'] = (
    df['smoker_score'] * (df['ciggaretes_per_day_score'] ** df['smoking_duration_score']))

# status raised to log(cigarettes-per-day bucket + 1)
df['smoking_score_int_score_log'] = df['smoker_score'] ** df['ciggaretes_per_day_score_log']

In [1228]:
smoke_features = ['smoker_score',
                  'ciggaretes_per_day_log', 'smoking_score_int_score', 
                  'smoking_score_int_log', 'smoking_score_int_score_log', 'smoking_score_int_score_2', 'smoking_score_int_score_3']

In [1229]:
check_corrs_with_target(df, col=smoke_features)

corrs_matrix_between:  ['smoker_score', 'ciggaretes_per_day_log', 'smoking_score_int_score', 'smoking_score_int_log', 'smoking_score_int_score_log', 'smoking_score_int_score_2', 'smoking_score_int_score_3']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
smoker_score,-0.090244,0.030081,-0.029108,-0.090927,-0.008845
ciggaretes_per_day_log,-0.057114,0.01071,0.003519,-0.088335,-0.026594
smoking_score_int_score,0.059953,-0.016019,0.032694,-0.030612,-0.028614
smoking_score_int_log,0.011216,0.006253,-0.000963,-0.03812,-0.022421
smoking_score_int_score_log,-0.033554,0.037117,-0.023699,-0.07558,-0.057692
smoking_score_int_score_2,-0.022471,0.025437,0.006861,-0.07198,-0.056594
smoking_score_int_score_3,0.034595,-0.006888,0.086485,-0.01101,-0.010346


In [1230]:
df[smoke_features].isna().sum()

smoker_score                   0
ciggaretes_per_day_log         0
smoking_score_int_score        0
smoking_score_int_log          0
smoking_score_int_score_log    0
smoking_score_int_score_2      0
smoking_score_int_score_3      0
dtype: int64

### Колонка passive_smoking

In [1231]:
df.passive_smoking.value_counts(dropna=False)

0    1215
1     378
Name: passive_smoking, dtype: int64

In [1232]:
check_corrs_with_target(df, col='passive_smoking')

corrs_matrix_between:  passive_smoking


arterial_hypertension    0.011831
ACD                     -0.044094
IHD                      0.034347
heart_failure            0.028622
other_cardio_diseases   -0.001483
Name: passive_smoking, dtype: float64

### Колонка passive_smoking_frequency

In [1233]:
df.passive_smoking_frequency.value_counts(dropna=False)

NaN                       1216
1-2 раза в неделю          158
4 и более раз в день        75
2-3 раза в день             59
не менее 1 раза в день      47
3-6 раз в неделю            38
Name: passive_smoking_frequency, dtype: int64

In [1234]:
# Missing frequency is encoded as 0. Assign back rather than using
# fillna(inplace=True) on a column selection, which does not update `df`
# under pandas Copy-on-Write.
df['passive_smoking_frequency'] = df['passive_smoking_frequency'].fillna(0)

In [1235]:
#Словарь для замены текста на порядок:
passive_smoking_dict = {'1-2 раза в неделю':1, '3-6 раз в неделю':2, 'не менее 1 раза в день':3, '2-3 раза в день':4, '4 и более раз в день':5}

In [1236]:
# Convert the textual frequency into its ordinal code from passive_smoking_dict.
# The result is assigned back: replace(inplace=True) on a column selection
# does not update `df` under pandas Copy-on-Write.
# fillna(0) makes the step safe if missing values are still present, and
# astype(int) raises on any category the dictionary does not cover instead of
# leaving a silent object-dtype column.
df['passive_smoking_frequency'] = (
    df['passive_smoking_frequency']
    .replace(passive_smoking_dict)
    .fillna(0)
    .astype(int)
)

In [1237]:
check_corrs_with_target(df, col = 'passive_smoking_frequency')

corrs_matrix_between:  passive_smoking_frequency


arterial_hypertension    0.003508
ACD                     -0.046057
IHD                      0.038805
heart_failure            0.052193
other_cardio_diseases    0.017315
Name: passive_smoking_frequency, dtype: float64

### Колонка alcohol

In [1238]:
df.alcohol.value_counts(dropna=False)

употребляю в настоящее время    1046
никогда не употреблял            300
ранее употреблял                 247
Name: alcohol, dtype: int64

In [1239]:
# One-hot encode the alcohol status and give the indicators English names.
# The rename uses an explicit category -> name mapping rather than assigning
# a list positionally, so a changed category order or an unexpected category
# cannot silently mislabel the columns.
alcohol_name_map = {
    'никогда не употреблял': 'never_drink_alco',
    'ранее употреблял': 'previous_drink_alco',
    'употребляю в настоящее время': 'drink_alco_now',
}
alcohol_col = ['never_drink_alco', 'previous_drink_alco', 'drink_alco_now']

alcohol = pd.get_dummies(df.alcohol)
print('Before replace: ', list(alcohol.columns))

# Fail loudly if the data contains a category the mapping does not know about.
unexpected = set(alcohol.columns) - set(alcohol_name_map)
if unexpected:
    raise ValueError(f'Unexpected alcohol categories: {sorted(unexpected)}')

alcohol = alcohol.rename(columns=alcohol_name_map)[alcohol_col]
print('After replace: ', list(alcohol.columns))

Before replace:  ['никогда не употреблял', 'ранее употреблял', 'употребляю в настоящее время']
After replace:  ['never_drink_alco', 'previous_drink_alco', 'drink_alco_now']


In [1240]:
# Append the alcohol indicator columns, then remove the raw text column.
df = pd.concat([df, alcohol], axis=1).drop(columns='alcohol')

In [1241]:
check_corrs_with_target(df, col = alcohol_col)

corrs_matrix_between:  ['never_drink_alco', 'previous_drink_alco', 'drink_alco_now']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
never_drink_alco,0.049645,-0.042137,0.015082,0.059077,0.049747
previous_drink_alco,0.063277,0.060122,0.094059,0.097195,0.019313
drink_alco_now,-0.089252,-0.013329,-0.08568,-0.123345,-0.054927


In [1242]:
# Ordinal alcohol status built from the three indicator columns:
# 0 = never drank, 1 = drank previously, 2 = drinks now.
df['alcohol_score'] = df['never_drink_alco']*0 + df['previous_drink_alco'] + df['drink_alco_now']*2

In [1243]:
check_corrs_with_target(df, col = 'alcohol_score')

corrs_matrix_between:  alcohol_score


arterial_hypertension   -0.078958
ACD                      0.012484
IHD                     -0.059839
heart_failure           -0.104449
other_cardio_diseases   -0.057996
Name: alcohol_score, dtype: float64

### Колонка alcohol_duration

In [1244]:
df.alcohol_duration.value_counts(dropna=False)

NaN     304
20.0    299
18.0    278
17.0    131
16.0    111
25.0     88
19.0     66
21.0     57
15.0     50
22.0     42
30.0     28
23.0     25
14.0     18
35.0     15
24.0     13
26.0      8
40.0      8
12.0      5
45.0      5
29.0      5
33.0      4
27.0      4
37.0      4
13.0      4
10.0      3
50.0      3
32.0      3
6.0       2
28.0      2
46.0      1
36.0      1
9.0       1
48.0      1
63.0      1
43.0      1
44.0      1
60.0      1
Name: alcohol_duration, dtype: int64

In [1245]:
df['alcohol_duration_score'] = df['alcohol_duration'] // 5

In [1246]:
df['alcohol_duration_log'] = np.log(df['alcohol_duration'] + 1)

In [1247]:
# Cap the 5-year duration bucket at 9; smaller values and NaN are left as they are.
df['alcohol_duration_score'] = df['alcohol_duration_score'].clip(upper=9)

In [1248]:
alco_features = ['alcohol_duration', 'alcohol_duration_score', 'alcohol_duration_log']

In [1249]:
check_corrs_with_target(df, col = alco_features)

corrs_matrix_between:  ['alcohol_duration', 'alcohol_duration_score', 'alcohol_duration_log']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
alcohol_duration,0.101612,0.018077,0.034184,0.076321,0.016368
alcohol_duration_score,0.112572,0.021688,0.03608,0.082668,0.006369
alcohol_duration_log,0.108328,0.022571,0.034945,0.088098,0.008901


In [1250]:
# Missing alcohol duration and its derived columns are filled with 0.
# Each column is assigned back: fillna(inplace=True) on an attribute-accessed
# column does not update `df` under pandas Copy-on-Write.
for col in ('alcohol_duration', 'alcohol_duration_score', 'alcohol_duration_log'):
    df[col] = df[col].fillna(0)

In [1251]:
# Candidate interaction features between alcohol status and drinking duration.
# NOTE: 0 ** 0 evaluates to 1 here, so rows with alcohol_score == 0 and a zero
# duration get 1.0 in the two power-based features (visible in the frame
# printed further down).

# status raised to the duration bucket
df['alcohol_int_score_1'] = df['alcohol_score'] ** df['alcohol_duration_score']

# plain product of status and duration bucket
df['alcohol_int_score_2'] = (df['alcohol_score'] * df['alcohol_duration_score'])

# log-duration raised to the status
df['alcohol_int_score_3'] = df['alcohol_duration_log'] ** df['alcohol_score']


In [1252]:
new_alco_features = ['alcohol_int_score_1', 'alcohol_int_score_2', 'alcohol_int_score_3']

In [1253]:
check_corrs_with_target(df, col = new_alco_features)

corrs_matrix_between:  ['alcohol_int_score_1', 'alcohol_int_score_2', 'alcohol_int_score_3']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
alcohol_int_score_1,0.019045,0.011645,-0.015414,-0.012578,-0.015989
alcohol_int_score_2,-0.017408,0.032432,-0.036184,-0.051155,-0.039593
alcohol_int_score_3,-0.062109,0.013989,-0.064165,-0.091707,-0.044619


### Колонка time_fall_asleep

In [1254]:
# Parse 'HH:MM:SS' strings into datetimes in one vectorised call instead of a
# row-by-row strptime. With no date in the format, every value is placed on
# 1900-01-01, the same as strptime does.
df['time_fall_asleep'] = pd.to_datetime(df['time_fall_asleep'], format='%H:%M:%S')

In [1255]:
df.time_fall_asleep.value_counts()

1900-01-01 23:00:00    518
1900-01-01 00:00:00    343
1900-01-01 22:00:00    291
1900-01-01 01:00:00    113
1900-01-01 23:30:00     73
1900-01-01 21:00:00     63
1900-01-01 22:30:00     62
1900-01-01 02:00:00     34
1900-01-01 00:30:00     30
1900-01-01 21:30:00     22
1900-01-01 23:59:00      9
1900-01-01 01:30:00      7
1900-01-01 23:50:00      6
1900-01-01 03:00:00      4
1900-01-01 02:30:00      2
1900-01-01 20:00:00      2
1900-01-01 00:10:00      2
1900-01-01 01:20:00      1
1900-01-01 00:15:00      1
1900-01-01 12:00:00      1
1900-01-01 00:01:00      1
1900-01-01 20:30:00      1
1900-01-01 22:23:00      1
1900-01-01 21:50:00      1
1900-01-01 09:00:00      1
1900-01-01 00:05:00      1
1900-01-01 04:00:00      1
1900-01-01 00:00:30      1
1900-01-01 05:00:00      1
Name: time_fall_asleep, dtype: int64

### Статья о времени засыпания:
- https://doi.org/10.1093/ehjdh/ztab088

In [1256]:
# Indicator columns for the clock time of falling asleep.
# Both ends of each interval are inclusive, so the upper bounds run to
# hh:59:59. Bounds of hh:59 would leave every time with non-zero seconds in
# the last minute (e.g. 21:59:30) outside all bins.
# Times outside 20:00-04:00 belong to no bin and get 0 in every column.
asleep_time = df['time_fall_asleep'].dt.time

df['sleep_onset_early_22'] = asleep_time.between(time(20, 0), time(21, 59, 59)).astype(int)
df['sleep_onset_22'] = asleep_time.between(time(22, 0), time(22, 59, 59)).astype(int)
df['sleep_onset_23'] = asleep_time.between(time(23, 0), time(23, 59, 59)).astype(int)
df['sleep_onset_later_after_0'] = asleep_time.between(time(0, 0), time(4, 0)).astype(int)

# Coarser grouping: before 23:00 versus 23:00 and later.
df['early_onset'] = df['sleep_onset_early_22'] + df['sleep_onset_22']
df['lately_onset'] = df['sleep_onset_23'] + df['sleep_onset_later_after_0']

In [1257]:
sleep_onset_cols = ['sleep_onset_early_22', 'sleep_onset_22',
                    'sleep_onset_23', 'sleep_onset_later_after_0', 'early_onset', 'lately_onset']

In [1258]:
check_corrs_with_target(df, col=sleep_onset_cols)

corrs_matrix_between:  ['sleep_onset_early_22', 'sleep_onset_22', 'sleep_onset_23', 'sleep_onset_later_after_0', 'early_onset', 'lately_onset']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
sleep_onset_early_22,0.038944,0.038912,0.00707,0.010225,0.019608
sleep_onset_22,0.027138,0.064104,0.01271,-0.023735,0.012066
sleep_onset_23,-0.013236,-0.048355,0.004281,-0.002796,-0.034675
sleep_onset_later_after_0,-0.031067,-0.024444,-0.017935,0.019204,0.016151
early_onset,0.045152,0.079354,0.015401,-0.016712,0.021255
lately_onset,-0.047608,-0.078761,-0.014505,0.017477,-0.02049


### Колонка time_awakening

In [1259]:
# Parse 'HH:MM:SS' strings into datetimes in one vectorised call instead of a
# row-by-row strptime. With no date in the format, every value is placed on
# 1900-01-01, the same as strptime does.
df['time_awakening'] = pd.to_datetime(df['time_awakening'], format='%H:%M:%S')

In [1260]:
df.time_awakening.value_counts()

1900-01-01 06:00:00    396
1900-01-01 07:00:00    374
1900-01-01 08:00:00    208
1900-01-01 06:30:00    116
1900-01-01 09:00:00    105
1900-01-01 05:30:00     79
1900-01-01 05:00:00     73
1900-01-01 07:30:00     64
1900-01-01 10:00:00     43
1900-01-01 08:30:00     24
1900-01-01 04:00:00     23
1900-01-01 04:30:00     13
1900-01-01 09:30:00     11
1900-01-01 11:00:00      7
1900-01-01 06:15:00      6
1900-01-01 06:20:00      5
1900-01-01 12:00:00      5
1900-01-01 03:00:00      4
1900-01-01 07:20:00      4
1900-01-01 00:00:00      3
1900-01-01 06:03:00      3
1900-01-01 06:50:00      3
1900-01-01 06:40:00      3
1900-01-01 05:50:00      3
1900-01-01 10:30:00      2
1900-01-01 06:10:00      2
1900-01-01 06:45:00      2
1900-01-01 07:15:00      2
1900-01-01 03:40:00      1
1900-01-01 05:15:00      1
1900-01-01 06:05:00      1
1900-01-01 05:20:00      1
1900-01-01 08:03:00      1
1900-01-01 07:08:00      1
1900-01-01 05:40:00      1
1900-01-01 03:30:00      1
1900-01-01 04:40:00      1
1

In [1261]:
# Indicator columns for the clock time of awakening.
# Both ends of each interval are inclusive, so the upper bounds run to
# hh:59:59. Bounds of hh:59 would leave every time with non-zero seconds in
# the last minute (e.g. 06:59:30) outside all bins.
# Times outside 04:00-12:00 belong to no bin and get 0 in every column.
awake_time = df['time_awakening'].dt.time

df['sleep_upset_early_6'] = awake_time.between(time(4, 0), time(5, 59, 59)).astype(int)
df['sleep_upset_6'] = awake_time.between(time(6, 0), time(6, 59, 59)).astype(int)
df['sleep_upset_7'] = awake_time.between(time(7, 0), time(7, 59, 59)).astype(int)
df['sleep_upset_8'] = awake_time.between(time(8, 0), time(8, 59, 59)).astype(int)
df['sleep_upset_after_9'] = awake_time.between(time(9, 0), time(11, 59, 59)).astype(int)

# Coarser grouping. As written, 'early_upset' covers 06:00-07:59 only; the
# pre-06:00 bin is not part of it.
df['early_upset'] = df['sleep_upset_6'] + df['sleep_upset_7']
df['lately_upset'] = df['sleep_upset_8'] + df['sleep_upset_after_9']

In [1262]:
upset_features = ['sleep_upset_early_6', 'sleep_upset_6', 'sleep_upset_7',
                  'sleep_upset_8', 'sleep_upset_after_9', 'early_upset', 'lately_upset']

In [1263]:
check_corrs_with_target(df, col=upset_features)

corrs_matrix_between:  ['sleep_upset_early_6', 'sleep_upset_6', 'sleep_upset_7', 'sleep_upset_8', 'sleep_upset_after_9', 'early_upset', 'lately_upset']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
sleep_upset_early_6,0.059959,-0.029609,-0.008345,0.039253,0.031984
sleep_upset_6,-0.035852,0.004285,0.00789,-0.013799,-0.012014
sleep_upset_7,-0.036765,0.031326,-0.008729,0.035565,-0.038354
sleep_upset_8,0.028764,-0.007423,0.03822,0.006858,0.013119
sleep_upset_after_9,0.009271,-0.023065,-0.025812,-0.070504,0.021491
early_upset,-0.068202,0.032787,-0.000377,0.019227,-0.046669
lately_upset,0.030296,-0.022269,0.013601,-0.043607,0.025885


In [1264]:
# Whole hours between falling asleep and waking up.
# Both times are on the same dummy date, so the difference is negative when
# sleep crosses midnight. `.seconds` is the non-negative seconds component of
# the timedelta, which turns that negative difference into the elapsed time
# modulo 24 hours.
df['sleeptime'] = (df.time_awakening - df.time_fall_asleep).apply(lambda x: x.seconds//3600)

In [1265]:
# Durations above 14 hours are reduced by 12 hours.
# NOTE(review): presumably this corrects entries recorded with a 12-hour clock
# mix-up (e.g. 12:00 entered for midnight) - confirm against the raw data.
df['sleeptime'] = np.where(df['sleeptime'] > 14, df['sleeptime'] - 12, df['sleeptime'])

In [1266]:
df['sleeptime'].value_counts()

7     457
8     453
6     230
9     214
10     93
5      67
11     32
4      22
12     11
3       8
0       4
13      2
Name: sleeptime, dtype: int64

In [1267]:
# Binary flags for short (under 6 h) and long (over 9 h) sleep.
sleep_hours = df['sleeptime']
df['low_amount_sleep'] = (sleep_hours < 6).astype(int)
df['high_amout_sleep'] = (sleep_hours > 9).astype(int)

In [1268]:
sleeptime_features = ['sleeptime', 'low_amount_sleep', 'high_amout_sleep']

In [1269]:
check_corrs_with_target(df, col=sleeptime_features)

corrs_matrix_between:  ['sleeptime', 'low_amount_sleep', 'high_amout_sleep']


Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
sleeptime,0.035149,0.037473,-0.006052,-0.061437,-0.010208
low_amount_sleep,-0.013547,0.010021,-0.016293,0.058856,0.056018
high_amout_sleep,0.02435,0.024505,-0.015854,-0.031118,0.042978


In [1270]:
# Remove the raw datetime columns now that the sleep features are derived from them.
df = df.drop(columns=['time_fall_asleep', 'time_awakening'])

### Колонка midday_sleep

In [1271]:
df.midday_sleep.value_counts(dropna=False)

0    1232
1     361
Name: midday_sleep, dtype: int64

In [1272]:
check_corrs_with_target(df, col='midday_sleep')

corrs_matrix_between:  midday_sleep


arterial_hypertension    0.075877
ACD                      0.070712
IHD                      0.049903
heart_failure            0.002388
other_cardio_diseases   -0.056408
Name: midday_sleep, dtype: float64

### Колонка sport_clubs

In [1273]:
df.sport_clubs.value_counts(dropna=False)

0    1491
1     102
Name: sport_clubs, dtype: int64

In [1274]:
check_corrs_with_target(df, col='sport_clubs')

corrs_matrix_between:  sport_clubs


arterial_hypertension   -0.011300
ACD                      0.024806
IHD                      0.000465
heart_failure            0.020270
other_cardio_diseases    0.031178
Name: sport_clubs, dtype: float64

### Колонка religion_clubs

In [1275]:
df.religion_clubs.value_counts(dropna=False)

0    1557
1      36
Name: religion_clubs, dtype: int64

In [1276]:
check_corrs_with_target(df, col = 'religion_clubs')

corrs_matrix_between:  religion_clubs


arterial_hypertension   -0.031819
ACD                     -0.032523
IHD                     -0.014801
heart_failure            0.018303
other_cardio_diseases    0.049226
Name: religion_clubs, dtype: float64

## Подготовка данных

In [1277]:
# Проверка на наличие пропусков в итоговом наборе данных
df.isna().sum().sum()

0

In [1278]:
df

Unnamed: 0,sex,education,job,retired,stop_work_due_disease,diabetes,hepatitis,oncology,chronic_lung_disease,bronchial_asthma,tuberculosis,hiv/aids,intake_medicines,trauma_last_year,fractures,smoking_duration,ciggaretes_per_day,passive_smoking,passive_smoking_frequency,alcohol_duration,midday_sleep,sport_clubs,religion_clubs,train,unknown_id_feature,id_1,id_2,id_3,id_4,married,divorced,widowed,cohabited,single,separated,asian,european,other,is_russian,christian,household,military,senior_specialist,farmer/fisherman,low_qualified,hard_industry,top_management,service,crafters_other,office,junior_specialists,comorbid_count,is_comorbid,fracture_last_year,trauma_on_retire,fracture_on_retire,previous_smoke,smoking,never_smoke,smoker_score,smoking_duration_score,ciggaretes_per_day_log,ciggaretes_per_day_score,ciggaretes_per_day_score_log,smoking_score_int_log,smoking_score_int_score,smoking_score_int_score_2,smoking_score_int_score_3,smoking_score_int_score_log,never_drink_alco,previous_drink_alco,drink_alco_now,alcohol_score,alcohol_duration_score,alcohol_duration_log,alcohol_int_score_1,alcohol_int_score_2,alcohol_int_score_3,sleep_onset_early_22,sleep_onset_22,sleep_onset_23,sleep_onset_later_after_0,early_onset,lately_onset,sleep_upset_early_6,sleep_upset_6,sleep_upset_7,sleep_upset_8,sleep_upset_after_9,early_upset,lately_upset,sleeptime,low_amount_sleep,high_amout_sleep
0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,15.0,20.0,0,0,18.0,0,0,0,1,2,0,1,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,3.0,3.044522,2.0,1.098612,233.937014,36.0,12.0,16.0,2.141486,0,0,1,2,3.0,2.944439,8.0,6.0,8.669721,0,1,0,0,1,0,0,1,0,0,0,1,0,8,0,0
1,0,4,0,0,0,1,0,0,0,0,0,0,1,0,1,0.0,0.0,0,0,0.0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0.0,0.000000,0.0,0.000000,1.000000,1.0,0.0,0.0,1.000000,1,0,0,0,0.0,0.000000,1.0,0.0,1.000000,0,0,0,1,0,1,1,0,0,0,0,0,0,4,1,0
2,0,4,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,1,1,17.0,0,0,0,1,3,0,0,1,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.000000,0.0,0.000000,1.000000,1.0,0.0,0.0,1.000000,0,0,1,2,3.0,2.890372,8.0,6.0,8.354249,0,0,1,0,0,1,0,0,1,0,0,1,0,8,0,0
3,1,2,1,0,0,0,0,1,0,0,0,0,1,0,0,12.0,10.0,1,2,13.0,0,0,0,1,2,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1,2.0,2.397895,1.0,0.693147,5.270337,2.0,2.0,1.0,1.000000,0,0,1,2,2.0,2.639057,4.0,4.0,6.964624,0,0,1,0,0,1,0,0,1,0,0,1,0,8,0,0
4,0,2,0,0,1,1,1,0,0,0,0,0,1,0,1,0.0,0.0,1,3,16.0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,2,1,0,0,0,0,0,1,0,0.0,0.000000,0.0,0.000000,1.000000,1.0,0.0,0.0,1.000000,0,0,1,2,3.0,2.833213,8.0,6.0,8.027098,0,0,1,0,0,1,0,1,0,0,0,1,0,7,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1588,0,4,1,0,0,1,0,0,0,0,0,0,1,1,1,0.0,0.0,0,0,30.0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0.0,0.000000,0.0,0.000000,1.000000,1.0,0.0,0.0,1.000000,0,0,1,2,6.0,3.433987,64.0,12.0,11.792268,0,0,1,0,0,1,0,1,0,0,0,1,0,7,0,0
1589,0,4,0,1,0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0,0,0.0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.000000,0.0,0.000000,1.000000,1.0,0.0,0.0,1.000000,1,0,0,0,0.0,0.000000,1.0,0.0,1.000000,0,0,0,1,0,1,0,0,0,0,1,0,1,9,0,0
1590,0,3,1,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,1,3,25.0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0.0,0.000000,0.0,0.000000,1.000000,1.0,0.0,0.0,1.000000,0,1,0,1,5.0,3.258097,1.0,5.0,3.258097,0,1,0,0,1,0,0,0,1,0,0,1,0,9,0,0
1591,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,1,4,0.0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.000000,0.0,0.000000,1.000000,1.0,0.0,0.0,1.000000,1,0,0,0,0.0,0.000000,1.0,0.0,1.000000,0,0,0,1,0,1,0,0,0,1,0,0,1,8,0,0


In [1279]:
# Hand-weighted risk score: a linear combination of existing columns.
# sport_clubs carries a negative weight, so it lowers the score.
cardio_weights = {
    'sex': 1,
    'smoking': 2,
    'previous_smoke': 1,
    'diabetes': 2,
    'sport_clubs': -3,
    'retired': 3,
    'passive_smoking': 1,
}
df['cardio_score'] = (
    df[list(cardio_weights)].mul(pd.Series(cardio_weights), axis=1).sum(axis=1)
)

In [1280]:
df.corr()

Unnamed: 0,sex,education,job,retired,stop_work_due_disease,diabetes,hepatitis,oncology,chronic_lung_disease,bronchial_asthma,tuberculosis,hiv/aids,intake_medicines,trauma_last_year,fractures,smoking_duration,ciggaretes_per_day,passive_smoking,passive_smoking_frequency,alcohol_duration,midday_sleep,sport_clubs,religion_clubs,train,unknown_id_feature,id_1,id_2,id_3,id_4,married,divorced,widowed,cohabited,single,separated,asian,european,other,is_russian,christian,household,military,senior_specialist,farmer/fisherman,low_qualified,hard_industry,top_management,service,crafters_other,office,junior_specialists,comorbid_count,is_comorbid,fracture_last_year,trauma_on_retire,fracture_on_retire,previous_smoke,smoking,never_smoke,smoker_score,smoking_duration_score,ciggaretes_per_day_log,ciggaretes_per_day_score,ciggaretes_per_day_score_log,smoking_score_int_log,smoking_score_int_score,smoking_score_int_score_2,smoking_score_int_score_3,smoking_score_int_score_log,never_drink_alco,previous_drink_alco,drink_alco_now,alcohol_score,alcohol_duration_score,alcohol_duration_log,alcohol_int_score_1,alcohol_int_score_2,alcohol_int_score_3,sleep_onset_early_22,sleep_onset_22,sleep_onset_23,sleep_onset_later_after_0,early_onset,lately_onset,sleep_upset_early_6,sleep_upset_6,sleep_upset_7,sleep_upset_8,sleep_upset_after_9,early_upset,lately_upset,sleeptime,low_amount_sleep,high_amout_sleep,cardio_score
sex,1.000000,-0.091298,0.132431,-0.130662,0.030411,-0.043548,-0.028260,-0.055943,-0.010112,-0.068898,0.038126,0.015642,-0.160650,0.003998,0.131355,0.254779,0.479741,-0.008747,-0.046456,0.078465,0.056894,-0.013533,-0.034481,0.006269,0.533440,-0.566081,0.572391,-0.007482,0.015642,0.288875,-0.127840,-0.232984,0.007721,-0.076521,-0.028313,0.099624,-0.074647,-0.017058,-0.077662,-0.193820,-0.072714,0.024101,-0.128253,0.028105,0.042986,0.221272,0.079671,-0.094775,0.205339,-0.073882,-0.095700,-0.078749,-0.057228,0.010227,-0.053695,-0.051894,0.274652,0.235106,-0.421452,0.372516,0.241871,0.500839,0.474197,0.525572,0.084114,0.044536,0.377452,-0.015423,0.315199,-0.184105,0.092314,0.081219,0.139758,0.065578,0.151828,-0.043811,0.042760,0.062221,-0.039064,-0.015037,0.074347,-0.047275,-0.033976,0.030527,0.049634,0.017062,0.004007,-0.067864,-0.013824,0.020284,-0.065039,-0.051370,-0.000654,-0.034875,0.295923
education,-0.091298,1.000000,0.124734,-0.103879,-0.082145,-0.082017,0.006857,-0.009262,-0.043254,-0.048303,-0.028894,-0.056855,-0.042997,0.014600,-0.064441,-0.084878,-0.129003,-0.036542,-0.043267,-0.010456,-0.114852,0.135635,-0.009404,-0.054529,-0.070363,0.070862,-0.068295,-0.015347,-0.007659,0.032081,0.049075,-0.099546,-0.022615,0.048786,-0.009383,-0.055319,0.080116,-0.061283,0.064933,0.032393,-0.058901,0.041823,0.547824,-0.041293,-0.315200,-0.169593,0.149983,-0.090821,-0.132210,-0.012554,-0.051671,-0.075278,-0.056373,0.022207,-0.021616,0.004447,-0.036858,-0.100062,0.114478,-0.119567,-0.078674,-0.141800,-0.128052,-0.145868,-0.029819,-0.006493,-0.117245,-0.005875,-0.114888,-0.039443,-0.068472,0.084671,0.070326,-0.014515,0.027039,-0.072664,0.014689,0.052570,-0.018415,-0.054707,-0.000615,0.058446,-0.060200,0.060982,-0.104595,0.015477,0.034404,0.001790,0.030757,0.046792,0.023224,0.008942,-0.027608,-0.001499,-0.235916
job,0.132431,0.124734,1.000000,-0.741556,-0.257180,-0.130979,0.010397,-0.127971,-0.043893,-0.078942,-0.040315,-0.038648,-0.229758,-0.000869,-0.012198,0.051843,0.069567,0.067377,0.053416,0.020377,-0.159606,0.028898,0.003832,-0.047746,0.054003,-0.068323,0.079818,-0.059614,-0.003061,0.026138,0.052588,-0.151971,0.087549,0.006986,-0.018282,0.030703,-0.039568,0.025050,-0.013442,-0.003658,-0.301664,-0.048539,0.054820,-0.026108,0.082588,0.071835,0.090659,0.043974,-0.077498,0.021643,0.013263,-0.145035,-0.121559,0.026444,-0.143129,-0.109796,-0.013519,0.110037,-0.082064,0.105163,0.050636,0.082951,0.063534,0.069183,-0.013571,-0.023473,0.057684,-0.027299,0.076165,-0.057695,-0.108349,0.130097,0.106618,0.016508,0.048841,-0.034929,0.053754,0.094363,-0.012768,0.062980,0.018018,-0.068555,0.051893,-0.052827,0.100406,0.139710,0.012259,-0.187289,-0.115798,0.147133,-0.234443,-0.133863,0.037010,-0.102743,-0.503324
retired,-0.130662,-0.103879,-0.741556,1.000000,-0.088034,0.164985,-0.046133,0.124887,0.029061,0.055876,0.001451,-0.024120,0.226344,0.000645,0.026871,-0.084222,-0.126424,-0.043130,-0.031741,0.000652,0.105708,0.015046,-0.003541,0.049180,-0.098962,0.102642,-0.103249,0.008954,-0.024120,-0.058835,-0.010542,0.187402,-0.102983,-0.040037,0.032717,-0.061977,0.063108,-0.019890,0.048496,0.009188,0.199678,0.038765,-0.028006,0.011842,-0.074361,-0.047158,-0.084290,-0.055865,0.087422,-0.013863,0.012902,0.124568,0.114032,-0.019060,0.193012,0.148062,-0.038898,-0.125541,0.137671,-0.146435,-0.079190,-0.142216,-0.116126,-0.124948,-0.045889,-0.019708,-0.109224,-0.017517,-0.113429,0.058984,0.029286,-0.070890,-0.071712,0.009297,-0.039628,0.067761,-0.001411,-0.034871,0.022579,-0.035714,-0.024269,0.047987,-0.021564,0.024373,-0.085227,-0.108215,0.009654,0.161510,0.052065,-0.096295,0.168350,0.110526,-0.044063,0.068807,0.674486
stop_work_due_disease,0.030411,-0.082145,-0.257180,-0.088034,1.000000,0.028537,0.055936,0.056833,0.046771,0.053006,0.011190,0.070955,0.119102,-0.006025,0.014219,-0.022864,0.044506,-0.006155,-0.016520,-0.021440,0.046725,-0.050237,0.020821,-0.024980,0.023987,-0.029880,0.033892,-0.016762,-0.008365,-0.030867,-0.005066,0.023538,0.012332,0.025403,-0.010248,-0.029180,-0.004026,0.052372,0.010829,-0.029991,0.030328,-0.017784,-0.033834,-0.029791,0.017223,0.002588,-0.029233,-0.006793,0.039965,-0.004808,0.006128,0.101872,0.069015,-0.016525,-0.030980,-0.023765,0.028523,-0.026749,-0.000513,-0.013274,-0.025474,0.019509,0.049971,0.043594,0.053556,0.104011,0.041373,-0.005065,0.013915,-0.005885,0.123970,-0.089653,-0.050911,-0.024635,-0.005978,-0.044038,-0.066832,-0.084107,0.015984,-0.011260,0.000261,-0.003127,-0.002255,-0.003016,-0.019560,-0.031588,-0.021692,0.061313,0.037870,-0.050725,0.076722,0.042092,-0.003755,0.027192,-0.036188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
lately_upset,-0.065039,0.023224,-0.234443,0.168350,0.076722,0.018401,0.022381,0.047873,-0.011953,0.030734,-0.011539,0.020282,0.064318,0.010318,0.011451,-0.006000,-0.033540,-0.017518,-0.024696,-0.004485,0.010803,0.001914,-0.000605,0.040149,-0.024518,0.026010,-0.028036,0.020179,-0.020564,-0.055450,0.045316,0.008307,0.024004,0.017649,-0.025194,0.023256,-0.024432,0.008840,-0.022144,0.038546,0.086382,0.033471,-0.010703,-0.049962,-0.010958,-0.027780,0.009151,0.001711,-0.007487,-0.008858,-0.006343,0.041766,0.035820,-0.014183,0.024690,-0.000401,-0.010111,-0.015983,0.021710,-0.021155,-0.003856,-0.031261,-0.036654,-0.040415,0.001193,0.040231,-0.021535,-0.014517,-0.024325,-0.024113,0.023272,0.002115,0.013188,-0.005136,0.020217,-0.053041,-0.007491,0.000722,-0.078115,-0.076927,-0.105902,0.213233,-0.111419,0.110400,-0.216620,-0.413608,-0.361113,0.713632,0.591988,-0.735308,1.000000,0.530815,-0.139036,0.433276,0.099755
sleeptime,-0.051370,0.008942,-0.133863,0.110526,0.042092,0.016396,0.003467,0.046329,-0.052277,0.018954,-0.019838,0.034579,0.073513,-0.028335,-0.001428,-0.069777,-0.123463,0.036295,0.014410,0.064498,-0.029131,0.025084,-0.032650,0.022669,-0.011070,0.016400,-0.022393,0.039566,-0.012887,0.021523,0.017535,-0.020440,-0.044741,0.016044,-0.006096,-0.007009,-0.001564,0.013673,0.042115,0.033326,0.049795,0.039877,-0.056517,-0.001944,0.000995,-0.008508,0.021024,0.008389,-0.021512,-0.009004,0.041526,0.010378,0.021487,-0.026782,-0.002152,-0.011327,0.004920,-0.098943,0.079657,-0.098085,-0.065489,-0.100650,-0.128735,-0.124040,-0.083749,-0.068204,-0.130282,-0.009823,-0.111542,-0.084798,0.004503,0.066389,0.081769,0.062425,0.082511,-0.003720,0.064497,0.069357,0.204314,0.275703,0.018991,-0.358253,0.360545,-0.357339,-0.310104,-0.289416,0.049977,0.258609,0.452524,-0.235247,0.530815,1.000000,-0.542898,0.596133,0.029754
low_amount_sleep,-0.000654,-0.027608,0.037010,-0.044063,-0.003755,0.003181,-0.021026,-0.019406,0.078248,0.026510,-0.015198,-0.009225,-0.011094,0.007001,0.007719,0.068412,0.122837,-0.017961,-0.014246,-0.014466,0.056068,-0.015437,0.012437,-0.008145,-0.011888,0.007837,-0.003872,-0.018484,-0.009225,-0.054478,0.017503,0.038517,0.036082,-0.010593,-0.011302,-0.032179,0.020599,0.011937,-0.023112,-0.016728,-0.059903,-0.019612,0.016775,-0.012126,-0.016757,0.008382,-0.020983,0.015405,0.040281,0.009012,0.001125,0.025713,0.006279,-0.010104,-0.034163,-0.026207,0.023427,0.062673,-0.071978,0.075056,0.065274,0.082194,0.122932,0.091577,0.060567,0.110249,0.103969,-0.005880,0.066222,0.032808,0.052236,-0.066832,-0.056337,-0.009593,-0.026126,0.001896,-0.031832,-0.050257,-0.052075,-0.132877,-0.124279,0.264903,-0.149985,0.145029,0.217212,0.032440,-0.110316,-0.107692,-0.072561,-0.070255,-0.139036,-0.542898,1.000000,-0.080128,0.002951
high_amout_sleep,-0.034875,-0.001499,-0.102743,0.068807,0.027192,0.036707,-0.019968,-0.002549,0.002149,0.052013,-0.041194,-0.010919,0.054402,-0.013539,0.020563,-0.046225,-0.071785,0.043300,0.019023,0.035896,0.009206,0.001494,-0.031813,0.010335,-0.000310,0.005590,-0.011931,0.041262,-0.010919,0.026430,-0.002772,-0.014271,-0.034155,0.013834,-0.013377,-0.001449,-0.016285,0.032034,0.000538,0.013273,0.011927,0.036335,-0.005709,-0.020931,-0.015965,0.007483,-0.021569,0.001607,-0.006957,-0.002360,0.030519,0.022839,0.023308,-0.013130,-0.023150,-0.008640,0.012196,-0.070362,0.049596,-0.065516,-0.044772,-0.069228,-0.077872,-0.090125,-0.051802,-0.008983,-0.091117,-0.008001,-0.093386,-0.051308,-0.014781,0.053513,0.057485,0.033165,0.050300,-0.011384,0.045824,0.055733,0.158288,0.152092,-0.062043,-0.145453,0.222257,-0.220506,-0.115020,-0.210174,-0.132056,0.106195,0.490051,-0.326177,0.433276,0.596133,-0.080128,1.000000,0.034175


In [1281]:
#num_cols = ['education', 'smoking_duration', #'ciggaretes_per_day',
            #'passive_smoking_frequency', 'alcohol_duration', 'sleeptime', ]
   #         #'cardio_score', 'comorbid_count', 'risk_job' ]

In [1282]:
# Standardise every column except the 'train' marker. Each column is handled
# on its own, with a StandardScaler fitted on that column only.
# NOTE(review): the scalers are fitted on all rows of df, i.e. on the training
# and the test part together - confirm this is intended.
scaler = StandardScaler()  # instantiated here; the loop below does not use it

for col in df.columns:
    if col == 'train':
        continue
    # StandardScaler expects a 2-D input, hence the (n_rows, 1) reshape.
    column_2d = df[col].values.reshape(-1, 1)
    df[col] = StandardScaler().fit_transform(column_2d)

In [1283]:
# Split the combined frame back into two parts by the 'train' marker column
# (rows with 1 go to `train`, rows with 0 go to `X_test`); the marker itself is dropped.
train = df.loc[df['train'] == 1].drop(columns='train')
X_test = df.loc[df['train'] == 0].drop(columns='train')

#### Так как исходная модель (as is) лучше всего предсказывает артериальную гипертензию, сначала сосредоточимся на предсказании этого показателя. В дальнейшем, обучив модель и "подставив" предсказанные показатели, продолжим предсказывать другие.

In [1284]:
# Pairwise correlations between the columns of target_train (the target diagnoses).
target_train.corr()

Unnamed: 0,arterial_hypertension,ACD,IHD,heart_failure,other_cardio_diseases
arterial_hypertension,1.0,0.12272,0.296755,0.266396,0.086785
ACD,0.12272,1.0,0.015392,-0.002087,-0.030536
IHD,0.296755,0.015392,1.0,0.522954,0.049801
heart_failure,0.266396,-0.002087,0.522954,1.0,0.138133
other_cardio_diseases,0.086785,-0.030536,0.049801,0.138133,1.0


#### Исходя из корреляций между АГ и другими болезнями, у метода хорошие перспективы

# Поиск лучшей метрики с помощью логистической регрессии

In [1285]:
# Hyper-parameter grids to be searched with GridSearchCV.
# Value lists shared by the logistic-regression and the LinearSVC grids.
C = [1, 0.1, 0.01]
iters = [50, 100, 200, 300]
epsilon_stop = [0.0001]

# Logistic regression: one sub-grid per penalty, because every penalty is
# supported only by some of the solvers.
param_grid_logreg = [
    {'C': C,
     'penalty': ['l1'],
     'solver': ['saga', 'liblinear'],
     'class_weight':[None, 'balanced'],
     'multi_class': ['auto', 'ovr'],
     'max_iter': iters,
     'tol': epsilon_stop},
    {'C': C,
     'penalty': ['l2'],
     'solver': ['newton-cg', 'liblinear', 'lbfgs', 'saga'],
     'class_weight':[None, 'balanced'],
     'multi_class': ['auto', 'ovr'],
     'max_iter': iters,
     'tol':epsilon_stop},
    {'penalty': ['none'],
     'solver': ['saga'],
     'class_weight':[None, 'balanced'],
     'multi_class': ['auto', 'ovr'],
     'max_iter': iters,
     'tol':epsilon_stop}]

# LinearSVC hyper-parameters.
# `loss` is kept as a module-level name in case other cells refer to it.
loss = ['hinge', 'squared_hinge']

param_grid_svc = [
    # penalty='l1' is only supported together with loss='squared_hinge' and
    # dual=False; every other l1 combination fails inside LinearSVC.fit.
    {'C': C,
     'penalty': ['l1'],
     'loss': ['squared_hinge'],
     'dual': [False],
     'class_weight':[None, 'balanced'],
     # LinearSVC accepts 'ovr' / 'crammer_singer' here; 'auto' is a
     # LogisticRegression-only value and is rejected.
     'multi_class': ['ovr'],
     'max_iter': iters,
     'tol': epsilon_stop},
    # penalty='l2' with the estimator's default loss.
    {'C': C,
     'penalty': ['l2'],
     'class_weight':[None, 'balanced'],
     'multi_class': ['ovr'],
     'max_iter': iters,
     'tol':epsilon_stop}]

In [1286]:
def reduce_low_corrs_with_target(df, target, threshold=0.05, return_predicted=False, X_test=None):
    """Keep only the columns whose absolute correlation with `target` exceeds `threshold`.

    The kept columns are ordered by ascending absolute correlation with the
    target (the order produced by ``sort_values``).

    Parameters
    ----------
    df : DataFrame that contains the `target` column.
    target : name of the target column.
    threshold : a column is kept only if |corr(column, target)| > threshold.
    return_predicted : when True, the same column selection (without the
        target) is also applied to `X_test`.
    X_test : frame to filter alongside `df`; only read when `return_predicted`
        is True.

    Returns
    -------
    The reduced `df`, or a ``(reduced df, reduced X_test)`` tuple when
    `return_predicted` is True.
    """
    abs_corr_with_target = df.corr().abs()[target].sort_values()
    keep = abs_corr_with_target > threshold
    reduced = df[keep[keep].index]

    if return_predicted != True:
        return reduced

    # X_test has no target column, so remove it from the selection first.
    feature_keep = keep.drop(target)
    X_test = X_test[feature_keep[feature_keep].index]
    return reduced, X_test

In [1287]:
def _grid_search_best_params(estimator, param_grid, X, y, **search_kwargs):
    """Run GridSearchCV, print the best estimator's parameters and return them as a dict."""
    grid_search = GridSearchCV(estimator, param_grid, **search_kwargs)
    grid_search.fit(X, y)

    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(best_parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))
    return best_parameters


# Train and evaluate a classifier with optional feature reduction, grid search
# and class re-balancing; `return_predicted` selects what the function returns.
def fit_model(df, target, model='logreg', scoring='f1',
               split=True,
               reduce_corrs=False, threshold_reduce=0.05, 
               grid=False, oversampling=False,  param_grid=None,
               oversampler='random', resample_strategy='auto', 
               undersampler=None, undersample_strategy=None, 
               return_predicted=False, X_test=None, 
               random_state=None):
    """Fit a classifier for `target` on `df` and report its quality.

    Steps, in order:
      1. optionally drop features weakly correlated with the target;
      2. optionally hold out 20% of the rows (stratified) for validation;
      3. build the estimator, optionally with grid-searched hyper-parameters;
      4. print repeated stratified 5-fold CV estimates of f1 and macro recall,
         computed on the full, un-resampled X, y;
      5. optionally re-balance the training part;
      6. fit, print macro recall, and return the model or its predictions.

    Parameters
    ----------
    df : DataFrame with the features and the `target` column.
    target : name of the target column.
    model : 'logreg' (LogisticRegression), 'svc' (LinearSVC),
        'kneigh' (KNeighborsClassifier) or 'svc2' (SVC).
    scoring : metric used by the grid search only; the CV report always uses
        'f1' and 'recall_macro'.
    split : hold out 20% of the rows for validation; otherwise train on all
        rows and report recall on the training data itself.
    reduce_corrs, threshold_reduce : keep only features whose absolute
        correlation with the target exceeds `threshold_reduce`.
    grid, param_grid : run GridSearchCV over `param_grid`; only honoured for
        'logreg' and 'svc'.
    oversampling : re-balance the training part before fitting.
    oversampler : 'random', 'smote', or an object exposing `fit_resample`.
    resample_strategy : `sampling_strategy` of the over-sampler.
    undersampler, undersample_strategy : 'random' additionally applies a
        RandomUnderSampler; only evaluated when `oversampling` is enabled.
    return_predicted : return predictions instead of the fitted model.
    X_test : frame to predict on; required when `return_predicted` is set.
    random_state : seed for the split, the CV folds and the samplers.

    Returns
    -------
    The fitted classifier, or `(predicted_train, predicted_test)` - the
    predictions for every row of `df` and for `X_test` - when
    `return_predicted` is set.

    Raises
    ------
    ValueError : unknown `model`, or `return_predicted` without `X_test`.
    """
    if return_predicted and X_test is None:
        raise ValueError('X_test must be provided when return_predicted=True')

    if reduce_corrs:
        if return_predicted:
            df, X_test = reduce_low_corrs_with_target(
                df=df, target=target, threshold=threshold_reduce,
                return_predicted=True, X_test=X_test)
        else:
            # In this mode the helper returns a single frame, so it must not
            # be unpacked into two names.
            df = reduce_low_corrs_with_target(
                df=df, target=target, threshold=threshold_reduce)

    print('Used features: ', df.columns)

    X = df.drop(target, axis=1)
    y = df[target]

    if split:
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.20, stratify=y, random_state=random_state)
    else:
        X_train = X.copy()
        y_train = y.copy()

    # NOTE: the grid search is fitted on the full X, y, so with split=True the
    # hold-out rows take part in the hyper-parameter selection.
    if model == 'logreg':
        clf = LogisticRegression()
        if grid:
            best_parameters = _grid_search_best_params(
                clf, param_grid, X, y, scoring=scoring, n_jobs=1, cv=5)
            clf = LogisticRegression(**best_parameters)
    elif model == 'svc':
        clf = LinearSVC()
        if grid:
            best_parameters = _grid_search_best_params(
                clf, param_grid, X, y, scoring=scoring)
            clf = LinearSVC(**best_parameters)
    elif model == 'kneigh':
        clf = KNeighborsClassifier(n_neighbors=5, weights='distance', n_jobs=-1)
    elif model == 'svc2':
        clf = SVC()
    else:
        raise ValueError(
            "Unknown model %r; expected 'logreg', 'svc', 'kneigh' or 'svc2'" % (model,))

    # Cross-validated estimate of the metrics
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=random_state)
    scores_f1 = cross_val_score(clf, X, y, scoring='f1', cv=cv, n_jobs=-1)
    print('f1 expected: ', np.mean(scores_f1))
    scores_recall_macro = cross_val_score(clf, X, y, scoring='recall_macro', cv=cv, n_jobs=-1)
    print('recall_macro expected: ', np.mean(scores_recall_macro))

    # Re-balance the training part only; validation rows are never resampled.
    if oversampling:
        if oversampler == 'random':
            oversampler = RandomOverSampler(sampling_strategy=resample_strategy, random_state=random_state)
        elif oversampler == 'smote':
            oversampler = SMOTE(sampling_strategy=resample_strategy, random_state=random_state)

        X_train, y_train = oversampler.fit_resample(X_train, y_train)

        if undersampler == 'random':
            undersampler = RandomUnderSampler(sampling_strategy=undersample_strategy, random_state=random_state)
            X_train, y_train = undersampler.fit_resample(X_train, y_train)

    clf.fit(X_train, y_train)
    if split:
        y_pred = clf.predict(X_valid)
        print('valid_recall_score: ', recall_score(y_valid, y_pred, average='macro'))
    else:
        # No hold-out part: the score is measured on the training rows themselves.
        y_pred = clf.predict(X)
        print('self_recall_score: ', recall_score(y, y_pred, average='macro'))

    if return_predicted:
        predicted_train = clf.predict(X)
        predicted_test = clf.predict(X_test)
        return predicted_train, predicted_test
    return clf
    

In [1288]:
# Column-wise concat of the `train` features with the arterial_hypertension target column.
df_ah = pd.concat([train, target_train['arterial_hypertension']], axis=1)

In [1289]:
# Run the multicollinearity eliminator on df_ah (target 'arterial_hypertension',
# threshold 0.6). The result is passed to fit_model as a DataFrame below.
# NOTE(review): the call prints its intermediate state, which produces the long output.
ah = MultiCollinearityEliminator(df_ah, 'arterial_hypertension', 0.6).autoEliminateMulticollinearity()

['retired', 'job', 'cardio_score', 'fracture_last_year', 'trauma_last_year', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'smoking_score_int_score_log', 'passive_smoking_frequency', 'passive_smoking', 'never_drink_alco', 'alcohol_duration', 'alcohol_score', 'alcohol_duration_score', 'alcohol_duration_log', 'alcohol_int_score_2', 'alcohol_int_score_3', 'id_1', 'unknown_id_feature', 'id_2', 'european', 'asian', 'is_russian', 'is_comorbid', 'comorbid_count', 'fracture_on_retire', 'trauma_on_retire', 'smoking', 'smoking_score_int_log', 'smoking_score_int_score_3', 'smoking_score_int_score', 'drink_alco_now', 'previous_drink_alco', 'early_onset', 'sleep_onset_22', 'lately_onset', 'lately_upset', 'sleep_upset_8', 'early_upset', 'high_amout_sleep', 'sleeptime'] 

                           arterial_hypertension
asian          

['retired', 'job', 'cardio_score', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'smoking_score_int_score_log', 'id_1', 'unknown_id_feature', 'id_2', 'is_comorbid', 'comorbid_count', 'smoking', 'smoking_score_int_log', 'smoking_score_int_score_3', 'smoking_score_int_score', 'drink_alco_now', 'never_drink_alco', 'alcohol_score', 'alcohol_duration_score', 'alcohol_duration_log', 'alcohol_int_score_2', 'alcohol_int_score_3', 'previous_drink_alco', 'early_onset', 'sleep_onset_22', 'lately_onset', 'lately_upset', 'sleep_upset_8', 'early_upset', 'high_amout_sleep', 'sleeptime'] 

                     arterial_hypertension
id_4                              0.003029
other                             0.006621
low_qualified                     0.008645
service                           0.009041
sleep_upset_after_9               0.

['retired', 'job', 'cardio_score', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_log', 'id_1', 'unknown_id_feature', 'id_2', 'is_comorbid', 'comorbid_count', 'smoking', 'smoking_score_int_score_3', 'smoking_score_int_score', 'drink_alco_now', 'never_drink_alco', 'alcohol_score', 'alcohol_duration_log', 'alcohol_int_score_3', 'previous_drink_alco', 'early_onset', 'sleep_onset_22', 'lately_onset', 'lately_upset', 'sleep_upset_8', 'early_upset'] 

                     arterial_hypertension
id_4                              0.003029
other                             0.006621
low_qualified                     0.008645
service                           0.009041
sleep_upset_after_9               0.009271
...                                    ...
cardio_score                      0.200888
job                               0.224856
diabetes                    

['retired', 'job', 'cardio_score', 'never_smoke', 'smoking_duration', 'smoker_score', 'ciggaretes_per_day_log', 'id_1', 'unknown_id_feature', 'id_2', 'is_comorbid', 'comorbid_count', 'smoking', 'smoking_score_int_score_log', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_3', 'smoking_score_int_score', 'drink_alco_now', 'never_drink_alco', 'alcohol_score', 'alcohol_int_score_3', 'previous_drink_alco', 'lately_onset', 'early_onset'] 

                     arterial_hypertension
id_4                              0.003029
other                             0.006621
low_qualified                     0.008645
service                           0.009041
sleep_upset_after_9               0.009271
...                                    ...
cardio_score                      0.200888
job                               0.224856
diabetes                          0.230891
retired                           0.255916
intake_medicines                  0.452289

[76 rows x 1 columns] 

id_4 

other

['retired', 'job', 'cardio_score', 'id_1', 'unknown_id_feature', 'id_2', 'is_comorbid', 'comorbid_count', 'never_smoke', 'smoking', 'smoker_score', 'drink_alco_now', 'previous_drink_alco', 'alcohol_score', 'alcohol_int_score_3'] 

                     arterial_hypertension
id_4                              0.003029
other                             0.006621
low_qualified                     0.008645
service                           0.009041
sleep_upset_after_9               0.009271
...                                    ...
cardio_score                      0.200888
job                               0.224856
diabetes                          0.230891
retired                           0.255916
intake_medicines                  0.452289

[69 rows x 1 columns] 

id_4 

other 

low_qualified 

service 

sleep_upset_after_9 

trauma_last_year 

sport_clubs 

passive_smoking 

sleep_onset_23 

low_amount_sleep 

previous_smoke 

divorced 

top_management 

is_russian 

alcohol_int_score_1 

['retired', 'job', 'cardio_score', 'id_1', 'unknown_id_feature', 'is_comorbid', 'comorbid_count'] 

                     arterial_hypertension
id_4                              0.003029
other                             0.006621
low_qualified                     0.008645
service                           0.009041
sleep_upset_after_9               0.009271
...                                    ...
cardio_score                      0.200888
job                               0.224856
diabetes                          0.230891
retired                           0.255916
intake_medicines                  0.452289

[63 rows x 1 columns] 

id_4 

other 

low_qualified 

service 

sleep_upset_after_9 

trauma_last_year 

sport_clubs 

passive_smoking 

sleep_onset_23 

low_amount_sleep 

previous_smoke 

divorced 

top_management 

is_russian 

alcohol_int_score_1 

military 

office 

tuberculosis 

hepatitis 

sleep_onset_later_after_0 

religion_clubs 

trauma_on_retire 

sleeptime 

sleep_

[] 



In [1290]:
# Arterial hypertension model: SVC ('svc2') on the features whose |corr| with the
# target exceeds 0.09, trained on the randomly over-sampled part of an 80/20 split.
# With return_predicted=True the call returns predictions for every row of `ah`
# and for X_test rather than the fitted model.
ah_train, ah_pred = fit_model(ah, 'arterial_hypertension',
            split=True,
            reduce_corrs=True, threshold_reduce=0.09, 
            model='svc2', grid=False, param_grid=None, scoring='f1',
            oversampling=True, oversampler='random', 
            return_predicted=True, X_test=X_test, 
            random_state=42)

Used features:  Index(['oncology', 'bronchial_asthma', 'hard_industry', 'smoking', 'fractures',
       'unknown_id_feature', 'education', 'sex', 'comorbid_count', 'widowed',
       'diabetes', 'retired', 'intake_medicines', 'arterial_hypertension'],
      dtype='object')
f1 expected:  0.7309662156233884
recall_macro expected:  0.7352273919394583
valid_recall_score:  0.7764375413086584


In [1291]:
# NOTE: the printed value is sum(ah_pred), not a length
# (presumably the number of positive predictions if labels are 0/1 - confirm).
print('len of predicted: ', sum(ah_pred))
# Add the test predictions to X_test as a new column and overwrite the
# arterial_hypertension column of target_train with the model's predictions.
# NOTE(review): this replaces the original labels in place; the cell is not
# idempotent with respect to the earlier cells that read target_train.
X_test['arterial_hypertension'] = ah_pred
target_train['arterial_hypertension'] = ah_train

len of predicted:  330


In [1292]:
# Features plus the arterial_hypertension column (overwritten with predictions above) and the IHD target.
df_ihd = pd.concat([train, target_train[['arterial_hypertension','IHD']]], axis=1)


In [1293]:
# Multicollinearity elimination for the IHD target, threshold 0.7.
ihd = MultiCollinearityEliminator(df_ihd, 'IHD', 0.7).autoEliminateMulticollinearity()

['retired', 'job', 'arterial_hypertension', 'intake_medicines', 'fracture_last_year', 'trauma_last_year', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'passive_smoking_frequency', 'passive_smoking', 'never_drink_alco', 'alcohol_duration', 'alcohol_score', 'alcohol_duration_score', 'alcohol_duration_log', 'alcohol_int_score_2', 'alcohol_int_score_3', 'id_1', 'unknown_id_feature', 'id_2', 'european', 'asian', 'is_comorbid', 'comorbid_count', 'fracture_on_retire', 'trauma_on_retire', 'smoking', 'smoking_score_int_score_log', 'drink_alco_now', 'early_onset', 'sleep_onset_22', 'lately_onset', 'lately_upset', 'sleep_upset_8', 'early_upset'] 

                            IHD
office                 0.000286
early_upset            0.000377
sport_clubs            0.000465
smoking_score_int_log  0.000963
european               0.0

['retired', 'job', 'arterial_hypertension', 'intake_medicines', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'passive_smoking_frequency', 'passive_smoking', 'id_1', 'unknown_id_feature', 'id_2', 'is_comorbid', 'comorbid_count', 'fracture_on_retire', 'trauma_on_retire', 'smoking', 'smoking_score_int_score_log', 'alcohol_score', 'never_drink_alco', 'alcohol_duration_score', 'alcohol_int_score_2', 'alcohol_int_score_3', 'drink_alco_now', 'early_onset', 'sleep_onset_22', 'lately_onset', 'lately_upset', 'sleep_upset_8'] 

                            IHD
office                 0.000286
sport_clubs            0.000465
smoking_score_int_log  0.000963
chronic_lung_disease   0.002166
single                 0.004129
...                         ...
cardio_score           0.203711
job                    0.206260
intake_medicines       0.218472
retired                0.221894
arterial_hy

['retired', 'job', 'arterial_hypertension', 'intake_medicines', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'passive_smoking_frequency', 'passive_smoking', 'is_comorbid', 'comorbid_count', 'fracture_on_retire', 'trauma_on_retire', 'smoking', 'smoking_score_int_score_log', 'alcohol_score', 'drink_alco_now', 'alcohol_int_score_2', 'alcohol_int_score_3'] 

                            IHD
office                 0.000286
sport_clubs            0.000465
smoking_score_int_log  0.000963
chronic_lung_disease   0.002166
single                 0.004129
...                         ...
cardio_score           0.203711
job                    0.206260
intake_medicines       0.218472
retired                0.221894
arterial_hypertension  0.262085

[81 rows x 1 columns] 

office 

sport_clubs 

smoking_score_int_log 

chronic_lung_disease 

single 

senior_specialist 

sleep_onset_23 

chri

['retired', 'job', 'arterial_hypertension', 'intake_medicines', 'passive_smoking_frequency', 'passive_smoking', 'is_comorbid', 'comorbid_count', 'fracture_on_retire', 'trauma_on_retire', 'alcohol_score', 'drink_alco_now', 'alcohol_int_score_2', 'alcohol_int_score_3'] 

                            IHD
office                 0.000286
sport_clubs            0.000465
smoking_score_int_log  0.000963
chronic_lung_disease   0.002166
single                 0.004129
...                         ...
cardio_score           0.203711
job                    0.206260
intake_medicines       0.218472
retired                0.221894
arterial_hypertension  0.262085

[74 rows x 1 columns] 

office 

sport_clubs 

smoking_score_int_log 

chronic_lung_disease 

single 

senior_specialist 

sleep_onset_23 

christian 

id_3 

sex 

sleeptime 

sleep_onset_early_22 

sleep_upset_6 

sleep_upset_early_6 

sleep_upset_7 

low_qualified 

crafters_other 

unknown_id_feature 

religion_clubs 

early_onset 

alcoho

['retired', 'job', 'arterial_hypertension', 'intake_medicines'] 

                            IHD
office                 0.000286
sport_clubs            0.000465
smoking_score_int_log  0.000963
chronic_lung_disease   0.002166
single                 0.004129
...                         ...
cardio_score           0.203711
job                    0.206260
intake_medicines       0.218472
retired                0.221894
arterial_hypertension  0.262085

[68 rows x 1 columns] 

office 

sport_clubs 

smoking_score_int_log 

chronic_lung_disease 

single 

senior_specialist 

sleep_onset_23 

christian 

id_3 

sex 

sleeptime 

sleep_onset_early_22 

sleep_upset_6 

sleep_upset_early_6 

sleep_upset_7 

low_qualified 

crafters_other 

unknown_id_feature 

religion_clubs 

early_onset 

alcohol_int_score_1 

high_amout_sleep 

low_amount_sleep 

id_4 

separated 

hiv/aids 

trauma_last_year 

sleep_onset_later_after_0 

cohabited 

hepatitis 

previous_smoke 

hard_industry 

asian 

ciggaret

In [1294]:
#stable 0.738 - recall
# IHD model: grid-searched logistic regression on the features whose |corr| with
# IHD exceeds 0.07. split=False, so the model is trained on all rows (after
# random over-sampling) and the printed recall is measured on the training data.
# Returns predictions for every row of `ihd` and for X_test.
ihd_train, ihd_pred = fit_model(ihd, 'IHD', split=False,
                            reduce_corrs=True, threshold_reduce=0.07,
                            model='logreg', grid=True, param_grid=param_grid_logreg, scoring='f1',
                            oversampling=True, oversampler='random', resample_strategy='auto',
                            #undersampler='random', undersample_strategy=0.7,
                            return_predicted=True, X_test=X_test, random_state=42)


Used features:  Index(['farmer/fisherman', 'fractures', 'tuberculosis',
       'stop_work_due_disease', 'drink_alco_now', 'smoking_score_int_score_3',
       'widowed', 'previous_drink_alco', 'is_comorbid', 'diabetes',
       'cardio_score', 'retired', 'arterial_hypertension', 'IHD'],
      dtype='object')
	C: 0.1
	class_weight: 'balanced'
	dual: False
	fit_intercept: True
	intercept_scaling: 1
	l1_ratio: None
	max_iter: 50
	multi_class: 'auto'
	n_jobs: None
	penalty: 'l2'
	random_state: None
	solver: 'newton-cg'
	tol: 0.0001
	verbose: 0
	warm_start: False
f1 expected:  0.36832446678968883
recall_macro expected:  0.7038108720447305
self_recall_score:  0.7287905676927157


In [1295]:
# Add the IHD predictions to X_test and overwrite the IHD column of
# target_train with the model's predictions (the original labels are replaced in place).
X_test['IHD'] = ihd_pred
target_train['IHD'] = ihd_train

In [1296]:
# Features plus the arterial_hypertension and IHD columns (overwritten with predictions above) and the heart_failure target.
df_hf = pd.concat([train, target_train[['arterial_hypertension','IHD','heart_failure']]], axis=1)


In [1297]:
# Multicollinearity elimination for the heart_failure target, threshold 0.6.
hf = MultiCollinearityEliminator(df_hf, 'heart_failure', 0.6).autoEliminateMulticollinearity()

['retired', 'job', 'cardio_score', 'IHD', 'arterial_hypertension', 'intake_medicines', 'fracture_last_year', 'trauma_last_year', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'smoking_score_int_score_log', 'passive_smoking_frequency', 'passive_smoking', 'never_drink_alco', 'alcohol_duration', 'alcohol_score', 'alcohol_duration_score', 'alcohol_duration_log', 'alcohol_int_score_2', 'alcohol_int_score_3', 'id_1', 'unknown_id_feature', 'id_2', 'european', 'asian', 'is_russian', 'is_comorbid', 'comorbid_count', 'fracture_on_retire', 'trauma_on_retire', 'smoking', 'smoking_score_int_log', 'smoking_score_int_score_3', 'smoking_score_int_score', 'drink_alco_now', 'previous_drink_alco', 'early_onset', 'sleep_onset_22', 'lately_onset', 'lately_upset', 'sleep_upset_8', 'early_upset', 'high_amout_sleep', 'sleeptime'] 

            

['retired', 'job', 'cardio_score', 'IHD', 'arterial_hypertension', 'intake_medicines', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'smoking_score_int_score_log', 'passive_smoking_frequency', 'passive_smoking', 'never_drink_alco', 'alcohol_duration', 'alcohol_score', 'alcohol_duration_log', 'alcohol_int_score_2', 'alcohol_int_score_3', 'id_1', 'unknown_id_feature', 'id_2', 'is_comorbid', 'comorbid_count', 'fracture_on_retire', 'trauma_on_retire', 'smoking', 'smoking_score_int_log', 'drink_alco_now', 'previous_drink_alco', 'early_onset', 'sleep_onset_22', 'lately_onset', 'lately_upset', 'early_upset', 'high_amout_sleep', 'sleeptime'] 

                       heart_failure
midday_sleep                0.002388
sleep_onset_23              0.002796
hepatitis                   0.004706
divorced                    0.005838
fra

['retired', 'job', 'cardio_score', 'IHD', 'arterial_hypertension', 'intake_medicines', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'smoking_score_int_score_log', 'id_2', 'id_1', 'is_comorbid', 'comorbid_count', 'fracture_on_retire', 'trauma_on_retire', 'smoking', 'smoking_score_int_log', 'drink_alco_now', 'never_drink_alco', 'alcohol_score', 'alcohol_duration_log', 'alcohol_int_score_2', 'alcohol_int_score_3', 'previous_drink_alco', 'high_amout_sleep', 'sleeptime'] 

                       heart_failure
midday_sleep                0.002388
sleep_onset_23              0.002796
hepatitis                   0.004706
divorced                    0.005838
fracture_last_year          0.009606
...                              ...
retired                     0.175824
job                         0.176134
intake_medicines         

['retired', 'job', 'cardio_score', 'IHD', 'arterial_hypertension', 'intake_medicines', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'smoking_score_int_score_log', 'is_comorbid', 'comorbid_count', 'smoking', 'drink_alco_now', 'never_drink_alco', 'alcohol_score', 'alcohol_int_score_3', 'previous_drink_alco'] 

                       heart_failure
midday_sleep                0.002388
sleep_onset_23              0.002796
hepatitis                   0.004706
divorced                    0.005838
fracture_last_year          0.009606
...                              ...
retired                     0.175824
job                         0.176134
intake_medicines            0.176454
arterial_hypertension       0.218182
IHD                         0.286860

[78 rows x 1 columns] 

midday_sleep 

sleep_onset_23 

hepatitis 

divorced

['retired', 'job', 'cardio_score', 'IHD', 'arterial_hypertension', 'intake_medicines', 'never_smoke', 'ciggaretes_per_day', 'smoker_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'smoking_score_int_score_log', 'is_comorbid', 'comorbid_count', 'drink_alco_now', 'previous_drink_alco', 'alcohol_score', 'alcohol_int_score_3'] 

                       heart_failure
midday_sleep                0.002388
sleep_onset_23              0.002796
hepatitis                   0.004706
divorced                    0.005838
fracture_last_year          0.009606
...                              ...
retired                     0.175824
job                         0.176134
intake_medicines            0.176454
arterial_hypertension       0.218182
IHD                         0.286860

[73 rows x 1 columns] 

midday_sleep 

sleep_onset_23 

hepatitis 

divorced 

fracture_last_year 

sleep_onset_early_22 

id_3 

alcohol_int_score_1 

sleep_upset_6 

hiv/aids 

se

['retired', 'job', 'cardio_score', 'IHD', 'arterial_hypertension', 'intake_medicines', 'is_comorbid', 'comorbid_count', 'drink_alco_now', 'previous_drink_alco', 'alcohol_score', 'alcohol_int_score_3'] 

                       heart_failure
midday_sleep                0.002388
sleep_onset_23              0.002796
hepatitis                   0.004706
divorced                    0.005838
fracture_last_year          0.009606
...                              ...
retired                     0.175824
job                         0.176134
intake_medicines            0.176454
arterial_hypertension       0.218182
IHD                         0.286860

[67 rows x 1 columns] 

midday_sleep 

sleep_onset_23 

hepatitis 

divorced 

fracture_last_year 

sleep_onset_early_22 

id_3 

alcohol_int_score_1 

sleep_upset_6 

hiv/aids 

separated 

hard_industry 

religion_clubs 

sleep_onset_later_after_0 

sport_clubs 

christian 

office 

sleep_onset_22 

military 

junior_specialists 

other 

smoking_

['arterial_hypertension', 'intake_medicines', 'IHD'] 

                       heart_failure
midday_sleep                0.002388
sleep_onset_23              0.002796
hepatitis                   0.004706
divorced                    0.005838
fracture_last_year          0.009606
...                              ...
widowed                     0.132974
job                         0.176134
intake_medicines            0.176454
arterial_hypertension       0.218182
IHD                         0.286860

[61 rows x 1 columns] 

midday_sleep 

sleep_onset_23 

hepatitis 

divorced 

fracture_last_year 

sleep_onset_early_22 

id_3 

alcohol_int_score_1 

sleep_upset_6 

hiv/aids 

separated 

hard_industry 

religion_clubs 

sleep_onset_later_after_0 

sport_clubs 

christian 

office 

sleep_onset_22 

military 

junior_specialists 

other 

smoking_score_int_score 

fracture_on_retire 

is_russian 

sleep_upset_7 

service 

senior_specialist 

single 

sleep_upset_early_6 

lately_upset 

onco

In [1298]:
# Class balance of the heart_failure target, as fractions of all rows.
hf['heart_failure'].value_counts(normalize=True)

0    0.899476
1    0.100524
Name: heart_failure, dtype: float64

In [1299]:
# Heart failure model: grid-searched logistic regression (default scoring 'f1') on
# the features whose |corr| with the target exceeds 0.06. split=False, so it is
# trained on all rows after random over-sampling and scored on the training data.
# Returns predictions for every row of `hf` and for X_test.
hf_train, hf_pred = fit_model(hf, 'heart_failure', 
                            reduce_corrs=True, threshold_reduce=0.06, split=False,
                            model = 'logreg', grid=True, param_grid=param_grid_logreg,
                            oversampling=True, oversampler='random', resample_strategy='auto',
                            return_predicted=True, X_test=X_test,
                            random_state=42)


Used features:  Index(['farmer/fisherman', 'id_4', 'sleeptime', 'sleep_upset_after_9',
       'education', 'sex', 'never_smoke', 'household', 'diabetes',
       'bronchial_asthma', 'comorbid_count', 'drink_alco_now', 'widowed',
       'job', 'IHD', 'heart_failure'],
      dtype='object')
	C: 1
	class_weight: 'balanced'
	dual: False
	fit_intercept: True
	intercept_scaling: 1
	l1_ratio: None
	max_iter: 50
	multi_class: 'auto'
	n_jobs: None
	penalty: 'l1'
	random_state: None
	solver: 'liblinear'
	tol: 0.0001
	verbose: 0
	warm_start: False
f1 expected:  0.32928060541438225
recall_macro expected:  0.7042441860465116
self_recall_score:  0.7283481276678307


In [1300]:
# Add the heart_failure predictions to X_test and overwrite the heart_failure
# column of target_train with the model's predictions (labels are replaced in place).
X_test['heart_failure'] = hf_pred
target_train['heart_failure'] = hf_train 

# Предсказание ACD

In [1301]:
# Features plus the three columns overwritten with predictions above and the ACD target.
df_acd = pd.concat([train, target_train[['arterial_hypertension','IHD','heart_failure', 'ACD']]], axis=1)


In [1302]:
# Multicollinearity elimination for the ACD target, threshold 0.8.
acd = MultiCollinearityEliminator(df_acd, 'ACD', 0.8).autoEliminateMulticollinearity()

['fracture_last_year', 'trauma_last_year', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'passive_smoking_frequency', 'passive_smoking', 'never_drink_alco', 'alcohol_duration', 'alcohol_duration_score', 'alcohol_duration_log', 'alcohol_int_score_2', 'id_1', 'unknown_id_feature', 'id_2', 'european', 'asian', 'is_comorbid', 'comorbid_count', 'smoking', 'smoking_score_int_score_log', 'alcohol_score', 'drink_alco_now', 'alcohol_int_score_3', 'early_onset', 'sleep_onset_22', 'lately_onset', 'heart_failure', 'IHD'] 

                             ACD
smoking_duration_score  0.000033
comorbid_count          0.000460
chronic_lung_disease    0.001782
office                  0.002407
smoking_duration        0.003147
...                          ...
IHD                     0.089094
stop_work_due_disease   0.097109
arterial_hypertens

['passive_smoking_frequency', 'passive_smoking', 'never_drink_alco', 'alcohol_duration', 'alcohol_duration_score', 'alcohol_duration_log', 'alcohol_int_score_2', 'id_1', 'unknown_id_feature', 'id_2', 'european', 'asian', 'smoker_score', 'smoking', 'ciggaretes_per_day_score_log', 'ciggaretes_per_day_score', 'smoking_score_int_score_2', 'smoking_score_int_score_log', 'alcohol_int_score_3', 'drink_alco_now', 'early_onset', 'sleep_onset_22', 'lately_onset', 'heart_failure', 'IHD'] 

                            ACD
chronic_lung_disease   0.001782
office                 0.002407
oncology               0.003835
sleep_upset_6          0.004285
smoking_score_int_log  0.006253
...                         ...
IHD                    0.089094
stop_work_due_disease  0.097109
arterial_hypertension  0.102496
crafters_other         0.109801
intake_medicines       0.114712

[89 rows x 1 columns] 

chronic_lung_disease 

office 

oncology 

sleep_upset_6 

smoking_score_int_log 

smoking_score_int_score_

['passive_smoking_frequency', 'passive_smoking', 'never_drink_alco', 'alcohol_duration', 'alcohol_duration_score', 'alcohol_duration_log', 'id_1', 'unknown_id_feature', 'id_2', 'early_onset', 'sleep_onset_22', 'lately_onset', 'heart_failure', 'IHD'] 

                            ACD
chronic_lung_disease   0.001782
office                 0.002407
oncology               0.003835
sleep_upset_6          0.004285
smoking_score_int_log  0.006253
...                         ...
IHD                    0.089094
stop_work_due_disease  0.097109
arterial_hypertension  0.102496
crafters_other         0.109801
intake_medicines       0.114712

[82 rows x 1 columns] 

chronic_lung_disease 

office 

oncology 

sleep_upset_6 

smoking_score_int_log 

smoking_score_int_score_3 

sleep_upset_8 

is_russian 

household 

separated 

hiv/aids 

id_4 

divorced 

low_amount_sleep 

trauma_on_retire 

alcohol_int_score_1 

widowed 

fracture_last_year 

hepatitis 

military 

smoking_score_int_score 

hard_i

['id_1', 'unknown_id_feature', 'id_2', 'lately_onset', 'early_onset'] 

                            ACD
chronic_lung_disease   0.001782
office                 0.002407
oncology               0.003835
sleep_upset_6          0.004285
smoking_score_int_log  0.006253
...                         ...
IHD                    0.089094
stop_work_due_disease  0.097109
arterial_hypertension  0.102496
crafters_other         0.109801
intake_medicines       0.114712

[76 rows x 1 columns] 

chronic_lung_disease 

office 

oncology 

sleep_upset_6 

smoking_score_int_log 

smoking_score_int_score_3 

sleep_upset_8 

is_russian 

household 

separated 

hiv/aids 

id_4 

divorced 

low_amount_sleep 

trauma_on_retire 

alcohol_int_score_1 

widowed 

fracture_last_year 

hepatitis 

military 

smoking_score_int_score 

hard_industry 

other 

married 

service 

senior_specialist 

is_comorbid 

lately_upset 

sleep_upset_after_9 

sleep_onset_later_after_0 

high_amout_sleep 

sport_clubs 

cohabited 

In [1303]:
# Print the result of get_top_abs_correlations (helper defined elsewhere in the notebook)
# for the frame `hf`, with 15 as the second argument.
# NOTE(review): this section works on the ACD target, but the frame passed here is `hf` —
# confirm that `acd` was not intended.
print(get_top_abs_correlations(hf, 15))

# Absolute correlation of every column of `acd` with the 'ACD' column, strongest first.
print(acd.corr().abs()['ACD'].sort_values(ascending=False))

sleep_upset_after_9   lately_upset                 0.574470
previous_smoke        never_smoke                  0.574012
sleep_onset_23        sleep_onset_later_after_0    0.570364
sex                   id_2                         0.567111
job                   IHD                          0.561911
lately_upset          sleeptime                    0.549760
hepatitis             comorbid_count               0.546712
education             senior_specialist            0.542282
fracture_last_year    fracture_on_retire           0.527405
sleeptime             low_amount_sleep             0.525741
married               widowed                      0.502923
diabetes              comorbid_count               0.495416
chronic_lung_disease  comorbid_count               0.485648
married               divorced                     0.465079
sleep_upset_after_9   sleeptime                    0.454157
dtype: float64
ACD                      1.000000
intake_medicines         0.114712
crafters_other   

In [1304]:
# Class balance of the 'ACD' column as fractions (normalize=True); in the saved output the
# positive class is about 4% of the rows.
acd['ACD'].value_counts(normalize=True)

0    0.957068
1    0.042932
Name: ACD, dtype: float64

In [1305]:
# NOTE(review): disabled experiment — the whole cell is commented out. It would stack df_acd and
# X_test (tagged by a `train` flag), add `have_cardio_probs` as the sum of three diagnosis
# columns, scale it, and split the frames back by the flag.
# NOTE(review): if re-enabled, the column selected on the `scaler.fit_transform` line is a pandas
# Series; it likely needs `.values.reshape(-1, 1)` — verify before use.
#df_acd['train'] = 1
#X_test['train'] = 0
#acd_ench = pd.concat([df_acd, X_test.drop('other_cardio_diseases', axis=1)])
#acd_ench['have_cardio_probs'] = acd_ench['arterial_hypertension'] + acd_ench['IHD'] + acd_ench['heart_failure']
#acd_ench['have_cardio_probs'] = scaler.fit_transform(acd_ench['have_cardio_probs'].reshape(-1,1))

#acd = acd_ench[acd_ench.train == 1]
#X_test = acd_ench[acd_ench.train == 0]

In [1306]:
# NOTE(review): the only assignment of `acd_ench` visible in this part of the notebook is inside
# the commented-out cell above, so on a fresh kernel (Restart & Run All) this cell would
# presumably raise NameError — confirm whether it is still needed.
acd_ench

Unnamed: 0,sex,education,job,retired,stop_work_due_disease,diabetes,hepatitis,oncology,chronic_lung_disease,bronchial_asthma,tuberculosis,hiv/aids,intake_medicines,trauma_last_year,fractures,smoking_duration,ciggaretes_per_day,passive_smoking,passive_smoking_frequency,alcohol_duration,midday_sleep,sport_clubs,religion_clubs,unknown_id_feature,id_1,id_2,id_3,id_4,married,divorced,widowed,cohabited,single,separated,asian,european,other,is_russian,christian,household,military,senior_specialist,farmer/fisherman,low_qualified,hard_industry,top_management,service,crafters_other,office,junior_specialists,comorbid_count,is_comorbid,fracture_last_year,trauma_on_retire,fracture_on_retire,previous_smoke,smoking,never_smoke,smoker_score,smoking_duration_score,ciggaretes_per_day_log,ciggaretes_per_day_score,ciggaretes_per_day_score_log,smoking_score_int_log,smoking_score_int_score,smoking_score_int_score_2,smoking_score_int_score_3,smoking_score_int_score_log,never_drink_alco,previous_drink_alco,drink_alco_now,alcohol_score,alcohol_duration_score,alcohol_duration_log,alcohol_int_score_1,alcohol_int_score_2,alcohol_int_score_3,sleep_onset_early_22,sleep_onset_22,sleep_onset_23,sleep_onset_later_after_0,early_onset,lately_onset,sleep_upset_early_6,sleep_upset_6,sleep_upset_7,sleep_upset_8,sleep_upset_after_9,early_upset,lately_upset,sleeptime,low_amount_sleep,high_amout_sleep,cardio_score,arterial_hypertension,IHD,heart_failure,ACD,train,have_cardio_probs
0,1.534172,-1.603579,0.917398,-0.680301,-0.235936,-0.324780,-0.360260,-0.217571,-0.290536,-0.20292,-0.133759,-0.035455,-1.289915,-0.237415,-0.737295,0.561143,1.343394,-0.557773,-0.455480,0.189159,-0.541313,-0.261554,-0.152057,1.804157,-1.937840,1.975231,-0.071044,-0.035455,0.836724,-0.379995,-0.427350,-0.296841,-0.22537,-0.043437,-0.123678,0.147678,-0.07948,0.248998,0.412872,-0.301821,-0.075378,-0.561642,-0.126269,2.465766,-0.295588,-0.189105,-0.384310,-0.321225,-0.312831,-0.310407,-0.601161,-0.663570,-0.196116,-0.131306,-0.100727,-0.502353,1.850676,-1.157241,1.634791,0.662452,1.528732,1.422087,1.741210,0.390658,-0.028440,1.725543,-0.023887,2.489955,-0.481683,-0.428377,0.723149,0.672209,-0.003202,0.412302,-0.155943,0.163433,0.542194,-0.24326,1.870829,-0.78357,-0.717118,1.611191,-1.603667,-0.373477,1.402313,-0.622600,-0.413912,-0.343358,0.788796,-0.580008,0.305904,-0.260181,-0.30797,0.453997,0,0,0,0,1,0
1,-0.651817,1.171545,-1.090040,-0.680301,-0.235936,3.079004,-0.360260,-0.217571,-0.290536,-0.20292,-0.133759,-0.035455,0.775245,-0.237415,1.356308,-0.767616,-0.571704,-0.557773,-0.455480,-1.771065,1.847361,-0.261554,-0.152057,-0.502442,0.516038,-0.506270,-0.071044,-0.035455,-1.195138,2.631610,-0.427350,-0.296841,-0.22537,-0.043437,-0.123678,0.147678,-0.07948,0.248998,0.412872,-0.301821,-0.075378,1.780493,-0.126269,-0.405554,-0.295588,-0.189105,-0.384310,-0.321225,-0.312831,-0.310407,1.005633,1.506999,-0.196116,-0.131306,-0.100727,-0.502353,-0.540343,0.864124,-0.793388,-0.754673,-0.796418,-0.496105,-0.596337,-0.230346,-0.029262,-0.497198,-0.026324,-0.401755,2.076054,-0.428377,-1.382841,-1.856313,-1.703536,-2.034425,-0.312957,-1.568379,-1.544389,-0.24326,-0.534522,-0.78357,1.394470,-0.620659,0.623571,2.677542,-0.713108,-0.622600,-0.413912,-0.343358,-1.267754,-0.580008,-2.371596,3.843472,-0.30797,-0.071979,1,1,1,1,1,3
2,-0.651817,1.171545,-1.090040,-0.680301,-0.235936,-0.324780,-0.360260,-0.217571,-0.290536,-0.20292,-0.133759,-0.035455,0.775245,-0.237415,-0.737295,-0.767616,-0.571704,1.792843,0.280402,0.080257,-0.541313,-0.261554,-0.152057,4.110756,-1.937840,-0.506270,14.075688,-0.035455,0.836724,-0.379995,-0.427350,-0.296841,-0.22537,-0.043437,-0.123678,0.147678,-0.07948,0.248998,0.412872,-0.301821,-0.075378,1.780493,-0.126269,-0.405554,-0.295588,-0.189105,-0.384310,-0.321225,-0.312831,-0.310407,-0.601161,-0.663570,-0.196116,-0.131306,-0.100727,-0.502353,-0.540343,0.864124,-0.793388,-0.754673,-0.796418,-0.496105,-0.596337,-0.230346,-0.029262,-0.497198,-0.026324,-0.401755,-0.481683,-0.428377,0.723149,0.672209,-0.003202,0.367374,-0.155943,0.163433,0.456368,-0.24326,-0.534522,1.27621,-0.717118,-0.620659,0.623571,-0.373477,-0.713108,1.606168,-0.413912,-0.343358,0.788796,-0.580008,0.305904,-0.260181,-0.30797,-0.597955,0,0,0,0,1,0
3,1.534172,-1.603579,0.917398,-0.680301,-0.235936,-0.324780,-0.360260,4.596194,-0.290536,-0.20292,-0.133759,-0.035455,0.775245,-0.237415,-0.737295,0.295391,0.385845,1.792843,1.016284,-0.355348,-0.541313,-0.261554,-0.152057,1.804157,-1.937840,1.975231,-0.071044,-0.035455,0.836724,-0.379995,-0.427350,-0.296841,-0.22537,-0.043437,-0.123678,0.147678,-0.07948,0.248998,-2.422061,-0.301821,-0.075378,-0.561642,-0.126269,2.465766,-0.295588,-0.189105,-0.384310,-0.321225,-0.312831,-0.310407,1.005633,1.506999,-0.196116,-0.131306,-0.100727,1.990632,-0.540343,-1.157241,0.420701,0.190077,1.034893,0.462991,0.878491,-0.218961,-0.029238,-0.126741,-0.026172,-0.401755,-0.481683,-0.428377,0.723149,0.672209,-0.569980,0.158540,-0.245665,-0.413838,0.078314,-0.24326,-0.534522,1.27621,-0.717118,-0.620659,0.623571,-0.373477,-0.713108,1.606168,-0.413912,-0.343358,0.788796,-0.580008,0.305904,-0.260181,-0.30797,0.453997,1,0,0,0,1,1
4,-0.651817,-1.603579,-1.090040,-0.680301,4.238430,3.079004,2.775773,-0.217571,-0.290536,-0.20292,-0.133759,-0.035455,0.775245,-0.237415,1.356308,-0.767616,-0.571704,1.792843,1.752166,-0.028644,-0.541313,-0.261554,-0.152057,-0.502442,0.516038,-0.506270,-0.071044,-0.035455,0.836724,-0.379995,-0.427350,-0.296841,-0.22537,-0.043437,-0.123678,0.147678,-0.07948,0.248998,0.412872,-0.301821,-0.075378,-0.561642,-0.126269,-0.405554,3.383092,-0.189105,-0.384310,-0.321225,-0.312831,-0.310407,2.612427,1.506999,-0.196116,-0.131306,-0.100727,-0.502353,-0.540343,0.864124,-0.793388,-0.754673,-0.796418,-0.496105,-0.596337,-0.230346,-0.029262,-0.497198,-0.026324,-0.401755,-0.481683,-0.428377,0.723149,0.672209,-0.003202,0.319877,-0.155943,0.163433,0.367365,-0.24326,-0.534522,1.27621,-0.717118,-0.620659,0.623571,-0.373477,1.402313,-0.622600,-0.413912,-0.343358,0.788796,-0.580008,-0.363471,-0.260181,-0.30797,0.453997,1,1,1,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1588,-0.651817,1.171545,0.917398,-0.680301,-0.235936,3.079004,-0.360260,-0.217571,-0.290536,-0.20292,-0.133759,-0.035455,0.775245,4.212028,1.356308,-0.767616,-0.571704,-0.557773,-0.455480,1.495974,-0.541313,-0.261554,-0.152057,-0.502442,0.516038,-0.506270,-0.071044,-0.035455,0.836724,-0.379995,-0.427350,-0.296841,-0.22537,-0.043437,-0.123678,0.147678,-0.07948,0.248998,0.412872,-0.301821,-0.075378,1.780493,-0.126269,-0.405554,-0.295588,-0.189105,-0.384310,-0.321225,-0.312831,-0.310407,1.005633,1.506999,5.099020,-0.131306,-0.100727,-0.502353,-0.540343,0.864124,-0.793388,-0.754673,-0.796418,-0.496105,-0.596337,-0.230346,-0.029262,-0.497198,-0.026324,-0.401755,-0.481683,-0.428377,0.723149,0.672209,1.697132,0.819099,1.100164,1.895246,1.391697,-0.24326,-0.534522,1.27621,-0.717118,-0.620659,0.623571,-0.373477,1.402313,-0.622600,-0.413912,-0.343358,0.788796,-0.580008,-0.363471,-0.260181,-0.30797,-0.071979,1,1,1,0,0,3
1589,-0.651817,1.171545,-1.090040,1.469937,-0.235936,-0.324780,-0.360260,-0.217571,-0.290536,-0.20292,-0.133759,-0.035455,0.775245,-0.237415,-0.737295,-0.767616,-0.571704,-0.557773,-0.455480,-1.771065,-0.541313,-0.261554,-0.152057,-0.502442,0.516038,-0.506270,-0.071044,-0.035455,-1.195138,-0.379995,2.340002,-0.296841,-0.22537,-0.043437,-0.123678,0.147678,-0.07948,0.248998,0.412872,-0.301821,-0.075378,1.780493,-0.126269,-0.405554,-0.295588,-0.189105,-0.384310,-0.321225,-0.312831,-0.310407,-0.601161,-0.663570,-0.196116,-0.131306,-0.100727,-0.502353,-0.540343,0.864124,-0.793388,-0.754673,-0.796418,-0.496105,-0.596337,-0.230346,-0.029262,-0.497198,-0.026324,-0.401755,2.076054,-0.428377,-1.382841,-1.856313,-1.703536,-2.034425,-0.312957,-1.568379,-1.544389,-0.24326,-0.534522,-0.78357,1.394470,-0.620659,0.623571,-0.373477,-0.713108,-0.622600,-0.413912,2.912412,-1.267754,1.724114,0.975279,-0.260181,-0.30797,0.453997,1,1,1,0,0,3
1590,-0.651817,-0.216017,0.917398,-0.680301,-0.235936,-0.324780,-0.360260,-0.217571,-0.290536,-0.20292,-0.133759,-0.035455,0.775245,-0.237415,-0.737295,-0.767616,-0.571704,1.792843,1.752166,0.951468,-0.541313,-0.261554,-0.152057,-0.502442,0.516038,-0.506270,-0.071044,-0.035455,-1.195138,-0.379995,-0.427350,3.368804,-0.22537,-0.043437,-0.123678,0.147678,-0.07948,0.248998,0.412872,-0.301821,-0.075378,-0.561642,-0.126269,-0.405554,-0.295588,-0.189105,-0.384310,-0.321225,-0.312831,3.221579,-0.601161,-0.663570,-0.196116,-0.131306,-0.100727,-0.502353,-0.540343,0.864124,-0.793388,-0.754673,-0.796418,-0.496105,-0.596337,-0.230346,-0.029262,-0.497198,-0.026324,-0.401755,-0.481683,2.334393,-1.382841,-0.592052,1.130354,0.672940,-0.312957,-0.125202,-0.930063,-0.24326,1.870829,-0.78357,-0.717118,1.611191,-1.603667,-0.373477,-0.713108,1.606168,-0.413912,-0.343358,0.788796,-0.580008,0.975279,-0.260181,-0.30797,-0.597955,1,1,1,0,0,3
1591,-0.651817,-0.216017,0.917398,-0.680301,-0.235936,-0.324780,-0.360260,-0.217571,-0.290536,-0.20292,-0.133759,-0.035455,-1.289915,-0.237415,-0.737295,-0.767616,-0.571704,1.792843,2.488048,-1.771065,-0.541313,-0.261554,-0.152057,-0.502442,0.516038,-0.506270,-0.071044,-0.035455,0.836724,-0.379995,-0.427350,-0.296841,-0.22537,-0.043437,-0.123678,0.147678,-0.07948,0.248998,0.412872,-0.301821,-0.075378,-0.561642,-0.126269,-0.405554,-0.295588,-0.189105,2.602063,-0.321225,-0.312831,-0.310407,-0.601161,-0.663570,-0.196116,-0.131306,-0.100727,-0.502353,-0.540343,0.864124,-0.793388,-0.754673,-0.796418,-0.496105,-0.596337,-0.230346,-0.029262,-0.497198,-0.026324,-0.401755,2.076054,-0.428377,-1.382841,-1.856313,-1.703536,-2.034425,-0.312957,-1.568379,-1.544389,-0.24326,-0.534522,-0.78357,1.394470,-0.620659,0.623571,-0.373477,-0.713108,-0.622600,2.415970,-0.343358,-1.267754,1.724114,0.305904,-0.260181,-0.30797,-0.597955,0,0,0,0,0,0


In [1320]:
# Fit the ACD model through fit_model (defined elsewhere in the notebook) and keep both
# returned values; they are later stored as the 'ACD' column of target_train (acd_train)
# and of X_test (acd_pred).
# NOTE(review): random_state=42 is passed here, yet the parameter dump printed by this cell
# shows `random_state: None` together with solver 'saga' — confirm the seed actually reaches
# the estimator, otherwise the fit is not reproducible.
acd_train, acd_pred = fit_model(acd, 'ACD', split=False,
                            reduce_corrs=True, threshold_reduce=0.07,
                            model='logreg', grid=True, param_grid=param_grid_logreg, 
                            oversampling=True, oversampler='random', resample_strategy=0.4,
                            return_predicted=True, X_test=X_test, random_state=42)


Used features:  Index(['fractures', 'midday_sleep', 'sex', 'job', 'early_onset', 'id_1', 'IHD',
       'stop_work_due_disease', 'arterial_hypertension', 'crafters_other',
       'intake_medicines', 'ACD'],
      dtype='object')
	C: 0.1
	class_weight: 'balanced'
	dual: False
	fit_intercept: True
	intercept_scaling: 1
	l1_ratio: None
	max_iter: 200
	multi_class: 'auto'
	n_jobs: None
	penalty: 'l2'
	random_state: None
	solver: 'saga'
	tol: 0.0001
	verbose: 0
	warm_start: False
f1 expected:  0.12194229255688295
recall_macro expected:  0.626515117396265
self_recall_score:  0.7177643165928377


In [1321]:
# Store the two fit_model outputs for ACD as an 'ACD' column on the test frame and on the
# training-target frame.
# NOTE(review): both frames are mutated in place, so re-running earlier cells after this one
# sees the extra/overwritten column.
X_test['ACD'] = acd_pred
target_train['ACD'] = acd_train

# Предсказание other

In [1322]:
# Training features joined column-wise with the five cardio target columns taken from
# target_train; this is the input frame for the 'other_cardio_diseases' model.
df_other = pd.concat(
    [
        train,
        target_train.loc[:, ['arterial_hypertension', 'IHD', 'heart_failure',
                             'ACD', 'other_cardio_diseases']],
    ],
    axis='columns',
)

In [1323]:
# Pairwise correlation matrix of every column in df_other (features and target columns).
df_other.corr()

Unnamed: 0,sex,education,job,retired,stop_work_due_disease,diabetes,hepatitis,oncology,chronic_lung_disease,bronchial_asthma,tuberculosis,hiv/aids,intake_medicines,trauma_last_year,fractures,smoking_duration,ciggaretes_per_day,passive_smoking,passive_smoking_frequency,alcohol_duration,midday_sleep,sport_clubs,religion_clubs,unknown_id_feature,id_1,id_2,id_3,id_4,married,divorced,widowed,cohabited,single,separated,asian,european,other,is_russian,christian,household,military,senior_specialist,farmer/fisherman,low_qualified,hard_industry,top_management,service,crafters_other,office,junior_specialists,comorbid_count,is_comorbid,fracture_last_year,trauma_on_retire,fracture_on_retire,previous_smoke,smoking,never_smoke,smoker_score,smoking_duration_score,ciggaretes_per_day_log,ciggaretes_per_day_score,ciggaretes_per_day_score_log,smoking_score_int_log,smoking_score_int_score,smoking_score_int_score_2,smoking_score_int_score_3,smoking_score_int_score_log,never_drink_alco,previous_drink_alco,drink_alco_now,alcohol_score,alcohol_duration_score,alcohol_duration_log,alcohol_int_score_1,alcohol_int_score_2,alcohol_int_score_3,sleep_onset_early_22,sleep_onset_22,sleep_onset_23,sleep_onset_later_after_0,early_onset,lately_onset,sleep_upset_early_6,sleep_upset_6,sleep_upset_7,sleep_upset_8,sleep_upset_after_9,early_upset,lately_upset,sleeptime,low_amount_sleep,high_amout_sleep,cardio_score,arterial_hypertension,IHD,heart_failure,ACD,other_cardio_diseases
sex,1.000000,-0.101709,0.181715,-0.160482,0.012543,-0.047180,-0.003931,-0.054216,0.003134,-0.059957,0.015783,0.019931,-0.183618,-0.023634,0.130975,0.254139,0.521598,0.025240,-0.016034,0.056609,0.065984,-0.013911,-0.024533,0.516997,-0.560211,0.567111,-0.002776,0.019931,0.286947,-0.139257,-0.249467,0.035309,-0.065230,-0.030028,0.119025,-0.099037,-0.002776,-0.108588,-0.166641,-0.051991,0.015740,-0.148631,0.032663,0.061889,0.193439,0.115247,-0.067938,0.206256,-0.070722,-0.140489,-0.058935,-0.034638,-0.018455,-0.070970,-0.067427,0.254651,0.257688,-0.425092,0.385318,0.239119,0.507539,0.515328,0.533212,0.090068,0.092522,0.386061,-0.020533,0.321805,-0.178735,0.087221,0.074785,0.133346,0.040848,0.142071,-0.076422,0.021316,0.050737,-0.039181,-0.012241,0.095141,-0.070852,-0.031484,0.027862,0.028573,0.032659,0.020258,-0.085402,0.002563,0.050024,-0.068917,-0.034043,-0.006933,-0.028429,0.280551,-0.122655,-0.109870,-0.220544,0.056102,-0.271227
education,-0.101709,1.000000,0.141569,-0.120333,-0.085095,-0.095664,-0.014166,-0.009097,-0.064727,-0.113857,-0.034824,-0.070613,-0.008593,0.030019,-0.080400,-0.050920,-0.133900,-0.026029,-0.043667,-0.020878,-0.108812,0.159528,-0.016454,-0.072817,0.074777,-0.072046,-0.014566,-0.007765,0.024634,0.070510,-0.099317,-0.056120,0.067108,0.023658,-0.066276,0.091461,-0.065089,0.072878,0.038025,-0.067072,0.047419,0.542282,-0.033684,-0.310723,-0.146340,0.142777,-0.108061,-0.128465,-0.025176,-0.030514,-0.124829,-0.093951,0.035744,-0.022820,0.010788,-0.027294,-0.089607,0.098346,-0.104411,-0.043281,-0.131292,-0.136610,-0.146513,0.007377,-0.020984,-0.101048,-0.005582,-0.106764,-0.028071,-0.055472,0.065877,0.054080,-0.026445,0.015187,-0.068519,0.004334,0.041180,-0.009717,-0.075227,0.000803,0.069313,-0.074633,0.074942,-0.093231,0.023890,0.030562,-0.006994,0.020450,0.050993,0.008504,0.003597,-0.019631,-0.032805,-0.255370,-0.183389,-0.183794,-0.222609,-0.118432,0.100020
job,0.181715,0.141569,1.000000,-0.744162,-0.235818,-0.135528,0.017977,-0.126516,0.002411,-0.056535,-0.036183,-0.048023,-0.207699,0.002783,-0.056837,0.043199,0.087342,0.058003,0.056115,0.024839,-0.145778,0.024712,-0.021219,0.048745,-0.071984,0.091936,-0.090079,-0.002161,0.052148,0.059395,-0.216635,0.088584,0.035255,-0.002161,0.049145,-0.046005,0.008236,-0.014997,-0.028401,-0.281503,-0.046999,0.055327,-0.030134,0.117761,0.073879,0.107958,0.028804,-0.098061,0.020664,-0.021831,-0.118169,-0.101981,0.018694,-0.141124,-0.107836,-0.036690,0.135682,-0.085904,0.120429,0.040807,0.096576,0.083550,0.090285,0.018145,-0.015424,0.086089,-0.033708,0.115128,-0.074243,-0.111584,0.146742,0.126202,0.016728,0.061919,-0.041760,0.055575,0.104891,-0.006856,0.059142,0.006358,-0.056449,0.051220,-0.053400,0.109295,0.133705,-0.003268,-0.198186,-0.063828,0.125942,-0.208706,-0.118117,0.018374,-0.092054,-0.490205,-0.313918,-0.561911,-0.537326,-0.405770,-0.147837
retired,-0.160482,-0.120333,-0.744162,1.000000,-0.066448,0.174259,-0.051355,0.113311,-0.001056,0.057585,-0.026362,-0.032520,0.218822,-0.000878,0.073037,-0.080811,-0.138937,-0.031715,-0.035530,0.001515,0.109357,0.019553,0.009290,-0.094135,0.099061,-0.100486,0.017020,-0.032520,-0.075082,-0.025552,0.236775,-0.100435,-0.057860,0.016006,-0.078791,0.071459,-0.008987,0.073312,0.052271,0.197934,0.040718,-0.028190,0.021872,-0.083542,-0.034665,-0.103042,-0.080893,0.100563,-0.001193,0.025573,0.110612,0.097575,-0.020398,0.189642,0.144909,-0.016638,-0.147559,0.139087,-0.158270,-0.073221,-0.150297,-0.129837,-0.135125,-0.045667,-0.036345,-0.115013,-0.023203,-0.135175,0.068726,0.022369,-0.072508,-0.078056,0.014062,-0.046497,0.076239,0.005790,-0.033828,0.041091,-0.034249,-0.033373,0.045437,-0.010568,0.012223,-0.081484,-0.090543,0.019240,0.164142,-0.001279,-0.069710,0.135007,0.088615,-0.043949,0.058572,0.667119,0.331273,0.615308,0.570377,0.320649,0.168658
stop_work_due_disease,0.012543,-0.085095,-0.235818,-0.066448,1.000000,0.018050,0.021950,0.069072,0.009853,0.000606,0.035404,0.096669,0.085992,-0.009930,0.007357,-0.027081,0.010039,0.025512,0.006754,-0.051149,0.065412,-0.041376,0.063250,0.010888,-0.018062,0.023473,-0.019330,-0.010305,-0.040942,-0.000304,0.042652,0.003458,0.021904,-0.010305,-0.030284,-0.026369,0.095332,-0.027114,-0.049952,0.019280,-0.016320,-0.055748,-0.030284,0.032726,-0.008670,-0.043877,0.015700,0.028818,-0.022347,0.055544,0.053765,0.035780,-0.017851,-0.030284,-0.023141,-0.002445,-0.007480,0.008344,-0.008796,-0.030047,-0.000205,0.017626,0.018885,-0.004986,-0.017192,0.008619,-0.007301,-0.002512,0.026548,0.099026,-0.098744,-0.073454,-0.050029,-0.042675,-0.043317,-0.084369,-0.101500,-0.011810,0.000348,0.005679,0.000086,-0.005752,0.006274,-0.036988,-0.023564,0.003634,0.062542,0.001376,-0.019399,0.052744,0.035965,-0.017101,-0.001618,-0.021951,0.090199,0.188359,0.123622,0.169330,0.033461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
arterial_hypertension,-0.122655,-0.183389,-0.313918,0.331273,0.090199,0.317874,-0.032376,0.139921,0.041158,0.102496,0.017820,0.042344,0.779945,0.023334,0.134991,-0.110167,-0.029183,0.005970,-0.010755,-0.023254,0.067881,-0.042142,-0.026100,-0.110074,0.105382,-0.092465,-0.068338,-0.003609,-0.083733,-0.033686,0.223045,-0.027470,-0.093086,-0.003609,-0.018549,-0.000773,0.030172,-0.016104,0.071566,0.092889,0.008839,-0.115513,0.060887,0.021397,-0.141844,-0.076856,0.029288,0.036170,0.060872,0.074755,0.235785,0.232914,0.001405,0.045000,-0.008104,-0.010504,-0.125415,0.115277,-0.132762,-0.108789,-0.083020,-0.011929,-0.031885,-0.038968,0.044881,-0.041151,0.029901,-0.070163,0.083467,0.095888,-0.141841,-0.127721,-0.008694,-0.065250,-0.025240,-0.062154,-0.114356,0.123125,0.006994,-0.005260,-0.057508,0.069804,-0.067178,0.032928,-0.017610,-0.050917,0.034635,0.027170,-0.063558,0.047669,0.084946,-0.015852,0.067594,0.275445,1.000000,0.687602,0.593271,0.444928,0.463433
IHD,-0.109870,-0.183794,-0.561911,0.615308,0.188359,0.395492,-0.055688,0.132160,0.059027,0.110629,0.073420,0.061192,0.526023,-0.032203,0.125991,-0.044220,-0.018252,-0.012632,-0.020554,-0.001750,0.143059,-0.028993,0.001432,-0.080638,0.083016,-0.077764,-0.038743,0.013448,-0.085226,-0.057563,0.279176,-0.105967,-0.046776,0.013448,-0.034759,0.036533,-0.013155,0.001051,0.085148,0.155958,0.036419,-0.081321,0.113800,-0.028334,-0.048440,-0.134404,0.005363,0.052816,-0.017366,0.033617,0.270256,0.244674,-0.064708,0.097293,0.030198,0.029465,-0.105446,0.065993,-0.093133,-0.040161,-0.050437,-0.004096,-0.014563,0.000997,0.049971,-0.008354,0.043338,-0.052927,0.073542,0.212370,-0.225057,-0.173800,0.012185,-0.053867,0.005466,-0.083838,-0.168013,0.075926,0.024650,-0.036066,-0.019441,0.061871,-0.060038,-0.017470,-0.041040,-0.012018,0.074025,0.016715,-0.050568,0.072974,0.086998,-0.010793,0.064932,0.518869,0.687602,1.000000,0.855915,0.463903,0.342346
heart_failure,-0.220544,-0.222609,-0.537326,0.570377,0.123622,0.317403,-0.005558,0.116756,0.095624,0.137666,0.084434,0.065453,0.447156,0.022122,0.106590,-0.110340,-0.079563,-0.008681,-0.021428,-0.029074,0.154405,-0.012140,0.041104,-0.126608,0.145475,-0.147912,-0.034011,0.065453,-0.095663,-0.073353,0.312233,-0.104990,-0.055232,0.016695,-0.043652,0.041171,-0.007880,0.003914,0.071162,0.258953,-0.019890,-0.099685,0.124919,-0.034068,-0.056338,-0.124651,0.003714,-0.002015,-0.004514,0.021694,0.277250,0.246045,-0.006024,0.141776,0.081284,-0.043465,-0.135640,0.150604,-0.159087,-0.105455,-0.129266,-0.060799,-0.081106,-0.026017,0.049339,-0.057225,0.046224,-0.075542,0.128229,0.191179,-0.252240,-0.217232,-0.013933,-0.102288,0.036390,-0.104283,-0.193317,0.034794,0.005081,-0.046055,0.027333,0.022600,-0.020928,0.019641,0.008386,0.002628,0.061070,-0.119343,0.010491,-0.032862,-0.071975,0.051856,-0.077868,0.400940,0.593271,0.855915,1.000000,0.360010,0.330331
ACD,0.056102,-0.118432,-0.405770,0.320649,0.169330,0.134728,0.013484,0.004389,0.004798,0.033715,0.049651,0.063931,0.474005,0.016697,0.399269,0.048575,0.051861,0.000022,-0.019816,0.046472,0.220543,-0.000460,-0.036323,0.177301,-0.176170,0.163065,0.068063,0.015552,-0.004529,-0.042021,0.102177,-0.046584,-0.038273,0.015552,-0.046290,0.030272,0.016208,0.028230,-0.010733,0.083504,0.009306,-0.111188,0.054066,-0.061763,0.038304,-0.080913,-0.037831,0.211538,-0.049462,0.030804,0.090899,0.100751,0.025020,0.037340,0.034922,0.095100,-0.052326,-0.032250,-0.007358,0.045401,0.045041,0.053358,0.056632,0.039943,0.056011,0.029547,0.045426,-0.039425,-0.023278,0.084199,-0.047265,-0.017529,0.052292,0.027884,0.039057,0.016878,-0.021890,0.164401,0.213479,-0.126797,-0.132992,0.282172,-0.280150,0.066165,-0.040144,-0.070438,0.050973,0.031469,-0.103144,0.064200,0.154499,-0.009340,0.164365,0.287929,0.444928,0.463903,0.360010,1.000000,0.210396


In [1324]:
# Run MultiCollinearityEliminator on df_other with target 'other_cardio_diseases' and
# threshold 0.8, keeping the value returned by autoEliminateMulticollinearity() (it is used
# as a DataFrame in the following cells). The call also produces the long printed output
# saved under this cell.
other = MultiCollinearityEliminator(df_other, 'other_cardio_diseases', 0.8).autoEliminateMulticollinearity()

['fracture_last_year', 'trauma_last_year', 'never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'passive_smoking_frequency', 'passive_smoking', 'never_drink_alco', 'alcohol_duration', 'alcohol_duration_score', 'alcohol_duration_log', 'alcohol_int_score_2', 'id_1', 'unknown_id_feature', 'id_2', 'european', 'asian', 'is_comorbid', 'comorbid_count', 'smoking', 'smoking_score_int_score_log', 'alcohol_score', 'drink_alco_now', 'alcohol_int_score_3', 'early_onset', 'sleep_onset_22', 'lately_onset', 'heart_failure', 'IHD'] 

                           other_cardio_diseases
tuberculosis                            0.001560
id_3                                    0.004085
separated                               0.008843
id_4                                    0.008843
sleep_onset_later_after_0               0.011026
...                  

['never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'id_1', 'unknown_id_feature', 'id_2', 'european', 'asian', 'is_comorbid', 'comorbid_count', 'smoking', 'smoking_score_int_score_log', 'alcohol_score', 'never_drink_alco', 'alcohol_duration_log', 'drink_alco_now', 'alcohol_int_score_3', 'alcohol_int_score_2', 'heart_failure', 'IHD'] 

                           other_cardio_diseases
tuberculosis                            0.001560
id_3                                    0.004085
separated                               0.008843
id_4                                    0.008843
sleep_onset_later_after_0               0.011026
...                                          ...
sex                                     0.271227
heart_failure                           0.330331
IHD                                     0.342346
arterial_h

['never_smoke', 'smoking_duration', 'smoker_score', 'smoking_duration_score', 'ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'id_1', 'unknown_id_feature', 'id_2', 'is_comorbid', 'comorbid_count', 'smoking', 'smoking_score_int_score_log', 'alcohol_score', 'drink_alco_now', 'heart_failure', 'IHD'] 

                           other_cardio_diseases
tuberculosis                            0.001560
id_3                                    0.004085
separated                               0.008843
id_4                                    0.008843
sleep_onset_later_after_0               0.011026
...                                          ...
sex                                     0.271227
heart_failure                           0.330331
IHD                                     0.342346
arterial_hypertension                   0.463433
intake_medicines                        0.517206

[87 rows x 1 columns] 

['ciggaretes_per_day_log', 'ciggaretes_per_day', 'ciggaretes_per_day_score', 'ciggaretes_per_day_score_log', 'smoking_score_int_score_2', 'is_comorbid', 'comorbid_count', 'smoker_score', 'smoking', 'never_smoke', 'smoking_score_int_score_log', 'heart_failure', 'IHD'] 

                           other_cardio_diseases
tuberculosis                            0.001560
id_3                                    0.004085
separated                               0.008843
id_4                                    0.008843
sleep_onset_later_after_0               0.011026
...                                          ...
sex                                     0.271227
heart_failure                           0.330331
IHD                                     0.342346
arterial_hypertension                   0.463433
intake_medicines                        0.517206

[82 rows x 1 columns] 

tuberculosis 

id_3 

separated 

id_4 

sleep_onset_later_after_0 

single 

divorced 

cohabited 

cardio_score 

s

['smoker_score', 'smoking', 'ciggaretes_per_day_log', 'smoking_score_int_score_log', 'smoking_score_int_score_2', 'heart_failure', 'IHD'] 

                           other_cardio_diseases
tuberculosis                            0.001560
id_3                                    0.004085
separated                               0.008843
id_4                                    0.008843
sleep_onset_later_after_0               0.011026
...                                          ...
sex                                     0.271227
heart_failure                           0.330331
IHD                                     0.342346
arterial_hypertension                   0.463433
intake_medicines                        0.517206

[77 rows x 1 columns] 

tuberculosis 

id_3 

separated 

id_4 

sleep_onset_later_after_0 

single 

divorced 

cohabited 

cardio_score 

smoking_score_int_score_3 

military 

previous_smoke 

fractures 

sleep_upset_8 

stop_work_due_disease 

farmer/fisherman 

alco

In [1325]:
# Class balance of the 'other_cardio_diseases' column as fractions (normalize=True).
other['other_cardio_diseases'].value_counts(normalize=True)

0    0.594764
1    0.405236
Name: other_cardio_diseases, dtype: float64

In [1328]:
# Fit the 'other_cardio_diseases' model through fit_model (defined elsewhere in the notebook)
# and keep both returned values; they are later stored on target_train (other_train) and
# X_test (other_pred).
# The "Used features" output of this cell lists ACD, IHD and arterial_hypertension, i.e. the
# other diagnosis columns act as inputs to this model.
# NOTE(review): X_test is the frame that earlier cells extended in place (e.g. with 'ACD'),
# so this call depends on those cells having run first.
other_train, other_pred = fit_model(other, 'other_cardio_diseases', split=False,
                            reduce_corrs=True, threshold_reduce=0.05,
                            model='logreg', grid=True, param_grid=param_grid_logreg,
                            oversampling=True, oversampler='random', resample_strategy=0.7,
                            return_predicted=True, X_test=X_test, random_state=42)


Used features:  Index(['passive_smoking', 'crafters_other', 'hepatitis', 'hiv/aids',
       'sleeptime', 'early_onset', 'previous_drink_alco', 'sleep_onset_23',
       'sleep_upset_7', 'is_russian', 'sleep_onset_early_22', 'other',
       'married', 'smoking_score_int_score', 'trauma_last_year',
       'sleep_upset_early_6', 'trauma_on_retire', 'office', 'sleep_upset_6',
       'education', 'household', 'lately_upset', 'widowed',
       'sleep_upset_after_9', 'diabetes', 'religion_clubs', 'bronchial_asthma',
       'drink_alco_now', 'european', 'junior_specialists',
       'smoking_score_int_log', 'top_management', 'low_amount_sleep', 'job',
       'early_upset', 'chronic_lung_disease', 'oncology', 'retired',
       'senior_specialist', 'high_amout_sleep', 'id_2', 'christian', 'ACD',
       'hard_industry', 'low_qualified', 'comorbid_count', 'smoker_score',
       'smoking_score_int_score_log', 'midday_sleep', 'sex', 'IHD',
       'arterial_hypertension', 'intake_medicines', 'other_car

In [1329]:
# Store the two fit_model outputs for 'other_cardio_diseases' on the test frame and on the
# training-target frame (in-place mutation of both frames).
X_test['other_cardio_diseases'] = other_pred
target_train['other_cardio_diseases'] = other_train

In [1330]:
# Independent copy of the five diagnosis columns of X_test, in the column order that is
# written positionally into the submission template further down.
result = X_test[['arterial_hypertension','ACD', 'IHD', 'heart_failure', 'other_cardio_diseases']].copy()

In [1331]:
# Load the sample submission from the working directory; it serves as the template whose
# columns after the first are overwritten with predictions.
submit = pd.read_csv('sample_solution.csv')

In [1332]:
# Overwrite every column after the first with the predictions, purely by position.
# NOTE(review): `.values` drops index and column labels, so this relies on `result` having
# the same row order as the template and its columns being in the template's column order —
# confirm both.
submit.iloc[:,1:] = result.values

In [1333]:
# Per-column totals of the filled submission (all columns after the first).
submit.iloc[:,1:].sum()

Артериальная гипертензия              330
ОНМК                                  196
Стенокардия, ИБС, инфаркт миокарда    218
Сердечная недостаточность             205
Прочие заболевания сердца             256
dtype: int64

In [1336]:
# Write the submission to the working directory without the DataFrame index.
submit.to_csv('submit_38-lin.csv', index=False)

In [None]:
# NOTE(review): unexecuted repeat of the submission-totals cell above — candidate for removal.
submit.iloc[:,1:].sum()

In [1334]:
# Load the previous submission file from the working directory for comparison.
prev_sub = pd.read_csv('submit_37-lin.csv')

In [1335]:
# Per-column totals of the previous submission (all columns after the first).
prev_sub.iloc[:,1:].sum()

Артериальная гипертензия              334
ОНМК                                  186
Стенокардия, ИБС, инфаркт миокарда    219
Сердечная недостаточность             209
Прочие заболевания сердца             244
dtype: int64

In [6445]:
# NOTE(review): repeat of the `prev_sub` totals cell. Its execution counter (In [6445]) is out
# of sequence with the surrounding cells and its saved output differs from the one above, so
# the output presumably comes from a different `prev_sub` — confirm or remove.
prev_sub.iloc[:,1:].sum()

Артериальная гипертензия              334
ОНМК                                  148
Стенокардия, ИБС, инфаркт миокарда    310
Сердечная недостаточность             230
Прочие заболевания сердца             330
dtype: int64

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier

In [4286]:
# EasyEnsembleClassifier configured with 50 estimators.
# NOTE(review): `model` is rebound to SVC() in a later cell, and the cross-validation cell
# below evaluates `bg`, not `model` — check whether this object is still needed.
model = EasyEnsembleClassifier(n_estimators=50)

In [4357]:
# Split `ah` into the feature frame (every column except the target) and the target column.
# NOTE(review): `ah` is defined outside this part of the notebook.
X = ah.drop('arterial_hypertension', axis=1)
y = ah['arterial_hypertension']

In [4393]:
# Repeated stratified k-fold: 3 folds repeated 5 times (15 fits), with a fixed seed for the splits.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=5, random_state=1)
# Score `bg` with F1 on every split, using all available cores.
# NOTE(review): `bg` is only defined in a cell further down the notebook, so this cell
# raises NameError on a fresh top-to-bottom run.
scores = cross_val_score(bg, X, y, scoring='f1', cv=cv, n_jobs=-1)
# Report the mean F1 across all splits.
print(np.mean(scores))

0.7022705405700509


In [4365]:
# Keyword arguments for LogisticRegression; unpacked with ** where the bagging
# ensemble `bg` is built.
# NOTE(review): this looks like a full get_params() dump rather than a hand-picked set —
# most entries are presumably library defaults; confirm which ones were actually tuned.
params = {'C': 0.1, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 50,
          'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

In [4392]:
# Bagging ensemble of 100 logistic regressions (configured by `params`); every member is
# trained on all features and on a bootstrap sample the size of the training set.
# The base model is passed positionally on purpose: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later removed,
# whereas the first positional slot is the base model in both old and new releases.
bg = BaggingClassifier(LogisticRegression(**params), n_estimators=100,
                       max_features=1.0, max_samples=1.0, n_jobs=-1)

In [None]:
# Print the 5 largest absolute pairwise correlations in ihd_df.
# NOTE(review): get_top_abs_correlations is defined in a cell further down the notebook and
# ihd_df outside this part of it, so this cell depends on out-of-order execution.
print("Top Absolute Correlations")
print(get_top_abs_correlations(ihd_df, 5))

In [4274]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

In [4280]:
# Bagging ensemble of 10 estimators with the library's default base model and a fixed seed.
# The explicit `base_estimator=None` was dropped: None is the default, and that keyword was
# renamed to `estimator` in scikit-learn 1.2 (old name later removed), so omitting it keeps
# the same behaviour on old releases and avoids a TypeError on new ones.
clf = BaggingClassifier(n_estimators=10, random_state=0)

In [None]:
# NOTE(review): this cell held an unfinished `from sklearn.` statement, which is a
# SyntaxError when run and imports nothing; it has been replaced by this comment.

In [4270]:
# Support-vector classifier with all-default settings.
# NOTE(review): this rebinds `model`, which an earlier cell set to an EasyEnsembleClassifier.
model = SVC()

In [4268]:
# Determinant of the correlation matrix of `ah`; a value close to 0 signals strong
# multicollinearity among the columns.
# Computed directly with np.linalg.det instead of multiplying the eigenvalues returned by
# the general-purpose np.linalg.eig, which may return complex eigenvalues.
determinant = np.linalg.det(ah.corr().values)
print(f'Детерминант корреляционной матрицы равен: {determinant}')

Детерминант корреляционной матрицы равен: 0.01920121930924493


In [None]:
def fit_random_forest(model, target, oversampling=False, grid=False, param_grid=None,
                      return_model=False, return_predicted=False, X_test=None, random_state=None):
    """Cross-validate a random forest and optionally return predictions or the fitted model.

    Parameters
    ----------
    model : pandas.DataFrame
        Despite the name, this is the data: feature columns plus the ``target`` column.
    target : label
        Name of the target column in ``model``.
    oversampling : bool, default False
        If true, the whole data set is resampled with
        ``RandomOverSampler(sampling_strategy='minority')`` before anything else is fitted.
    grid : bool, default False
        If true, run ``GridSearchCV`` (scoring ``recall_macro``, 5 folds) over ``param_grid``,
        print the parameters of the best estimator and use that estimator from then on.
    param_grid : dict or list of dict, optional
        Grid for ``GridSearchCV``; required when ``grid`` is true.
    return_model : bool, default False
        If true (and ``return_predicted`` is false), fit the classifier on the full
        (possibly oversampled) data and return it.
    return_predicted : bool, default False
        If true, fit the classifier on the full (possibly oversampled) data and return
        predictions; takes precedence over ``return_model``.
    X_test : array-like, optional
        Features to predict on; required when ``return_predicted`` is true.
    random_state : int or None, default None
        Seed for the CV splitter, the forest and the oversampler.

    Returns
    -------
    tuple, estimator or None
        ``(predicted_target, predicted_test)`` when ``return_predicted`` is true, the fitted
        classifier when only ``return_model`` is true, otherwise ``None``.
        ``predicted_target`` is predicted on the same rows the model was fitted on, i.e. on
        the oversampled rows when ``oversampling`` is true.

    Raises
    ------
    ValueError
        If ``grid`` is true without ``param_grid``, or ``return_predicted`` is true
        without ``X_test``.
    """
    # Fail fast on inconsistent arguments, before any expensive fitting starts.
    if grid and param_grid is None:
        raise ValueError("param_grid must be provided when grid=True")
    if return_predicted and X_test is None:
        raise ValueError("X_test must be provided when return_predicted=True")

    X = model.drop(target, axis=1)
    y = model[target]

    # Seed the forest too, so a given random_state reproduces the whole run.
    clf = RandomForestClassifier(random_state=random_state)

    # Balance the sample.
    # NOTE(review): resampling happens before grid search / cross-validation, so copies of
    # the same minority row can end up on both sides of a split and inflate the scores.
    if oversampling:
        oversam = RandomOverSampler(sampling_strategy='minority', random_state=random_state)
        X, y = oversam.fit_resample(X, y)

    if grid:
        grid_search = GridSearchCV(
            clf, param_grid, scoring='recall_macro', n_jobs=-1, cv=5)
        grid_search.fit(X, y)

        # Print the parameters of the best estimator, one per line.
        best_model = grid_search.best_estimator_
        best_parameters = best_model.get_params()
        for param_name in sorted(best_parameters.keys()):
            print('\t%s: %r' % (param_name, best_parameters[param_name]))
        clf = best_model

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=random_state)
    scores = cross_val_score(clf, X, y, scoring='recall_macro', cv=cv, n_jobs=-1)
    print('mean_recall_macro: ', np.mean(scores))

    if return_predicted:
        clf.fit(X, y)
        predicted_target = clf.predict(X)
        predicted_test = clf.predict(X_test)
        return (predicted_target, predicted_test)

    # Previously this flag was accepted but ignored and the function returned None.
    if return_model:
        clf.fit(X, y)
        return clf

In [None]:
def get_redundant_pairs(df):
    """Return the label pairs on and below the diagonal of a column-by-column matrix.

    For the columns of ``df`` taken in order, the result is the set of tuples
    ``(columns[i], columns[j])`` with ``j <= i``: every column paired with itself and
    with each column that precedes it.
    """
    labels = df.columns
    return {
        (labels[row], labels[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }

In [None]:
def get_top_abs_correlations(df, n=5):
    """Return the ``n`` largest absolute pairwise correlations between columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are correlated with ``DataFrame.corr()``.
    n : int, default 5
        Number of pairs to return.

    Returns
    -------
    pandas.Series
        Absolute correlations indexed by ``(column, column)`` pairs, sorted in descending
        order. Self-pairs and the mirrored duplicate of each pair are excluded.
    """
    corr = df.corr().abs()
    au_corr = corr.unstack()
    # Build the pairs to drop from the correlation matrix's own columns, not from df's:
    # corr() can leave columns out (e.g. non-numeric ones on pandas versions that skip
    # them silently), and dropping labels that are not in the index raises KeyError.
    labels_to_drop = get_redundant_pairs(corr)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

In [None]:
# Prints only the heading: the call that would produce the correlations
# (for arterial_model_2) is commented out below.
print("Top Absolute Correlations")
#print(get_top_abs_correlations(arterial_model_2, 5))