**Определение сердечно-сосудистых заболеваний**

target - наличие сердечно-сосудистых заболеваний (ССЗ)

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix

In [2]:
df = pd.read_csv('D:\Study\Machine Learning in Business\Manual\Lesson9/train_case2.csv', sep=';')
# Удалим 2 столбца
df = df.drop(['id', 'cholesterol', 'gluc'], axis=1)
df.head(3)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,0,0,1,0
1,20228,1,156,85.0,140,90,0,0,1,1
2,18857,1,165,64.0,130,70,0,0,0,1


Предобработка

In [4]:
df.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,0.088129,0.053771,0.803729,0.4997
std,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.283484,0.225568,0.397179,0.500003
min,10798.0,1.0,55.0,10.0,-150.0,-70.0,0.0,0.0,0.0,0.0
25%,17664.0,1.0,159.0,65.0,120.0,80.0,0.0,0.0,1.0,0.0
50%,19703.0,1.0,165.0,72.0,120.0,80.0,0.0,0.0,1.0,0.0
75%,21327.0,2.0,170.0,82.0,140.0,90.0,0.0,0.0,1.0,1.0
max,23713.0,2.0,250.0,200.0,16020.0,11000.0,1.0,1.0,1.0,1.0


Поиск пропусков

In [5]:
pd.DataFrame(df.isnull().sum()).T

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio
0,0,0,0,0,0,0,0,0,0,0


In [6]:
(df['age']/365).describe()

count    70000.000000
mean        53.339358
std          6.759594
min         29.583562
25%         48.394521
50%         53.980822
75%         58.430137
max         64.967123
Name: age, dtype: float64

Аномалии height -> замена на медиану

In [7]:
df.loc[df['height'] > 220, 'height'] = df['height'].median()

Аномалии ap_hi

In [8]:
# замена знака
df.loc[df['ap_hi'] <= 0, 'ap_hi'] = np.abs(df['ap_hi'])

In [9]:
# уменьшение разрядов
df.loc[(df['ap_hi'] > 500) & (df['ap_hi'] < 10000), 'ap_hi'] = df['ap_hi'] / 10

In [10]:
# уменьшение разрядов
df.loc[(df['ap_hi'] > 10000), 'ap_hi'] = df['ap_hi'] / 100

In [11]:
# замена на медиану
df.loc[(df['ap_hi'] > 250), 'ap_hi'] = df['ap_hi'].median()

In [12]:
# увеличение разрядов
df.loc[(df['ap_hi'] < 50), 'ap_hi'] = df['ap_hi'] * 10

In [14]:
df['ap_hi'].describe()

count    70000.000000
mean       126.997107
std         17.114820
min         60.000000
25%        120.000000
50%        120.000000
75%        140.000000
max        240.000000
Name: ap_hi, dtype: float64

Аномалия ap_lo

In [15]:
#  -> замена знака
df.loc[df['ap_lo'] < 0, 'ap_lo'] = np.abs(df['ap_lo'])

In [16]:
# -> замена на медиану
df.loc[df['ap_lo'] == 0, 'ap_lo'] = np.median(df['ap_lo'])

In [17]:
df.loc[df['ap_lo'] > 200, 'ap_lo'] = df['ap_lo'] / 10

In [18]:
df.loc[df['ap_lo'] > 200, 'ap_lo'] = df['ap_lo'] / 10

In [19]:
df.loc[df['ap_lo'] > 200, 'ap_lo'] = df['ap_lo'] / 10

In [20]:
df.loc[df['ap_lo'] < 50, 'ap_lo'] = df['ap_lo'] * 10

In [21]:
df.loc[df['ap_lo'] < 50, 'ap_lo'] = df['ap_lo'] * 10

In [22]:
s = df['ap_lo'] > 150
df.loc[s, ['ap_hi', 'ap_lo']] = df.loc[s, ['ap_lo', 'ap_hi']].values

In [23]:
df['ap_lo'].describe()

count    70000.000000
mean        81.686803
std          9.961945
min         50.000000
25%         80.000000
50%         80.000000
75%         90.000000
max        240.000000
Name: ap_lo, dtype: float64

Преобразование gender

In [24]:
df.groupby('gender').mean()

Unnamed: 0_level_0,age,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,19510.124577,161.353745,72.565605,126.365133,81.197424,0.017856,0.0255,0.802021,0.496727
2,19392.097875,169.947895,77.257307,128.550544,82.597363,0.21888,0.106375,0.806906,0.505231


In [25]:
# Примем, что средний вес мужчин > среднего веса женщин
df['gender'] = df['gender'].map({1: 0, 
                                 2: 1})

In [26]:
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio
0,18393,1,168,62.0,110.0,80.0,0,0,1,0
1,20228,0,156,85.0,140.0,90.0,0,0,1,1
2,18857,0,165,64.0,130.0,70.0,0,0,0,1
3,17623,1,169,82.0,150.0,100.0,0,0,1,1
4,17474,0,156,56.0,100.0,60.0,0,0,0,0


Разделим наши данные на тренировочную и тестовую выборки

In [27]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['cardio', ], axis=1),
    df['cardio'], 
    random_state=0)

Добавим классификатор и запустим кросс-валидацию

In [28]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [29]:
# classifier = RandomForestClassifier(n_estimators=200, random_state=42)
classifier = DecisionTreeClassifier(max_depth=5, random_state=42)
# classifier = XGBClassifier(random_state=42)

In [30]:
%%time

#запустим кросс-валидацию
cv_scores = cross_val_score(classifier, X_train, y_train, cv=16, scoring='roc_auc')
cv_score = np.mean(cv_scores)
cv_score_std = np.std(cv_scores)
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

#обучим пайплайн на всем тренировочном датасете
classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)[:, 1]

CV score is 0.7863046003341406+-0.0072098004566204495
Wall time: 1.13 s


In [31]:
b=1
precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
fscore = (1+b**2)*(precision * recall) / (b**2*precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.402778, F-Score=0.729, Precision=0.660, Recall=0.813


In [32]:
X_test_2 = pd.DataFrame(
    {
        'age': 18,
        'gender': 1,  # 1 == man
        'height': 164,
        'weight': 54.0,
        'ap_hi': 200,
        'ap_lo': 80,
        'smoke': 0,
        'alco': 0,
        'active': 1
    },
    index=[80282])

In [33]:
classifier.predict(X_test_2)

array([1], dtype=int64)

In [34]:
classifier.predict_proba(X_test_2)

array([[0.13611111, 0.86388889]])

In [36]:
import dill

with open("model_rfc.dill", "wb") as f:
    dill.dump(classifier, f)

In [37]:
import pickle 

with open('model_dtc.pkl', 'wb') as file:
    pickle.dump(classifier,file)

In [41]:
#загрузка модели

# with open(MODEL_FILE_PATH,'rb') as file:
    # model_load=pickle.load(file)

In [40]:
!pip freeze


alabaster @ file:///home/ktietz/src/ci/alabaster_1611921544520/work
anaconda-client @ file:///C:/ci/anaconda-client_1635342725944/work
anaconda-navigator==2.1.1
anaconda-project @ file:///tmp/build/80754af9/anaconda-project_1626085644852/work
anyio @ file:///C:/ci/anyio_1620153135622/work/dist
appdirs==1.4.4
argh==0.26.2
argon2-cffi @ file:///C:/ci/argon2-cffi_1613037869401/work
arrow @ file:///C:/ci/arrow_1617738834352/work
asn1crypto @ file:///tmp/build/80754af9/asn1crypto_1596577642040/work
astroid @ file:///C:/ci/astroid_1628063282661/work
astropy @ file:///C:/ci/astropy_1629829318700/work
async-generator @ file:///home/ktietz/src/ci/async_generator_1611927993394/work
atomicwrites==1.4.0
attrs @ file:///tmp/build/80754af9/attrs_1620827162558/work
autopep8 @ file:///tmp/build/80754af9/autopep8_1620866417880/work
Babel @ file:///tmp/build/80754af9/babel_1620871417480/work
backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
backports.functools-lru-cache @ file:///tmp/bui