1. https://dyakonov.org/2017/07/28/auc-roc-площадь-под-кривой-ошибок/
2. https://en.wikipedia.org/wiki/Receiver_operating_characteristic

# Определение ССЗ

## Описание

Ссылка на соревнование: https://mlbootcamp.ru/ru/round/12/sandbox/

Датасет сформирован из 100.000 реальных клинических анализов, и в нём используются признаки, которые можно разбить на 3 группы:

| Объективные признаки  | Результаты измерения                   | Субъективные признаки (0/1) |
|-----------------------|----------------------------------------|-----------------------------|
| Возраст (в днях)      | Артериальное давление верхнее и нижнее | Курение                     |
| Рост                  | Холестерин                             | Употребление Алкоголя       |
| Вес                   | Глюкоза                                | Физическая активность       |
| Пол                   |                                        |                             |

Все показатели даны на момент осмотра.

`target` - наличие сердечно-сосудистых заболеваний (ССЗ)

#### Импорт библиотек

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix

#### Загрузка датасета

Таргет - наличие сердечно-сосудистых заболеваний (ССЗ)

In [3]:
# Read the semicolon-separated training file and drop three columns
# (`id`, `cholesterol`, `gluc`) in one chained expression.
df = (
    pd.read_csv('train_case2.csv', sep=';')
    .drop(columns=['id', 'cholesterol', 'gluc'])
)
df.head(3)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,0,0,1,0
1,20228,1,156,85.0,140,90,0,0,1,1
2,18857,1,165,64.0,130,70,0,0,0,1


## Предобработка

In [4]:
df.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,0.088129,0.053771,0.803729,0.4997
std,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.283484,0.225568,0.397179,0.500003
min,10798.0,1.0,55.0,10.0,-150.0,-70.0,0.0,0.0,0.0,0.0
25%,17664.0,1.0,159.0,65.0,120.0,80.0,0.0,0.0,1.0,0.0
50%,19703.0,1.0,165.0,72.0,120.0,80.0,0.0,0.0,1.0,0.0
75%,21327.0,2.0,170.0,82.0,140.0,90.0,0.0,0.0,1.0,1.0
max,23713.0,2.0,250.0,200.0,16020.0,11000.0,1.0,1.0,1.0,1.0


In [5]:
# поиск пропусков
pd.DataFrame(df.isnull().sum()).T

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio
0,0,0,0,0,0,0,0,0,0,0


In [6]:
(df['age']/365).describe()

count    70000.000000
mean        53.339358
std          6.759594
min         29.583562
25%         48.394521
50%         53.980822
75%         58.430137
max         64.967123
Name: age, dtype: float64

In [7]:
df['gender'].value_counts()

1    45530
2    24470
Name: gender, dtype: int64

#### Аномалия `height` -> замена на медиану

In [8]:
# Heights above 220 are treated as anomalies and overwritten with the
# column median (computed before any value is replaced).
height_median = df['height'].median()
df.loc[df['height'] > 220, 'height'] = height_median

#### Аномалия `ap_hi`

In [9]:
# Flip the sign of non-positive upper-pressure readings
# (zeros are selected too, but abs leaves them unchanged).
non_positive_hi = df['ap_hi'] <= 0
df.loc[non_positive_hi, 'ap_hi'] = df['ap_hi'].abs()

In [10]:
# Readings strictly between 500 and 10000 are divided by 10.
ap_hi_off_by_10 = (df['ap_hi'] > 500) & (df['ap_hi'] < 10000)
df.loc[ap_hi_off_by_10, 'ap_hi'] = df['ap_hi'] / 10

In [11]:
# Readings of 10000 and above are divided by 100. The bound is inclusive so
# that a value of exactly 10000 is not left uncorrected: the preceding /10
# rule only covers values strictly below 10000.
df.loc[df['ap_hi'] >= 10000, 'ap_hi'] = df['ap_hi'] / 100

In [12]:
# TODO удалить!

df.loc[(df['ap_hi'] > 250), :]

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio
40330,20322,1,162,50.0,309.0,0,0,0,1,0
40831,19806,1,162,67.0,401.0,80,0,0,1,1


In [13]:
df.ap_hi.describe()

count    70000.000000
mean       126.707721
std         18.086981
min          1.000000
25%        120.000000
50%        120.000000
75%        140.000000
max        401.000000
Name: ap_hi, dtype: float64

In [14]:
df.loc[df['ap_hi'] <= 0, 'ap_hi']

Series([], Name: ap_hi, dtype: float64)

#### Аномалия `ap_lo`

In [15]:
# Flip the sign of negative lower-pressure readings.
negative_lo = df['ap_lo'] < 0
df.loc[negative_lo, 'ap_lo'] = df['ap_lo'].abs()

In [16]:
# Zero lower-pressure readings are replaced by the column median
# (the median is taken over the column as it stands, zeros included).
ap_lo_median = df['ap_lo'].median()
df.loc[df['ap_lo'] == 0, 'ap_lo'] = ap_lo_median

In [17]:
# Lower-pressure readings above 200 are divided by 10.
ap_lo_off_by_10 = df['ap_lo'] > 200
df.loc[ap_lo_off_by_10, 'ap_lo'] = df['ap_lo'] / 10

In [18]:
# Rows whose lower pressure is still above 150 after the fixes above.
# NOTE(review): presumably the two readings were entered in the wrong
# order for these rows — confirm; they are swapped below.
s = df['ap_lo'] > 150
# `.values` strips the column labels so the assignment is positional;
# without it pandas would realign the right-hand side by column name and
# nothing would be swapped.
df.loc[s, ['ap_hi', 'ap_lo']] = df.loc[s, ['ap_lo', 'ap_hi']].values

#### Преобразование `gender`

In [19]:
df.groupby('gender').mean()

Unnamed: 0_level_0,age,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,19510.124577,161.353745,72.565605,126.150659,81.127551,0.017856,0.0255,0.802021,0.496727
2,19392.097875,169.947895,77.257307,128.482697,82.550781,0.21888,0.106375,0.806906,0.505231


In [20]:
# Re-encode gender from the codes {1, 2} to {0, 1}.
# Any value missing from the mapping would become NaN.
gender_codes = {1: 0, 2: 1}
df['gender'] = df['gender'].map(gender_codes)

In [21]:
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio
0,18393,1,168,62.0,110.0,80.0,0,0,1,0
1,20228,0,156,85.0,140.0,90.0,0,0,1,1
2,18857,0,165,64.0,130.0,70.0,0,0,0,1
3,17623,1,169,82.0,150.0,100.0,0,0,1,1
4,17474,0,156,56.0,100.0,60.0,0,0,0,0


#### Разделим наши данные на тренировочную и тестовую выборки

In [22]:
# Separate the feature columns from the `cardio` target, then hold out a
# test split with a fixed seed (test size left at the library default).
features = df.drop(columns=['cardio'])
target = df['cardio']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=0
)

#### Преобразование

Преобразования полей (сейчас отключены — все признаки передаются в модель как есть):
<!-- - `gender`, `cholesterol` применим OHE-кодирование -->
<!-- - `age`, `height`, `weight`, `ap_hi`, `ap_lo` - standardScaler -->
<!-- - `gluc`, `smoke`, `alco`, `active` - оставим пока как есть -->

In [23]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one entry of the input by key.

    ``transform`` returns ``X[key]``. With a single column label and a
    DataFrame input this yields that column on its own; use
    ``NumberSelector`` when a single-column frame is required instead.
    """

    def __init__(self, key):
        # Stored under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured key."""
        selected = X[self.key]
        return selected


class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one column, keeping it tabular.

    ``transform`` returns ``X[[key]]`` — the key is wrapped in a list, so a
    DataFrame input gives back a one-column frame rather than a bare column.
    Intended for numeric columns.
    """

    def __init__(self, key):
        # Stored under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by a one-element list holding the key."""
        wanted = [self.key]
        return X[wanted]


# class OHEEncoder(BaseEstimator, TransformerMixin):
#     def __init__(self, key):
#         self.key = key
#         self.columns = []

#     def fit(self, X, y=None):
#         self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
#         return self

#     def transform(self, X):
#         X = pd.get_dummies(X, prefix=self.key)
#         test_columns = [col for col in X.columns]
#         for col_ in test_columns:
#             if col_ not in self.columns:
#                 X[col_] = 0
#         return X[self.columns]

from sklearn.preprocessing import StandardScaler

# continuos_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
# cat_cols = ['gender', 'cholesterol']
# base_cols = ['gluc', 'smoke', 'alco', 'active']

base_cols = [
    'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active']

# continuos_transformers = []
# cat_transformers = []
base_transformers = []

# for cont_col in continuos_cols:
#     transfomer = Pipeline([('selector', NumberSelector(key=cont_col)),
#                            ('standard', StandardScaler())])
#     continuos_transformers.append((cont_col, transfomer))

# for cat_col in cat_cols:
#     cat_transformer = Pipeline([('selector', ColumnSelector(key=cat_col)),
#                                 ('ohe', OHEEncoder(key=cat_col))])
#     cat_transformers.append((cat_col, cat_transformer))

# Wrap every base column in its own single-step selector pipeline and
# collect the results as (name, transformer) pairs.
base_transformers.extend(
    (column, Pipeline([('selector', NumberSelector(key=column))]))
    for column in base_cols
)

In [24]:
feature_processing

NameError: name 'feature_processing' is not defined

Теперь объединим все наши трансформеры с помощью FeatureUnion

In [25]:
from sklearn.pipeline import FeatureUnion

# Only the pass-through base transformers are combined; the
# `continuos_transformers` and `cat_transformers` groups remain disabled.
feats = FeatureUnion(transformer_list=base_transformers)
feature_processing = Pipeline(steps=[('feats', feats)])

# The trailing semicolon keeps the notebook from echoing the result.
feature_processing.fit_transform(X_train);

Добавим классификатор и запустим кросс-валидацию

In [26]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# classifier = RandomForestClassifier(n_estimators=200,
#                                     random_state=42)
classifier = XGBClassifier(random_state=42)

In [27]:
# %%time
# from sklearn.model_selection import train_test_split,KFold,GridSearchCV


# params = {'n_estimators': [50, 100, 200, 1000], 'max_depth': [3, 5, 7, 10]}
# gs = GridSearchCV(rf_model,
#                   params,
#                   scoring='roc_auc',
#                   cv=KFold(n_splits=3, random_state=21, shuffle=True),
#                   n_jobs=-1)
# gs.fit(X_train, y_train)

In [28]:
# gs.best_params_

In [29]:
# classifier = RandomForestClassifier(n_estimators=200,
#                                     random_state=42)

In [30]:
# classifier = Pipeline([
#     ('features', feats),
#     ('classifier', RandomForestClassifier(n_estimators =
#                                           random_state=42)),
# ])

In [31]:
%%time
# 16-fold cross-validation on the training split, scored by ROC AUC.
cv_scores = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=16,
)
cv_score, cv_score_std = cv_scores.mean(), cv_scores.std()
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

# Refit the classifier on the whole training split, then keep column 1 of
# the predicted probabilities for the held-out test rows.
classifier.fit(X_train, y_train)
test_probabilities = classifier.predict_proba(X_test)
y_score = test_probabilities[:, 1]

CV score is 0.7869941438142204+-0.00803825856704992
Wall time: 37.3 s


In [32]:
# Weight of recall relative to precision in the F-beta score (1 -> F1).
b = 1
precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)

# F-beta at every point of the curve. Where precision and recall are both
# zero the ratio is 0/0; the score is defined as 0 there instead of NaN,
# because np.argmax would otherwise return the position of the NaN.
numerator = (1 + b**2) * (precision * recall)
denominator = b**2 * precision + recall
fscore = np.divide(
    numerator,
    denominator,
    out=np.zeros_like(denominator, dtype=float),
    where=denominator != 0,
)

# precision/recall contain one more point than `thresholds` (the final point
# has no threshold), so search only the indices that `thresholds` covers.
# NOTE(review): the threshold is selected on the test split, so the reported
# F-score is likely optimistic — consider choosing it on validation data.
ix = np.argmax(fscore[:len(thresholds)])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.329106, F-Score=0.729, Precision=0.644, Recall=0.842


In [33]:
classifier.predict(X_test)

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

In [34]:
X_test.loc[[56267, 5719], :]

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active
56267,19815,0,164,54.0,110.0,80.0,0,0,1
5719,21149,0,153,54.0,120.0,80.0,0,0,1


In [40]:
# Hand-made single-row frame used for a manual prediction check; the columns
# follow the same order as the training features.
# NOTE(review): the training data stores `age` in days (the column mean is
# about 19469), so 18 looks like a value in years and is far below the
# training minimum — confirm whether 18 * 365 was intended.
X_test_2 = pd.DataFrame(
    {
        'age': 18,
        # `gender` uses the re-encoded 0/1 values, not the raw 1/2 codes.
        'gender': 1,
        'height': 164,
        'weight': 54.0,
        'ap_hi': 200,
        'ap_lo': 80,
        'smoke': 0,
        'alco': 0,
        'active': 1
    },
    index=[80282])

In [41]:
classifier.predict_proba(X_test_2)

array([[0.24938053, 0.7506195 ]], dtype=float32)

In [43]:
import pickle 

# Destination file for the serialized classifier.
MODEL_FILE_PATH = 'model_xgb_final.pkl'

# Persist the fitted model; the file is opened in binary write mode.
with open(MODEL_FILE_PATH, mode='wb') as model_file:
    pickle.dump(classifier, model_file)

In [44]:
# загрузка модели

# with open(MODEL_FILE_PATH,'rb') as file:
#     model_load=pickle.load(file)