In [114]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [115]:
data = pd.read_csv('dataset_57_hypothyroid.csv')
data.replace('?', np.nan, inplace=True)
# Параметры ниже константны (TBG = NaN)
data.drop(['TBG_measured', 'TBG'], axis=1, inplace=True)
data.dropna(subset=['Class'], axis=0, inplace=True)

In [116]:
data.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,referral_source,Class
0,41,F,f,f,f,f,f,f,f,f,...,t,2.5,t,125,t,1.14,t,109.0,SVHC,negative
1,23,F,f,f,f,f,f,f,f,f,...,t,2.0,t,102,f,,f,,other,negative
2,46,M,f,f,f,f,f,f,f,f,...,f,,t,109,t,0.91,t,120.0,other,negative
3,70,F,t,f,f,f,f,f,f,f,...,t,1.9,t,175,f,,f,,other,negative
4,70,F,f,f,f,f,f,f,f,f,...,t,1.2,t,61,t,0.87,t,70.0,SVI,negative


In [117]:
for col in data.columns:
    data[col] = pd.to_numeric(data[col], errors='ignore')

In [118]:
report = ProfileReport(data)
# report.to_notebook_iframe()

## Выводы по EDA

Target в выборке несбалансирован. Accuracy для метрики не подойдет. Так как задача касается здоровья, наилучшим вариантом будут метрики с несимметричными ошибками точность и полнота. Числовые значения будут заполены средним значением, категориальные most frequent. <br>
Категориальные признаки будут заменены с  использованием One-Hot encoder <br>
В данных также есть повторяющиеся строки, от которых следует избавиться

In [119]:
bool_cols = [col for col in data.columns if data[col].nunique() == 2 and data[col][0] in ['t', 'f']]
for col in bool_cols:
    data[col] = data[col] == 't'

In [120]:
data.drop_duplicates()
y = data['Class']
data.drop('Class', axis = 1, inplace=True)


In [139]:
X_train, X_valid, y_train, y_valid = train_test_split(data, y, stratify=y)

In [140]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

## Pipeline для предобработки данных

### Состоит из:
<ul>
    <li>Заполнение пропусков в числовых и категориальных значений</li>
    <li>Кодирование категориальных признаков</li>
    <li>Далее Pipeline соберется в другой, содержащий модель</li>
</ul>

In [166]:
num_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',  StandardScaler())
    
])
cat_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

cat_cols = [col for col in data.columns if data[col].dtype =='object']
num_cols = [col for col in data.columns if data[col].dtype in ['int64', 'float64']]



preprocessor = ColumnTransformer(
transformers =[
    ('num', num_trans, num_cols),
    ('cat', cat_trans, cat_cols),
], remainder='passthrough')

In [167]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import recall_score, precision_score, log_loss, f1_score


# логистическая регрессия без регуляризации
model = LogisticRegression(penalty='none')
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

logistic_pipeline.fit(X_train, y_train)

preds = logistic_pipeline.predict(X_valid)
prob_preds=logistic_pipeline.predict_proba(X_valid)


In [168]:
print('logloss ', log_loss(y_valid, prob_preds))
print('Полнота ', recall_score(y_valid, preds, average='macro'))
print('Точность ', precision_score(y_valid, preds, average='macro'))
print('F1 ', f1_score(y_valid, preds, average='macro'))

logloss  0.16195918293890094
Полнота  0.5711925287356322
Точность  0.6316940592364322
F1  0.597603121516165


In [169]:
logistic_pipeline.steps

[('preprocessor',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('num',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='median')),
                                                   ('scaler', StandardScaler())]),
                                   ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']),
                                  ('cat',
                                   Pipeline(steps=[('imputer',
                                                    SimpleImputer(strategy='most_frequent')),
                                                   ('encoder',
                                                    OneHotEncoder(handle_unknown='ignore'))]),
                                   ['sex', 'referral_source'])])),
 ('model', LogisticRegression(penalty='none'))]

## Логистическая регрессия с разными видами регуляризации

### L2

In [170]:
model = LogisticRegression(penalty='l2')
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
logistic_pipeline.fit(X_train, y_train)
preds = logistic_pipeline.predict(X_valid)
prob_preds = logistic_pipeline.predict_proba(X_valid)
print('logloss', log_loss(y_valid, prob_preds))
print('Полнота ', recall_score(y_valid, preds, average='macro'))
print('Точность ',precision_score(y_valid, preds, average='macro'))
print('F1 ', f1_score(y_valid, preds, average='macro'))


logloss 0.15734298918820386
Полнота  0.4678879310344828
Точность  0.6316626530202689
F1  0.5235750173250173


###  Elastic net

In [171]:
model = LogisticRegression(solver = 'saga' ,penalty='elasticnet', l1_ratio=0.5)
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
logistic_pipeline.fit(X_train, y_train)
preds = logistic_pipeline.predict(X_valid)
prob_preds = logistic_pipeline.predict_proba(X_valid)
print('logloss', log_loss(y_valid, prob_preds))
print('Полнота ', recall_score(y_valid, preds, average='macro'))
print('Точность ',precision_score(y_valid, preds, average='macro'))
print('F1 ', f1_score(y_valid, preds, average='macro'))

logloss 0.1599561320207567
Полнота  0.4519755747126437
Точность  0.6101275791463223
F1  0.5045386704509309


### L1

In [172]:
model = LogisticRegression(solver = 'saga' ,penalty='l1')
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
logistic_pipeline.fit(X_train, y_train)
prob_preds = logistic_pipeline.predict_proba(X_valid)
print('logloss', log_loss(y_valid, prob_preds))
print('Полнота ', recall_score(y_valid, preds, average='macro'))
print('Точность ',precision_score(y_valid, preds, average='macro'))
print('F1 ', f1_score(y_valid, preds, average='macro'))

logloss 0.1600471598218187
Полнота  0.4519755747126437
Точность  0.6101275791463223
F1  0.5045386704509309


## Выводы

Так как в задаче речь идет о здоровье человека, в этом случае будет выгодным взять модель без регуляризации, так как этот вариант имеет наибольшую полноту, то есть наилучшим образом определяет больных. 

## Кросс-валидация для L2

In [178]:
from sklearn.model_selection import cross_val_score

model = LogisticRegression(penalty='l2')
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
scores = cross_val_score(logistic_pipeline, data, y, cv=5, scoring='f1_macro')
scores

array([0.57150255, 0.4819837 , 0.73578074, 0.76659627, 0.80144852])

# KNN

In [151]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

model = KNeighborsClassifier(algorithm='kd_tree')
KNN_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

KNN_pipeline.fit(X_train, y_train)
preds=KNN_pipeline.predict(X_valid)
print('Полнота ', recall_score(y_valid, preds, average='macro'))
print('Точность ',precision_score(y_valid, preds, average='macro'))
print('F1 ', f1_score(y_valid, preds, average='macro'))


Полнота  0.3845545977011494
Точность  0.5471362876254181
F1  0.425310178219059


In [159]:
model = KNeighborsClassifier(algorithm='brute')
KNN_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

KNN_pipeline.fit(X_train, y_train)
preds=KNN_pipeline.predict(X_valid)
prob_preds = KNN_pipeline.predict_proba(X_valid)
print('logloss', log_loss(y_valid, prob_preds))
print('Полнота ', recall_score(y_valid, preds, average='macro'))
print('Точность ',precision_score(y_valid, preds, average='macro'))
print('F1 ', f1_score(y_valid, preds, average='macro'))


logloss 1.1131013413175477
Полнота  0.3845545977011494
Точность  0.5471362876254181
F1  0.425310178219059


In [153]:
model = KNeighborsClassifier(algorithm='ball_tree')
KNN_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

KNN_pipeline.fit(X_train, y_train)
preds=KNN_pipeline.predict(X_valid)

print('Полнота ', recall_score(y_valid, preds, average='macro'))
print('Точность ',precision_score(y_valid, preds, average='macro'))
print('F1 ', f1_score(y_valid, preds, average='macro'))


Полнота  0.3845545977011494
Точность  0.5471362876254181
F1  0.425310178219059
