In [38]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import warnings
warnings.filterwarnings('ignore')

In [15]:
data = pd.read_csv('dataset_57_hypothyroid.csv')
data.replace('?', np.nan, inplace=True)
# Параметры ниже константны (TBG = NaN)
data.drop(['TBG_measured', 'TBG'], axis=1, inplace=True)
data.dropna(subset=['Class'], axis=0, inplace=True)

In [17]:
data.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,referral_source,Class
0,41,F,f,f,f,f,f,f,f,f,...,t,2.5,t,125,t,1.14,t,109.0,SVHC,negative
1,23,F,f,f,f,f,f,f,f,f,...,t,2.0,t,102,f,,f,,other,negative
2,46,M,f,f,f,f,f,f,f,f,...,f,,t,109,t,0.91,t,120.0,other,negative
3,70,F,t,f,f,f,f,f,f,f,...,t,1.9,t,175,f,,f,,other,negative
4,70,F,f,f,f,f,f,f,f,f,...,t,1.2,t,61,t,0.87,t,70.0,SVI,negative


In [18]:
for col in data.columns:
    data[col] = pd.to_numeric(data[col], errors='ignore')

In [24]:
report = ProfileReport(data)
report.to_notebook_iframe()

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=42.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




## Выводы по EDA

Target в выборке несбалансирован. Accuracy для метрики не подойдет. Так как задача касается здоровья, наилучшим вариантом будут метрики с несимметричными ошибками точность и полнота. Числовые значения будут заполены средним значением, категориальные most frequent. <br>
Категориальные признаки будут заменены с  использованием One-Hot encoder

In [22]:
cols_with_missing = [col for col in data.columns if data[col].isnull().any()]
cols_with_missing

['age', 'sex', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

In [25]:
y = data['Class']
data.drop('Class', axis = 1, inplace=True)

In [26]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

## Pipeline для предобработки данных

### Состоит из:
<ul>
    <li>Заполнение пропусков в числовых и категориальных значений</li>
    <li>Кодирование категориальных признаков</li>
    <li>Далее Pipeline соберется в другой, содержащий модель</li>
</ul>

In [28]:
num_trans = SimpleImputer(strategy='median')
cat_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

cat_cols = [col for col in data.columns if data[col].dtype =='object']
num_cols = [col for col in data.columns if data[col].dtype !='object']

preprocessor = ColumnTransformer(
transformers =[
    ('num', num_trans, num_cols),
    ('cat', cat_trans, cat_cols)
])

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score 

# логистическая регрессия без регуляризации
model = LogisticRegression(penalty='none')
binary_y = y!='negative'
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
X_train, X_valid, y_train, y_valid = train_test_split(data, binary_y)
logistic_pipeline.fit(X_train, y_train)
preds = logistic_pipeline.predict(X_valid)

In [45]:
print('Полнота ', recall_score(preds, y_valid))
print('Точность ',precision_score(preds, y_valid))

Полнота  0.9574468085106383
Точность  0.6428571428571429


## Логистическая регрессия с разными видами регуляризации

### L2

In [48]:
model = LogisticRegression(penalty='l2')
binary_y = y!='negative'
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
X_train, X_valid, y_train, y_valid = train_test_split(data, binary_y)
logistic_pipeline.fit(X_train, y_train)
preds = logistic_pipeline.predict(X_valid)
print('Полнота ', recall_score(preds, y_valid))
print('Точность ',precision_score(preds, y_valid))

Полнота  0.9696969696969697
Точность  0.3902439024390244


###  Elastic net

In [52]:
model = LogisticRegression(solver = 'saga' ,penalty='elasticnet', l1_ratio=0.5)
binary_y = y!='negative'
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
X_train, X_valid, y_train, y_valid = train_test_split(data, binary_y)
logistic_pipeline.fit(X_train, y_train)
preds = logistic_pipeline.predict(X_valid)
print('Полнота ', recall_score(preds, y_valid))
print('Точность ',precision_score(preds, y_valid))

Полнота  0.9069767441860465
Точность  0.5652173913043478


### L1

In [53]:
model = LogisticRegression(solver = 'saga' ,penalty='l1')
binary_y = y!='negative'
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
X_train, X_valid, y_train, y_valid = train_test_split(data, binary_y)
logistic_pipeline.fit(X_train, y_train)
preds = logistic_pipeline.predict(X_valid)
print('Полнота ', recall_score(preds, y_valid))
print('Точность ',precision_score(preds, y_valid))

Полнота  0.9
Точность  0.5454545454545454


## Выводы

Так как в задаче речь идет о здоровье человека, в этом случае будет выгодным взять модель с регуляризацией l2, так как этот вариант имеет наибольшую полноту, то есть наилучшим образом определяет больных.

# KNN

In [60]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

model = KNeighborsClassifier()
KNN_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

X_train, X_valid, y_train, y_valid = train_test_split(data, y)
KNN_pipeline.fit(X_train, y_train)
preds=KNN_pipeline.predict(X_valid)
print('F1 ', f1_score(preds, y_valid), average='micro')


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].