In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [3]:
train_data=pd.read_csv('heart_adapt_train.csv')
test_data=pd.read_csv('heart_adapt_test.csv')

In [4]:
train_data.head()
train_data.info()
train_data.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589 entries, 0 to 588
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             533 non-null    float64
 1   Sex             589 non-null    object 
 2   ChestPainType   589 non-null    object 
 3   RestingBP       588 non-null    float64
 4   Cholesterol     462 non-null    float64
 5   FastingBS       589 non-null    int64  
 6   RestingECG      589 non-null    object 
 7   MaxHR           589 non-null    int64  
 8   ExerciseAngina  589 non-null    object 
 9   Oldpeak         589 non-null    float64
 10  ST_Slope        589 non-null    object 
 11  HeartDisease    589 non-null    int64  
dtypes: float64(4), int64(3), object(5)
memory usage: 55.3+ KB


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,533.0,589,589,588.0,462.0,589.0,589,589.0,589,589.0,589,589.0
unique,,2,4,,,,3,,2,,3,
top,,M,ASY,,,,Normal,,N,,Flat,
freq,,477,348,,,,345,,325,,325,
mean,54.195122,,,133.358844,245.632035,0.258065,,134.893039,,0.937521,,0.646859
std,9.532661,,,18.851852,58.599184,0.437942,,24.942596,,1.071318,,0.478352
min,28.0,,,80.0,85.0,0.0,,63.0,,-2.6,,0.0
25%,48.0,,,120.0,209.0,0.0,,117.0,,0.0,,0.0
50%,55.0,,,130.0,240.0,0.0,,135.0,,0.8,,1.0
75%,61.0,,,144.0,279.75,1.0,,154.0,,1.6,,1.0


In [5]:
test_data.head()
test_data.info()
test_data.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197 entries, 0 to 196
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             177 non-null    float64
 1   Sex             197 non-null    object 
 2   ChestPainType   197 non-null    object 
 3   RestingBP       197 non-null    float64
 4   Cholesterol     160 non-null    float64
 5   FastingBS       197 non-null    int64  
 6   RestingECG      197 non-null    object 
 7   MaxHR           197 non-null    int64  
 8   ExerciseAngina  197 non-null    object 
 9   Oldpeak         197 non-null    float64
 10  ST_Slope        197 non-null    object 
 11  HeartDisease    197 non-null    int64  
dtypes: float64(4), int64(3), object(5)
memory usage: 18.6+ KB


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,177.0,197,197,197.0,160.0,197.0,197,197.0,197,197.0,197,197.0
unique,,2,4,,,,3,,2,,3,
top,,M,ASY,,,,Normal,,N,,Flat,
freq,,160,121,,,,120,,107,,103,
mean,54.677966,,,132.614213,248.0625,0.228426,,134.812183,,1.079188,,0.64467
std,8.740789,,,15.97841,65.717107,0.420888,,24.557933,,1.173006,,0.479833
min,35.0,,,95.0,100.0,0.0,,60.0,,-0.5,,0.0
25%,48.0,,,120.0,207.75,0.0,,118.0,,0.0,,0.0
50%,55.0,,,130.0,237.5,0.0,,134.0,,1.0,,1.0
75%,61.0,,,140.0,275.25,0.0,,152.0,,1.8,,1.0


In [6]:
expected_target_colum="HeartDisease"
assert expected_target_colum in train_data.columns, f'Целевой признак {expected_target_colum}'
assert expected_target_colum in test_data.columns, f"Целевой признак {expected_target_colum}"

In [7]:
X_train=train_data.drop(columns=expected_target_colum)
y_train=train_data[expected_target_colum]
X_test=test_data.drop(columns=expected_target_colum)
y_test=test_data[expected_target_colum]

In [8]:
numeric_features=['Age', 'RestingBP', 'Cholesterol', 'MaxHR','Oldpeak' ]
categorial_features=['Sex', 'ChestPainType', 'RestingECG','ExerciseAngina', 'ST_Slope']

In [9]:
preprocessor=ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        ('cat', OneHotEncoder(), categorial_features)
    ])

In [10]:
pipeline=Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])

In [17]:
param_grid={
    'classifier__n_estimators':[50, 100, 200],
    'classifier__max_depth':[None, 10, 20, 30],
    'classifier__min_samples_split':[2, 5, 10]
}

grid_search=GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [18]:
best_pipeline=grid_search.best_estimator_
y_pred_proba=best_pipeline.predict_proba(X_test)[:,1]

In [19]:
roc_auc=roc_auc_score(y_test, y_pred_proba)
print(roc_auc)

0.9006749156355455
