In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Загружаем данные и сразу переводим общие столбцы в двух датасетах к одному типу данных

heart_path = "heart (1).csv"
cardio_path = "baseline/daniil_vasilev_baseline_cml/data/cardio_train_correct.parquet"

heart_data = pd.read_csv(heart_path)
heart_data.columns = heart_data.columns.str.lower()
heart_data['cholesterol_int'] = heart_data['cholesterol'].copy()  # так как в одной из моделей нужен этот признак в числовом виде, а не булевом
heart_data['cholesterol'] = heart_data['cholesterol'].apply(lambda value: 0 if value <= 210 else 1)  # а в другой модели нужно булевое значение его

cardio_data = pd.read_parquet(cardio_path).drop('id', axis=1)
cardio_data['age'] = cardio_data['age'].astype(int)
cardio_data = cardio_data.rename(columns={'cardio': 'target'})
heart_data.shape, cardio_data.shape

((1025, 15), (70000, 12))

In [3]:
full_df = heart_data.merge(cardio_data, on=['age', 'cholesterol', 'target']).reset_index(drop=True)
print(f'Получили {full_df.shape[0]} строк после объединения двух датасетов')
full_df.head()

Получили 410127 строк после объединения двух датасетов


Unnamed: 0,age,sex,cheastpaintype,restingbp,cholesterol,fastingbs,restingecg,maxhr,exerciseangina,oldpeak,...,cholesterol_int,gender,height,weight,ap_hi,ap_lo,gluc,smoke,alco,active
0,52,1,0,125,1,0,1,168,0,1.0,...,212,0,167.0,80.0,190,90,0,0,1,0
1,52,1,0,125,1,0,1,168,0,1.0,...,212,0,177.980827,63.0,110,70,0,0,0,1
2,52,1,0,125,1,0,1,168,0,1.0,...,212,1,173.0,75.0,130,80,0,1,1,0
3,52,1,0,125,1,0,1,168,0,1.0,...,212,0,167.0,70.0,110,70,0,0,0,0
4,52,1,0,125,1,0,1,168,0,1.0,...,212,0,163.0,63.0,120,80,1,0,0,1


In [4]:
X, y = full_df.drop('target', axis=1), full_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=full_df['target'])

In [5]:
heart_features = heart_data.columns.to_list().copy()
heart_features.remove('target')

cardio_features = cardio_data.columns.to_list().copy()
cardio_features.remove('target')

#### Модель датасета cardio_data (Дани)

In [None]:
xgb_params = {
            "n_estimators": 500,
            "learning_rate": 0.1,
            "max_depth": 3,
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "random_state": 12,
            "n_splits": 4
}

X_train_train, X_val, y_train_train, y_val = train_test_split(X_train[cardio_features], y_train, test_size=0.25, random_state=42)
model_xgb = XGBClassifier(**xgb_params)
model_xgb.fit(X_train_train, y_train_train, eval_set=[(X_val, y_val)], verbose=False)

In [143]:
pred_xgb = model_xgb.predict(X_test[cardio_features])
auc_xgb = roc_auc_score(y_test, pred_xgb)
print(f'ROC-AUC модели XGBoost = {round(auc_xgb, 3)}')

ROC-AUC модели XGBoost = 0.821


#### Модель датасета heart_data (Вани)

In [6]:
heart_X = X_train[heart_features].drop(columns=['cholesterol']).rename(columns={'cholesterol_int': 'cholesterol'})
model_rf = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=42)
model_rf.fit(heart_X, y_train)

In [7]:
heart_X_test = X_test[heart_features].drop(columns=['cholesterol']).rename(columns={'cholesterol_int': 'cholesterol'})
pred_rf = model_rf.predict(heart_X_test)
auc_rf = roc_auc_score(y_test, pred_rf)
print(f'ROC-AUC модели случайного леса = {round(auc_rf, 3)}')

ROC-AUC модели случайного леса = 1.0


In [14]:
# pd.DataFrame(model_rf.feature_importances_, index=heart_X.columns, columns=['feature_importance']).sort_values(by='feature_importance', ascending=False)

#### Ансамблируем

In [145]:
# с порогом из безлайна - 0.5
prob_xgb = model_xgb.predict_proba(X_test[cardio_features])[:, 1]
prob_rf = model_rf.predict_proba(heart_X_test)[:, 1]
ansambl_pred = (prob_xgb + prob_rf) / 2
THRESHOLD = 0.5
final_pred = [1 if val >= THRESHOLD else 0 for val in ansambl_pred]
auc_ansambl = roc_auc_score(y_test, final_pred)
print(f'ROC-AUC ансамбля с порогом по умолчанию = {round(auc_ansambl, 3)}')

ROC-AUC ансамбля с порогом по умолчанию = 1.0


In [147]:
# будем перебирать порог на валидационной выборке
X_train_train, X_val, y_train_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
results = []
for THRESHOLD in np.arange(0.05, 1.01, 0.05):
    heart_X_val = X_val[heart_features].drop(columns=['cholesterol']).rename(columns={'cholesterol_int': 'cholesterol'})
    prob_xgb = model_xgb.predict_proba(X_val[cardio_features])[:, 1]
    prob_rf = model_rf.predict_proba(heart_X_val)[:, 1]
    ansambl_pred = (prob_xgb + prob_rf) / 2
    final_pred = [1 if val >= THRESHOLD else 0 for val in ansambl_pred]
    auc = roc_auc_score(y_val, final_pred)
    accuracy = accuracy_score(y_val, final_pred)
    results.append((THRESHOLD, auc, accuracy))

results_df = pd.DataFrame(results, columns=['threshold', 'roc_auc', 'accuracy']).sort_values(by='roc_auc', ascending=False)
results_df

Unnamed: 0,threshold,roc_auc,accuracy
9,0.5,1.0,1.0
10,0.55,0.99777,0.997568
8,0.45,0.992566,0.993238
11,0.6,0.986308,0.985071
7,0.4,0.9787,0.980624
12,0.65,0.971066,0.968452
6,0.35,0.961045,0.964564
13,0.7,0.947404,0.942652
5,0.3,0.936671,0.942392
14,0.75,0.92491,0.918126
