In [1]:
# Базовые импорты
import h2o
import pandas as pd
from h2o.estimators import H2ORandomForestEstimator, H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Инициализация H2O
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "17.0.16" 2025-07-15; OpenJDK Runtime Environment (build 17.0.16+8-Ubuntu-0ubuntu124.04.1); OpenJDK 64-Bit Server VM (build 17.0.16+8-Ubuntu-0ubuntu124.04.1, mixed mode, sharing)
  Starting server from /home/konstantin/anaconda3/lib/python3.12/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp__jd975d
  JVM stdout: /tmp/tmp__jd975d/h2o_konstantin_started_from_python.out
  JVM stderr: /tmp/tmp__jd975d/h2o_konstantin_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,6 months and 2 days
H2O_cluster_name:,H2O_from_python_konstantin_p0ql0h
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.859 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [2]:
train = h2o.import_file("train.csv")
test = h2o.import_file("test.csv")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [4]:
# Правильно определяем предикторы (исключаем целевую и дубликаты)
response = "HeartDisease"
predictors = [col for col in train.columns if col != response]

# Преобразуем целевую в фактор
train[response] = train[response].asfactor()
test[response] = test[response].asfactor()

print(f"Predictors: {len(predictors)} features")
print(f"Response: {response}")

Predictors: 20 features
Response: HeartDisease


In [9]:
# 1. Логистическая регрессия (GLM)
glm_model = H2OGeneralizedLinearEstimator(
    family="binomial",
    model_id="glm_heart_disease_corrected",
    lambda_search=True,
    nfolds=5
)

glm_model.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=test
)

print("=== GLM RESULTS (CORRECTED) ===")
glm_perf = glm_model.model_performance(test)
print(f"AUC: {glm_perf.auc()}")

glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
=== GLM RESULTS (CORRECTED) ===
AUC: 0.9224951727097189


In [5]:
# 2. Случайный лес (RF)
rf_model = H2ORandomForestEstimator(
    model_id="rf_heart_disease_corrected",
    ntrees=100,
    max_depth=20,
    nfolds=5,
    seed=42
)

rf_model.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=test
)

print("=== RANDOM FOREST RESULTS (CORRECTED) ===")
rf_perf = rf_model.model_performance(test)
print(f"AUC: {rf_perf.auc()}")

drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
=== RANDOM FOREST RESULTS (CORRECTED) ===
AUC: 0.9350729457198026


In [6]:
# 3. Градиентный бустинг (GBM)
gbm_model = H2OGradientBoostingEstimator(
    model_id="gbm_heart_disease_corrected", 
    ntrees=100,
    max_depth=6,
    learn_rate=0.1,
    nfolds=5,
    seed=42
)

gbm_model.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=test
)

print("=== GBM RESULTS (CORRECTED) ===") 
gbm_perf = gbm_model.model_performance(test)
print(f"AUC: {gbm_perf.auc()}")

gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
=== GBM RESULTS (CORRECTED) ===
AUC: 0.9102124007723664


In [11]:
# Альтернативный способ получения метрик
print(f"\n=== ALTERNATIVE METRICS ===")
print("GLM Metrics:")
print(f"Accuracy: {glm_perf.accuracy()[0][1]:.4f}")
print(f"Precision: {glm_perf.precision()[0][1]:.4f}")
print(f"Recall: {glm_perf.recall()[0][1]:.4f}")

print("\nRF Metrics:")
print(f"Accuracy: {rf_perf.accuracy()[0][1]:.4f}")
print(f"Precision: {rf_perf.precision()[0][1]:.4f}")
print(f"Recall: {rf_perf.recall()[0][1]:.4f}")

print("\nGBM Metrics:")
print(f"Accuracy: {gbm_perf.accuracy()[0][1]:.4f}")
print(f"Precision: {gbm_perf.precision()[0][1]:.4f}")
print(f"Recall: {gbm_perf.recall()[0][1]:.4f}")


=== ALTERNATIVE METRICS ===
GLM Metrics:
Accuracy: 0.8768
Precision: 1.0000
Recall: 1.0000

RF Metrics:
Accuracy: 0.8768
Precision: 1.0000
Recall: 1.0000

GBM Metrics:
Accuracy: 0.8587
Precision: 1.0000
Recall: 1.0000


In [10]:
# Получаем предсказания вероятностей от всех моделей
glm_pred = glm_model.predict(test)
rf_pred = rf_model.predict(test)
gbm_pred = gbm_model.predict(test)

# Блендинг путем усреднения вероятностей класса 1
blended_probs = (glm_pred["p1"] + rf_pred["p1"] + gbm_pred["p1"]) / 3

# Создаем DataFrame с результатами блендинга
blended_result = test[response].cbind(blended_probs)
blended_result.set_names(["actual", "blended_probability"])

print(f"\n=== BLENDING RESULTS ===")

# Создаем бинарные предсказания из вероятностей (порог 0.5)
blended_pred = (blended_probs > 0.5).ifelse(1, 0)
blended_result = blended_result.cbind(blended_pred)
blended_result.set_names(["actual", "blended_probability", "blended_pred"])

# Оцениваем accuracy блендинга
blended_accuracy = (blended_result["actual"] == blended_result["blended_pred"]).mean()[0]
print(f"Blended Model Accuracy: {blended_accuracy:.4f}")

# Precision и Recall для блендинга (правильный расчет)
blended_cm = blended_result["actual"].cbind(blended_pred)

# Преобразуем в pandas для удобства расчета
blended_df = blended_cm.as_data_frame()

tp = len(blended_df[(blended_df['actual'] == 1) & (blended_df['C1'] == 1)])
fp = len(blended_df[(blended_df['actual'] == 0) & (blended_df['C1'] == 1)])
fn = len(blended_df[(blended_df['actual'] == 1) & (blended_df['C1'] == 0)])

blended_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
blended_recall = tp / (tp + fn) if (tp + fn) > 0 else 0

print(f"Blended Precision: {blended_precision:.4f}")
print(f"Blended Recall: {blended_recall:.4f}")

# Сравнение с индивидуальными моделями
print(f"\n=== COMPARISON ===")
print(f"GLM  - Accuracy: 0.8768, Precision: 1.0000, Recall: 1.0000")
print(f"RF   - Accuracy: 0.8732, Precision: 1.0000, Recall: 1.0000") 
print(f"GBM  - Accuracy: 0.8587, Precision: 1.0000, Recall: 1.0000")
print(f"BLEND - Accuracy: {blended_accuracy:.4f}, Precision: {blended_precision:.4f}, Recall: {blended_recall:.4f}")

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%

=== BLENDING RESULTS ===
Blended Model Accuracy: 0.8514
Blended Precision: 0.8462
Blended Recall: 0.9051

=== COMPARISON ===
GLM  - Accuracy: 0.8768, Precision: 1.0000, Recall: 1.0000
RF   - Accuracy: 0.8732, Precision: 1.0000, Recall: 1.0000
GBM  - Accuracy: 0.8587, Precision: 1.0000, Recall: 1.0000
BLEND - Accuracy: 0.8514, Precision: 0.8462, Recall: 0.9051





In [12]:
print("\n=== FEATURE IMPORTANCE ===")

print("\nRandom Forest Feature Importance:")
rf_importance = rf_model.varimp(use_pandas=True)
print(rf_importance.head(10))

print("\nGBM Feature Importance:") 
gbm_importance = gbm_model.varimp(use_pandas=True)
print(gbm_importance.head(10))

print("\nGLM Coefficients (первые 10 по модулю):")
glm_coef = glm_model.coef()
# Сортируем по абсолютному значению
sorted_coef = sorted(glm_coef.items(), key=lambda x: abs(x[1]), reverse=True)
for feature, coef in sorted_coef[:10]:
    print(f"{feature}: {coef:.4f}")


=== FEATURE IMPORTANCE ===

Random Forest Feature Importance:
      variable  relative_importance  scaled_importance  percentage
0           Up          1985.530029           1.000000    0.172227
1         Flat          1417.010620           0.713669    0.122913
2      Oldpeak          1226.107178           0.617521    0.106354
3          ASY          1223.855835           0.616387    0.106158
4        MaxHR           974.127014           0.490613    0.084497
5          Age           760.412109           0.382977    0.065959
6    RestingBP           709.807251           0.357490    0.061569
7  Cholesterol           639.535156           0.322098    0.055474
8            Y           468.829102           0.236123    0.040667
9            N           456.634888           0.229981    0.039609

GBM Feature Importance:
      variable  relative_importance  scaled_importance  percentage
0           Up           312.367950           1.000000    0.411036
1      Oldpeak            99.235588      

In [13]:
print("=== KEY CLINICAL INSIGHTS ===")
print("🎯 КРИТИЧЕСКИЕ ДИАГНОСТИЧЕСКИЕ МАРКЕРЫ:")
print("1. ЭКГ-ПАРАМЕТРЫ: ST_Slope (Up/Flat) + Oldpeak - основные предикторы")
print("2. БЕССИМПТОМНАЯ ИШЕМИЯ: ASY chest pain - высокий риск")
print("3. ЛАБОРАТОРНЫЕ: FastingBS > 120 mg/dL - значимый фактор")
print("4. ДЕМОГРАФИЯ: Мужской пол (M) - фактор риска")

print("\n=== RECOMMENDATIONS FOR POWER BI DASHBOARD ===")
print("📈 ОСНОВНЫЕ ВИЗУАЛИЗАЦИИ:")
print("1. Важность признаков в ML-модели (бар-чарт)")
print("2. Распределение ST_Slope по диагнозам (pie chart)")
print("3. Связь ChestPainType с HeartDisease (stacked bar)")
print("4. Oldpeak vs HeartDisease (box plot)")
print("5. Демографические профили риска (heatmap)")

=== KEY CLINICAL INSIGHTS ===
🎯 КРИТИЧЕСКИЕ ДИАГНОСТИЧЕСКИЕ МАРКЕРЫ:
1. ЭКГ-ПАРАМЕТРЫ: ST_Slope (Up/Flat) + Oldpeak - основные предикторы
2. БЕССИМПТОМНАЯ ИШЕМИЯ: ASY chest pain - высокий риск
3. ЛАБОРАТОРНЫЕ: FastingBS > 120 mg/dL - значимый фактор
4. ДЕМОГРАФИЯ: Мужской пол (M) - фактор риска

=== RECOMMENDATIONS FOR POWER BI DASHBOARD ===
📈 ОСНОВНЫЕ ВИЗУАЛИЗАЦИИ:
1. Важность признаков в ML-модели (бар-чарт)
2. Распределение ST_Slope по диагнозам (pie chart)
3. Связь ChestPainType с HeartDisease (stacked bar)
4. Oldpeak vs HeartDisease (box plot)
5. Демографические профили риска (heatmap)


In [14]:
# Создаем DataFrame с важностью признаков
feature_importance_data = {
    'feature': ['ST_Slope_Up', 'ChestPainType_ASY', 'Oldpeak', 'ST_Slope_Flat', 
                'MaxHR', 'FastingBS', 'ChestPainType_ATA', 'Sex_M', 'Age', 'RestingBP'],
    'importance_score': [0.172, 0.106, 0.105, 0.123, 0.086, 0.035, 0.028, 0.025, 0.066, 0.062],
    'coefficient': [-1.115, 1.145, 0.452, 0.909, -0.210, 0.964, -1.066, 0.675, 0.189, 0.105]
}

feature_importance_df = pd.DataFrame(feature_importance_data)
feature_importance_df.to_csv('feature_importance_glm.csv', index=False)

In [15]:
# Метрики GLM модели
metrics_data = {
    'metric': ['Accuracy', 'Precision', 'Recall', 'AUC', 'F1-Score'],
    'value': [0.8768, 1.0000, 1.0000, 0.9219, 1.0000],
    'model': ['GLM', 'GLM', 'GLM', 'GLM', 'GLM']
}

metrics_df = pd.DataFrame(metrics_data)
metrics_df.to_csv('model_metrics_glm.csv', index=False)

In [16]:
# Получаем предсказания из GLM модели
glm_pred = glm_model.predict(test)

# Создаем DataFrame с предсказаниями
predictions_df = test[['HeartDisease']].cbind(glm_pred[['predict', 'p0', 'p1']])
predictions_pd = predictions_df.as_data_frame()

# Сохраняем в CSV
predictions_pd.to_csv('predictions_glm.csv', index=False)

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%



