# Importovanie potrebných knižníc

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
import shap
from imblearn.over_sampling import SMOTE
from hiclass.metrics import precision as h_precision, recall as h_recall, f1 as h_f1


# V tejto časti prebieha predspracovanie dát pre ich ďalšie využitie v modeli


In [1]:
def preprocess_data(file_path):
    """Load one wave's spreadsheet and derive the feature frame and targets.

    Only the clinically relevant columns are read from the Excel file. Rows
    without a recorded disease severity are dropped, ``Pohlavie`` is one-hot
    encoded (first level dropped), and two derived targets are built from the
    severity column.

    Parameters
    ----------
    file_path
        Location of the Excel workbook; forwarded to ``pd.read_excel``.

    Returns
    -------
    tuple
        ``(X, y_mortality, y_severity, y_true)`` where

        * ``X`` is the feature frame without the original severity column and
          without the two derived target columns,
        * ``y_mortality`` is 1 where the severity equals 3 and 0 otherwise,
        * ``y_severity`` is the severity with 1 -> 0, 2 -> 1 and 3 -> NaN,
        * ``y_true`` is the original severity column.
    """
    target_col = 'Závažnosť priebehu ochorenia'
    feature_cols = [
        'Vek',
        'Pohlavie',
        'Fajčenie',
        'Alkohol',
        'Hypertenzia',
        'Diabetes mellitus',
        'Kardiovaskulárne ochorenia',
        'Chronické respiračné ochorenia',
        'Renálne ochorenia',
        'Pečeňové ochorenia',
        'Onkologické ochorenia',
        'Imunosupresia',
    ]

    def _mortality_flag(level):
        # Severity level 3 is the "died" class of the original target.
        if level == 3:
            return 1
        return 0

    frame = pd.read_excel(file_path, usecols=feature_cols + [target_col])

    # Rows with a missing target cannot be used for training or evaluation.
    frame = frame.dropna(subset=[target_col])

    # One-hot encode the sex column; dropping the first level leaves a single
    # binary indicator when the column has two categories.
    frame = pd.get_dummies(frame, columns=['Pohlavie'], drop_first=True)

    # Derived targets: a binary mortality flag, and a severity label that is
    # undefined (NaN) for the level-3 rows.
    frame["Mortality"] = frame[target_col].apply(_mortality_flag)
    frame["Severity"] = frame[target_col].replace({3: np.nan, 1: 0, 2: 1})

    non_feature_cols = [target_col, 'Mortality', 'Severity']
    X = frame.drop(columns=non_feature_cols)

    return X, frame['Mortality'], frame['Severity'], frame[target_col]



# V tejto časti prebieha trénovanie modelu Random Forest s využitím pipeline a grid search optimalizácie

In [2]:

def train_random_forest(X, y, preprocessor, scoring='f1_weighted', random_state=42,
                        param_grid=None, cv=5, n_jobs=-1, verbose=2, resampler=None):
    """Grid-search a class-weighted random forest wrapped in an imblearn pipeline.

    The pipeline is ``preprocessor -> [resampler] -> RandomForestClassifier``.
    With the default arguments the behaviour is identical to the original
    hard-coded version: no resampling, 5-fold CV, the built-in grid of 576
    candidates, all cores, verbosity 2.

    Parameters
    ----------
    X, y
        Training features and target, passed unchanged to ``GridSearchCV.fit``.
    preprocessor
        Transformer used as the first pipeline step.
    scoring
        Scoring passed to ``GridSearchCV``.
    random_state
        Seed for the ``RandomForestClassifier``.
    param_grid
        Optional parameter grid keyed by pipeline parameter names
        (``'classifier__...'``). ``None`` selects the built-in grid below.
    cv
        Cross-validation setting forwarded to ``GridSearchCV``.
    n_jobs, verbose
        Forwarded to ``GridSearchCV``.
    resampler
        Optional sampler (for example ``SMOTE(random_state=random_state)``)
        inserted between preprocessing and the classifier. ``None`` (default)
        disables resampling; the notebook author noted that SMOTE gave worse
        results on this data.

    Returns
    -------
    The ``best_estimator_`` of the fitted search. The best parameters are
    printed as a side effect.
    """
    steps = [('preprocessor', preprocessor)]
    if resampler is not None:
        steps.append(('resampler', resampler))
    steps.append(
        ('classifier', RandomForestClassifier(random_state=random_state, class_weight='balanced'))
    )
    pipeline = ImbPipeline(steps)

    # Default hyper-parameter grid (3 * 4 * 2 * 3 * 2 * 2 * 2 = 576 candidates).
    # Built per call so a caller can never mutate a shared default.
    if param_grid is None:
        param_grid = {
            'classifier__n_estimators': [100, 150, 200],
            'classifier__max_depth': [None, 5, 10, 20],
            'classifier__min_samples_split': [2, 5],
            'classifier__min_samples_leaf': [1, 2, 4],
            'classifier__max_features': ['sqrt', 'log2'],
            'classifier__bootstrap': [True, False],
            'classifier__criterion': ['gini', 'entropy'],
        }

    # Exhaustive search over the grid with cross-validation.
    grid = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv,
        scoring=scoring,
        verbose=verbose,
        n_jobs=n_jobs
    )

    grid.fit(X, y)

    print("Best parameters:", grid.best_params_)
    return grid.best_estimator_

# V tejto časti prebieha vyhodnotenie predikcií modelu pomocou hierarchických metrík (knižnica hiclass).

In [4]:
def evaluate_predictions(y_true, y_pred):
    """Print hierarchical precision, recall and F1 for three-class predictions.

    Each flat class label is converted to a path in a two-level hierarchy
    before the hiclass metrics are computed:

    * ``1`` -> ``["0", "2"]``
    * ``2`` -> ``["0", "3"]``
    * ``3`` -> ``["1"]``

    Labels 1 and 2 therefore share the parent node ``"0"``, while label 3 is
    a top-level node of its own.

    Parameters
    ----------
    y_true, y_pred
        Iterables of class labels; every label must equal 1, 2 or 3.

    Raises
    ------
    ValueError
        If a label is not 1, 2 or 3. The check runs before any metric is
        computed, so a bad label can no longer reach hiclass as ``None``.
    """
    label_paths = {
        1: ("0", "2"),
        2: ("0", "3"),
        3: ("1",),
    }

    # Helper converting one flat class label into its hierarchy path.
    def convert_to_hiclass_format(label):
        try:
            # Return a fresh list for every sample.
            return list(label_paths[label])
        except (KeyError, TypeError):
            # KeyError: unknown value (including NaN); TypeError: unhashable
            # or non-comparable label objects.
            raise ValueError(
                f"Unsupported class label {label!r}; expected 1, 2 or 3."
            ) from None

    y_true_h = [convert_to_hiclass_format(label) for label in y_true]
    y_pred_h = [convert_to_hiclass_format(label) for label in y_pred]

    print("\n=== Hierarchical Metrics ===")
    print(f"H-Precision: {h_precision(y_true_h, y_pred_h):.4f}")
    print(f"H-Recall:    {h_recall(y_true_h, y_pred_h):.4f}")
    print(f"H-F1:        {h_f1(y_true_h, y_pred_h):.4f}")


# V tejto časti prebieha hlavná príprava, trénovanie a hodnotenie hierarchického modelu.
 Model je dvojúrovňový - najprv sa predpovedá pravdepodobnosť úmrtia (mortality), a ak pacient nebol predikovaný ako zomrelý, pokračuje sa druhým modelom (severity), ktorý určuje závažnosť ochorenia u preživších.

# Prvá vlna pandémie

In [5]:
# Load the first-wave data set.
X, y_mortality, y_severity, y_true = preprocess_data('../data/1vlna.xlsx')
feature_names = X.columns

# Train/test split (30 % test), stratified on the mortality target because
# the data are imbalanced.
X_train, X_test, y_train_mort, y_test_mort = train_test_split(
    X,
    y_mortality,
    test_size=0.3,
    random_state=42,
    stratify=y_mortality,
)

# Preprocessing: only the age column ('Vek') is standardised; all remaining
# columns are passed through unchanged.
preprocessor = ColumnTransformer([
    ('scaler', StandardScaler(), ['Vek']),
    ('passthrough', 'passthrough', X.columns.difference(['Vek'])),
])

# Stage 1: mortality classifier.
mort_pipeline = train_random_forest(X_train, y_train_mort, preprocessor)

# Out-of-fold (OOF) mortality predictions for the training rows.
oof_predictions = cross_val_predict(
    mort_pipeline, X_train, y_train_mort, method='predict', cv=5
)

# Training data for stage 2 (the severity classifier): keep only training rows
# that the mortality model predicted as survivors (0) AND that have a defined
# severity label, i.e. patients who really survived and whose recorded
# severity level is something other than death.
survivor_mask_train_pred_oof = oof_predictions == 0
y_train_severity_mask = y_severity.loc[X_train.index].notna()
relevant_mask_train = y_train_severity_mask & survivor_mask_train_pred_oof

X_train_severity = X_train.loc[relevant_mask_train]
y_train_severity = y_severity.loc[X_train.index].loc[relevant_mask_train]

# Stage 2: severity classifier trained on the filtered training rows.
severity_pipeline = train_random_forest(X_train_severity, y_train_severity, preprocessor)

# Mortality prediction on the test set.
y_pred_mortality_test = mort_pipeline.predict(X_test)
survivor_mask_test = y_pred_mortality_test == 0

# Test rows for stage 2: only those predicted as survivors. Rows that never
# reach the second model keep NaN in y_pred_severity.
X_test_severity = X_test.loc[survivor_mask_test]
y_pred_severity = np.full(len(X_test), np.nan)

# The severity model outputs 0/1; adding 1 maps the prediction back to the
# original class labels (1 or 2).
if not X_test_severity.empty:
    y_pred_severity_survivors = severity_pipeline.predict(X_test_severity)
    y_pred_severity[survivor_mask_test] = y_pred_severity_survivors + 1

# Final label per test row:
#   - predicted dead by the first model -> 3
#   - predicted survivor                -> severity predicted by stage 2 (1 or 2)
final_pred = np.where(survivor_mask_test, y_pred_severity, 3).astype(int)

# Ground-truth labels of the test rows.
y_true_final = y_true.loc[X_test.index]

# Overall performance of the hierarchical model.
evaluate_predictions(y_true_final, pd.Series(final_pred, index=X_test.index))



Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}

=== Hierarchical Metrics ===
H-Precision: 0.7461
H-Recall:    0.7437
H-F1:        0.7449


# Úplne rovnaká štruktúra kódu je definovaná pre druhú, tretiu a štvrtú vlnu pandémie

# Druhá vlna pandémie

In [6]:
# Load the second-wave data set.
X, y_mortality, y_severity, y_true = preprocess_data('../data/2vlna.xlsx')
feature_names = X.columns

# Train/test split (30 % test), stratified on the mortality target.
X_train, X_test, y_train_mort, y_test_mort = train_test_split(
    X,
    y_mortality,
    test_size=0.3,
    random_state=42,
    stratify=y_mortality,
)

# Standardise only the age column ('Vek'); pass every other column through.
preprocessor = ColumnTransformer([
    ('scaler', StandardScaler(), ['Vek']),
    ('passthrough', 'passthrough', X.columns.difference(['Vek'])),
])

# Stage 1: mortality classifier and its out-of-fold training predictions.
mort_pipeline = train_random_forest(X_train, y_train_mort, preprocessor)

oof_predictions = cross_val_predict(
    mort_pipeline, X_train, y_train_mort, method='predict', cv=5
)
survivor_mask_train_pred_oof = oof_predictions == 0

# Stage-2 training rows: predicted survivors that also have a defined
# severity label.
y_train_severity_not_na_mask = y_severity.loc[X_train.index].notna()
relevant_mask_train = y_train_severity_not_na_mask & survivor_mask_train_pred_oof
X_train_severity = X_train.loc[relevant_mask_train]
y_train_severity = y_severity.loc[X_train.index].loc[relevant_mask_train]

# Stage 2: severity classifier.
severity_pipeline = train_random_forest(X_train_severity, y_train_severity, preprocessor)

# Apply stage 1 to the test set; only predicted survivors go on to stage 2.
y_pred_mortality_test = mort_pipeline.predict(X_test)
survivor_mask_test = y_pred_mortality_test == 0

X_test_severity = X_test.loc[survivor_mask_test]
y_pred_severity = np.full(len(X_test), np.nan)

# Severity predictions are 0/1; +1 maps them back to the original labels 1/2.
if not X_test_severity.empty:
    y_pred_severity_survivors = severity_pipeline.predict(X_test_severity)
    y_pred_severity[survivor_mask_test] = y_pred_severity_survivors + 1

# Final label: 3 for predicted deaths, otherwise the stage-2 severity (1 or 2).
final_pred = np.where(survivor_mask_test, y_pred_severity, 3).astype(int)

# Evaluate against the ground-truth labels of the test rows.
y_true_final = y_true.loc[X_test.index]
evaluate_predictions(y_true_final, pd.Series(final_pred, index=X_test.index))


Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 150}

=== Hierarchical Metrics ===
H-Precision: 0.7411
H-Recall:    0.7651
H-F1:        0.7529


# Tretia vlna pandémie

In [7]:
# Load the third-wave data set.
X, y_mortality, y_severity, y_true = preprocess_data('../data/3vlna.xlsx')
feature_names = X.columns

# Train/test split (30 % test), stratified on the mortality target.
X_train, X_test, y_train_mort, y_test_mort = train_test_split(
    X,
    y_mortality,
    test_size=0.3,
    random_state=42,
    stratify=y_mortality,
)

# Standardise only the age column ('Vek'); pass every other column through.
preprocessor = ColumnTransformer([
    ('scaler', StandardScaler(), ['Vek']),
    ('passthrough', 'passthrough', X.columns.difference(['Vek'])),
])

# Stage 1: mortality classifier and its out-of-fold training predictions.
mort_pipeline = train_random_forest(X_train, y_train_mort, preprocessor)

oof_predictions = cross_val_predict(
    mort_pipeline, X_train, y_train_mort, method='predict', cv=5
)
survivor_mask_train_pred_oof = oof_predictions == 0

# Stage-2 training rows: predicted survivors that also have a defined
# severity label.
y_train_severity_not_na_mask = y_severity.loc[X_train.index].notna()
relevant_mask_train = y_train_severity_not_na_mask & survivor_mask_train_pred_oof
X_train_severity = X_train.loc[relevant_mask_train]
y_train_severity = y_severity.loc[X_train.index].loc[relevant_mask_train]

# Stage 2: severity classifier.
severity_pipeline = train_random_forest(X_train_severity, y_train_severity, preprocessor)

# Apply stage 1 to the test set; only predicted survivors go on to stage 2.
y_pred_mortality_test = mort_pipeline.predict(X_test)
survivor_mask_test = y_pred_mortality_test == 0

X_test_severity = X_test.loc[survivor_mask_test]
y_pred_severity = np.full(len(X_test), np.nan)

# Severity predictions are 0/1; +1 maps them back to the original labels 1/2.
if not X_test_severity.empty:
    y_pred_severity_survivors = severity_pipeline.predict(X_test_severity)
    y_pred_severity[survivor_mask_test] = y_pred_severity_survivors + 1

# Final label: 3 for predicted deaths, otherwise the stage-2 severity (1 or 2).
final_pred = np.where(survivor_mask_test, y_pred_severity, 3).astype(int)

# Evaluate against the ground-truth labels of the test rows.
y_true_final = y_true.loc[X_test.index]
evaluate_predictions(y_true_final, pd.Series(final_pred, index=X_test.index))


Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 150}
Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}

=== Hierarchical Metrics ===
H-Precision: 0.7500
H-Recall:    0.7154
H-F1:        0.7323


# Štvrtá vlna pandémie

In [8]:
# Load the fourth-wave data set.
X, y_mortality, y_severity, y_true = preprocess_data('../data/4vlna.xlsx')
feature_names = X.columns

# Train/test split (30 % test), stratified on the mortality target.
X_train, X_test, y_train_mort, y_test_mort = train_test_split(
    X,
    y_mortality,
    test_size=0.3,
    random_state=42,
    stratify=y_mortality,
)

# Standardise only the age column ('Vek'); pass every other column through.
preprocessor = ColumnTransformer([
    ('scaler', StandardScaler(), ['Vek']),
    ('passthrough', 'passthrough', X.columns.difference(['Vek'])),
])

# Stage 1: mortality classifier and its out-of-fold training predictions.
mort_pipeline = train_random_forest(X_train, y_train_mort, preprocessor)

oof_predictions = cross_val_predict(
    mort_pipeline, X_train, y_train_mort, method='predict', cv=5
)
survivor_mask_train_pred_oof = oof_predictions == 0

# Stage-2 training rows: predicted survivors that also have a defined
# severity label.
y_train_severity_not_na_mask = y_severity.loc[X_train.index].notna()
relevant_mask_train = y_train_severity_not_na_mask & survivor_mask_train_pred_oof
X_train_severity = X_train.loc[relevant_mask_train]
y_train_severity = y_severity.loc[X_train.index].loc[relevant_mask_train]

# Stage 2: severity classifier.
severity_pipeline = train_random_forest(X_train_severity, y_train_severity, preprocessor)

# Apply stage 1 to the test set; only predicted survivors go on to stage 2.
y_pred_mortality_test = mort_pipeline.predict(X_test)
survivor_mask_test = y_pred_mortality_test == 0

X_test_severity = X_test.loc[survivor_mask_test]
y_pred_severity = np.full(len(X_test), np.nan)

# Severity predictions are 0/1; +1 maps them back to the original labels 1/2.
if not X_test_severity.empty:
    y_pred_severity_survivors = severity_pipeline.predict(X_test_severity)
    y_pred_severity[survivor_mask_test] = y_pred_severity_survivors + 1

# Final label: 3 for predicted deaths, otherwise the stage-2 severity (1 or 2).
final_pred = np.where(survivor_mask_test, y_pred_severity, 3).astype(int)

# Evaluate against the ground-truth labels of the test rows.
y_true_final = y_true.loc[X_test.index]
evaluate_predictions(y_true_final, pd.Series(final_pred, index=X_test.index))


Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'entropy', 'classifier__max_depth': 20, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 150}
Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'entropy', 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}

=== Hierarchical Metrics ===
H-Precision: 0.7670
H-Recall:    0.7907
H-F1:        0.7787
