In [1]:
!pip install imblearn
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix, recall_score, precision_score, f1_score,
    roc_auc_score, roc_curve, auc,accuracy_score
)

df_encoded = pd.read_csv("df_encoded.csv")
print(len(df_encoded))

X = df_encoded.drop(columns=['hypotenzia'])
y = df_encoded['hypotenzia']


num_cols = X.select_dtypes(include='number').columns.tolist()
scaler = MinMaxScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn->imblearn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 sklearn-compat-0.1.3
3517


In [2]:

# grid svc 


# Pipeline so SMOTE a kalibrovaným SVM
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('clf', CalibratedClassifierCV(
        estimator=SVC(probability=True, random_state=42),
        cv=3, method='sigmoid'
    ))
])

# GridSearch cez parametre vo vnútri SVC
param_grid = {
    'clf__estimator__C': [0.1, 1, 3,5,7, 10],
    'clf__estimator__gamma': [0.001, 0.01, 0.1],
    'clf__estimator__kernel': ['rbf'],
    'clf__estimator__class_weight': ['balanced']
}

# Spustenie GridSearch
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    verbose=1,
    n_jobs=-1
)


grid.fit(X_train, y_train)


Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [3]:

# grid MLP
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score
)

# 1. Pipeline: SMOTE + MLP
mlp_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('mlp', MLPClassifier(max_iter=1000, random_state=42))
])


param_grid_mlp = {
    'mlp__hidden_layer_sizes': [(50, 50), (100,), (30, 30, 30)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__alpha': [0.0001, 0.01],
    'mlp__learning_rate': ['constant', 'adaptive']
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_mlp = GridSearchCV(
    estimator=mlp_pipeline,
    param_grid=param_grid_mlp,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1
)


grid_mlp.fit(X_train, y_train)


best_mlp = grid_mlp.best_estimator_


y_proba = best_mlp.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)


tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)

print("\n parametre (MLP):")
print(grid_mlp.best_params_)




Fitting 5 folds for each of 24 candidates, totalling 120 fits





 parametre (MLP):
{'mlp__activation': 'relu', 'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate': 'constant'}



✅ Najlepšie parametre (MLP):
{'mlp__activation': 'relu', 'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate': 'constant'}

📊 Výsledky na testovacej množine:
TP: 478, FP: 76, FN: 103, TN: 47
🔁 Recall:    0.823
🎯 Precision: 0.863
📊 F1-score:  0.842
🚀 AUC-ROC:   0.693


In [4]:

# grid RF 
from sklearn.ensemble import RandomForestClassifier


rf_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
])


param_grid_rf = {
    'rf__n_estimators': [100, 200,300,400],
    'rf__max_depth': [5, 10, None,15,20],
    'rf__min_samples_split': [2, 5,7],
    'rf__class_weight': [None, 'balanced']
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1
)


grid_rf.fit(X_train, y_train)

best_rf = grid_rf.best_estimator_

# Testovanie 
y_proba = best_rf.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)

#  Výsledky
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)


print(grid_rf.best_params_)



Fitting 5 folds for each of 120 candidates, totalling 600 fits
{'rf__class_weight': None, 'rf__max_depth': None, 'rf__min_samples_split': 5, 'rf__n_estimators': 200}


In [7]:
!pip install xgboost
# grid XGB2
from xgboost import XGBClassifier



xgb_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(eval_metric='logloss', random_state=42))
])

param_grid = {
    'xgb__n_estimators': [100, 200,300],
    'xgb__max_depth': [3, 5, 7,10],
    'xgb__learning_rate': [0.01, 0.1,0.2],
    'xgb__subsample': [0.6,0.7,0.8, 1.0],
    'xgb__colsample_bytree': [0.6,0.7, 1.0]
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1
)


grid.fit(X_train, y_train)


best_model = grid.best_estimator_

y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)  


tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)

print("\n Najlepšie parametre:")
print(grid.best_params_)



Fitting 5 folds for each of 432 candidates, totalling 2160 fits

 Najlepšie parametre:
{'xgb__colsample_bytree': 0.6, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 10, 'xgb__n_estimators': 300, 'xgb__subsample': 0.7}


In [8]:

#Grid Decision tree
from sklearn.tree import DecisionTreeClassifier


param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced']
}


dt = DecisionTreeClassifier(random_state=42)
grid = GridSearchCV(dt, param_grid, scoring='roc_auc', cv=3, n_jobs=-1)
grid.fit(X_train, y_train)


best_dt = grid.best_estimator_


def evaluate_model(name, model):
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= 0.5).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    return {
        "Model": name,
        "TP": tp, "FP": fp, "FN": fn, "TN": tn,
        "Recall": round(recall_score(y_test, y_pred), 3),
        "Precision": round(precision_score(y_test, y_pred), 3),
        "F1": round(f1_score(y_test, y_pred), 3),
        "ROC AUC": round(roc_auc_score(y_test, y_proba), 3)
    }


baseline_dt = DecisionTreeClassifier(random_state=42)
baseline_dt.fit(X_train, y_train)


results = [
    evaluate_model("DecisionTree (default)", baseline_dt),
    evaluate_model("DecisionTree (tuned)", best_dt)
]

results_df = pd.DataFrame(results)
print(results_df)


print("\n Najlepšie parametre:")
print(grid.best_params_)


                    Model   TP  FP   FN  TN  Recall  Precision     F1  ROC AUC
0  DecisionTree (default)  502  90   79  33   0.864      0.848  0.856    0.566
1    DecisionTree (tuned)  402  30  179  93   0.692      0.931  0.794    0.756

 Najlepšie parametre:
{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [9]:
# knn grid
from sklearn.neighbors import KNeighborsClassifier


param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski'],
    'p': [1, 2]  
}


knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, scoring='roc_auc', cv=3, n_jobs=-1)
grid.fit(X_train, y_train)


best_knn = grid.best_estimator_


def evaluate_model(name, model):
    y_proba = model.predict_proba(X_test)[:, 1]
    
    y_pred = (y_proba >= 0.5).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    return {
        "Model": name,
        "TP": tp, "FP": fp, "FN": fn, "TN": tn,
        "Recall": round(recall_score(y_test, y_pred), 3),
        "Precision": round(precision_score(y_test, y_pred), 3),
        "F1": round(f1_score(y_test, y_pred), 3),
        "ROC AUC": round(roc_auc_score(y_test, y_proba), 3)
    }


baseline_knn = KNeighborsClassifier()
baseline_knn.fit(X_train, y_train)


results = [
    evaluate_model("KNN (default)", baseline_knn),
    evaluate_model("KNN (tuned)", best_knn)
]

results_df = pd.DataFrame(results)
print(results_df)


print("\n Najlepšie parametre pre KNN:")
print(grid.best_params_)


           Model   TP   FP  FN  TN  Recall  Precision     F1  ROC AUC
0  KNN (default)  553  110  28  13   0.952      0.834  0.889     0.61
1    KNN (tuned)  568  118  13   5   0.978      0.828  0.897     0.61

 Najlepšie parametre pre KNN:
{'metric': 'minkowski', 'n_neighbors': 11, 'p': 1, 'weights': 'uniform'}


In [10]:

# grid lr
from sklearn.linear_model import LogisticRegression

logreg_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('logreg', LogisticRegression(max_iter=1000, solver='liblinear'))
])


param_grid_logreg = {
    'logreg__C': [0.01, 0.1, 1, 10, 100],
    'logreg__penalty': ['l1', 'l2'],
    'logreg__solver': ['liblinear'], 
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_logreg = GridSearchCV(
    estimator=logreg_pipeline,
    param_grid=param_grid_logreg,
    scoring='recall',  
    cv=cv,
    n_jobs=-1,
    verbose=1
)


grid_logreg.fit(X_train, y_train)


best_logreg = grid_logreg.best_estimator_
y_proba = best_logreg.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)


tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("\n Najlepšie parametre:")
print(grid_logreg.best_params_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits

 Najlepšie parametre:
{'logreg__C': 100, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear'}


In [11]:
# knn grid
from sklearn.neighbors import KNeighborsClassifier


param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski'],
    'p': [1, 2]  # p=1 → manhattan, p=2 → euclidean
}


knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, scoring='roc_auc', cv=3, n_jobs=-1)
grid.fit(X_train, y_train)


best_knn = grid.best_estimator_


def evaluate_model(name, model):
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= 0.5).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    return {
        "Model": name,
        "TP": tp, "FP": fp, "FN": fn, "TN": tn,
        "Recall": round(recall_score(y_test, y_pred), 3),
        "Precision": round(precision_score(y_test, y_pred), 3),
        "F1": round(f1_score(y_test, y_pred), 3),
        "ROC AUC": round(roc_auc_score(y_test, y_proba), 3)
    }


baseline_knn = KNeighborsClassifier()
baseline_knn.fit(X_train, y_train)


results = [
    evaluate_model("KNN (default)", baseline_knn),
    evaluate_model("KNN (tuned)", best_knn)
]

results_df = pd.DataFrame(results)
print(results_df)

print("\n Najlepšie parametre pre KNN:")
print(grid.best_params_)

           Model   TP   FP  FN  TN  Recall  Precision     F1  ROC AUC
0  KNN (default)  553  110  28  13   0.952      0.834  0.889     0.61
1    KNN (tuned)  568  118  13   5   0.978      0.828  0.897     0.61

 Najlepšie parametre pre KNN:
{'metric': 'minkowski', 'n_neighbors': 11, 'p': 1, 'weights': 'uniform'}


In [13]:
#  hladanie trashholdu 
import numpy as np
import pandas as pd

base_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    min_samples_split=7,
    class_weight='balanced',
    random_state=42
)
calibrated_rf = CalibratedClassifierCV(estimator=base_rf, cv=3)

base_svm = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)
calibrated_svm = CalibratedClassifierCV(estimator=base_svm, cv=3)


#  Definovanie modelov
models = {
    'Logistická regresia': LogisticRegression(max_iter=1000, C=1, penalty='l2', solver='liblinear') ,
    'Rozhodovací strom': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3, min_samples_leaf=1, min_samples_split=2),
    'Náhodný les': RandomForestClassifier(n_estimators=165,max_depth=7,min_samples_split=5,random_state=42),
    'XGBoost': XGBClassifier(
        colsample_bytree=0.6, learning_rate=0.01, max_depth=9,
        n_estimators=150, subsample=0.7
    ),
    'MLP': MLPClassifier(activation='tanh', alpha=0.01, hidden_layer_sizes=(50, 50), learning_rate='constant'),
    'K-Nearest Neighbors': KNeighborsClassifier(metric='manhattan', n_neighbors=3,weights='distance'),
    'SVM': SVC(C=10, probability=True, random_state=42, gamma=0.1)
}

# Prahy
thresholds = np.arange(0.3, 0.91, 0.05)
results = []


for name, model in models.items():
    model.fit(X_train, y_train)
    
    # Získaj pravdepodobnosti 
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = model.decision_function(X_test)

    for thresh in thresholds:
        y_pred = (y_proba >= thresh).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        results.append({
            'Model': name,
            'Threshold': round(thresh, 2),
            'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn,
            'Recall': round(recall, 3),
            'Precision': round(precision, 3),
            'F1-score': round(f1, 3)
        })
results_df = pd.DataFrame(results)


best_thresholds = results_df.sort_values(by=['Model', 'Recall'], ascending=[True, False]).drop_duplicates('Model')

print(" Najlepšie prahy :")
print(best_thresholds[['Model', 'Threshold', 'F1-score', 'Recall', 'Precision', 'FP', 'TP']])


 Najlepšie prahy :
                  Model  Threshold  F1-score  Recall  Precision   FP   TP
65  K-Nearest Neighbors        0.3     0.898   0.978      0.830  116  568
0   Logistická regresia        0.3     0.905   1.000      0.826  122  581
52                  MLP        0.3     0.905   1.000      0.826  122  581
26          Náhodný les        0.3     0.904   1.000      0.825  123  581
13    Rozhodovací strom        0.3     0.904   1.000      0.825  123  581
78                  SVM        0.3     0.905   0.998      0.827  121  580
39              XGBoost        0.3     0.904   1.000      0.825  123  581
