In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, roc_curve
)
from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize

# Train and evaluate an RBF-kernel SVM separately on each imputed "wave" file.
# For every file the script:
#   1. reads the Excel sheet and separates the target column,
#   2. ordinal-encodes object-typed feature columns,
#   3. makes an 80/20 train/test split and standardises the features,
#   4. fits the SVM and collects accuracy / macro F1 / precision / recall / AUC,
#   5. saves a confusion-matrix heatmap and per-class ROC curves as PNG files.
# The collected metrics are finally written to 'results_SVM.csv'.
files = ['wave1_imputed.xlsx', 'wave2_imputed.xlsx', 'wave3_imputed.xlsx', 'wave4_imputed.xlsx']
results = []

for file in files:
    df = pd.read_excel(file)
    # Base name reused for both output image file names.
    file_stem = file.replace(".xlsx", "")

    X = df.drop(columns=['Závažnosť priebehu ochorenia'])
    y = df['Závažnosť priebehu ochorenia']

    # Replace the target values with integer category codes.
    y = y.astype('category').cat.codes
    class_labels = np.unique(y)
    y_bin = label_binarize(y, classes=class_labels)
    if len(class_labels) == 2:
        # label_binarize yields a single column for a two-class target; expand
        # it to one indicator column per class so that the column-wise
        # indexing below (and the shape of predict_proba) lines up.
        y_bin = np.hstack([1 - y_bin, y_bin])

    non_numeric_columns = X.select_dtypes(include=['object']).columns
    print(f"Non-numeric columns in {file}:", non_numeric_columns)

    # NOTE(review): every object column is ordinal-encoded as-is. Judging by
    # the printed column names some of these look like names, dates or free
    # text (e.g. 'Meno', 'Dátum príjmu', 'Epikríza') -- confirm that they are
    # meant to be model features.
    le = LabelEncoder()
    for column in non_numeric_columns:
        # fit_transform refits the encoder, so one instance can be reused.
        X[column] = le.fit_transform(X[column].astype(str))

    # Split BEFORE scaling: the row partition depends only on the number of
    # samples and random_state, so it is the same partition as before.
    X_train, X_test, y_train, y_test, y_bin_train, y_bin_test = train_test_split(
        X, y, y_bin, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only, so that test-set statistics
    # do not leak into the features the model is trained on.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = SVC(kernel='rbf', C=1, gamma='scale', probability=True, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # predict_proba has one column per class seen during training
    # (model.classes_). The split is not stratified, so a rare class may be
    # missing from the training rows; place the columns at the positions of
    # the matching entries of class_labels and leave unseen classes at 0.
    raw_proba = model.predict_proba(X_test)
    y_pred_proba = np.zeros((raw_proba.shape[0], len(class_labels)))
    y_pred_proba[:, np.searchsorted(class_labels, model.classes_)] = raw_proba

    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    acc = report['accuracy']
    f1_macro = report['macro avg']['f1-score']
    precision_macro = report['macro avg']['precision']
    recall_macro = report['macro avg']['recall']

    try:
        auc_macro = roc_auc_score(y_bin_test, y_pred_proba, average='macro', multi_class='ovr')
    except ValueError:
        # Raised e.g. when a class has no positive samples in the test split.
        auc_macro = np.nan

    results.append({
        'dataset': file,
        'accuracy': round(acc, 4),
        'f1_macro': round(f1_macro, 4),
        'precision_macro': round(precision_macro, 4),
        'recall_macro': round(recall_macro, 4),
        'auc_macro': round(auc_macro, 4) if not np.isnan(auc_macro) else 'N/A'
    })

    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {file}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(f'conf_matrix_{file_stem}_SVM.png')
    plt.close()

    plt.figure(figsize=(10, 8))
    for i, class_label in enumerate(class_labels):
        # A one-vs-rest ROC curve needs both positives and negatives in the
        # test split; skip the class otherwise instead of aborting the run.
        if np.unique(y_bin_test[:, i]).size < 2:
            continue
        fpr, tpr, _ = roc_curve(y_bin_test[:, i], y_pred_proba[:, i])
        auc = roc_auc_score(y_bin_test[:, i], y_pred_proba[:, i])
        plt.plot(fpr, tpr, label=f'Trieda {class_label} (AUC = {auc:.2f})')

    # Diagonal reference line (the ROC curve of a random classifier).
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.title(f'ROC krivka pre {file}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig(f'roc_curve_{file_stem}_SVM.png')
    plt.close()

results_df = pd.DataFrame(results)
results_df.to_csv('results_SVM.csv', index=False)

print("Výsledky uložené do 'results_SVM.csv', obrázky ROC kriviek a Confusion Matrix uložené do PNG súborov.")


Non-numeric columns in wave1_imputed.xlsx: Index(['Meno', 'Pohlavie', 'Dátum príjmu', 'Dátum prepustenia', 'HLN Dg.',
       'Diagnózy', 'DRG výkony', 'Liečba', 'SVLZ správy', 'Mikrobiológia ',
       'Epikríza', 'Terajšie ochorenie', 'Dôvod hospitalizácie',
       'Objektívny nález', 'Osobná anamnéza', 'Lieková anamnéza',
       'Návyková anamnéza', 'Epidemiologická anamnéza'],
      dtype='object')
Non-numeric columns in wave2_imputed.xlsx: Index(['Meno', 'Pohlavie', 'Dátum príjmu', 'Dátum prepustenia', 'HLN Dg.',
       'Diagnózy', 'DRG výkony', 'Liečba', 'SVLZ správy', 'Mikrobiológia ',
       'Epikríza', 'Terajšie ochorenie', 'Dôvod hospitalizácie',
       'Objektívny nález', 'Osobná anamnéza', 'Lieková anamnéza',
       'Návyková anamnéza', 'Epidemiologická anamnéza'],
      dtype='object')
Non-numeric columns in wave3_imputed.xlsx: Index(['Meno', 'Pohlavie', 'Dátum príjmu', 'Dátum prepustenia', 'HLN Dg.',
       'Diagnózy', 'DRG výkony', 'Liečba', 'SVLZ správy', 'Mikrobiológia '