In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, label_binarize

files = ['wave1_imputed.xlsx', 'wave2_imputed.xlsx', 'wave3_imputed.xlsx', 'wave4_imputed.xlsx']
results = []

for file in files:
    df = pd.read_excel(file)

    X = df.drop(columns=['Závažnosť priebehu ochorenia'])
    y = df['Závažnosť priebehu ochorenia']

    y = y.astype('category').cat.codes
    class_labels = np.unique(y)
    y_bin = label_binarize(y, classes=class_labels)

    non_numeric_columns = X.select_dtypes(include=['object']).columns
    print(f"Non-numeric columns in {file}:", non_numeric_columns)
    
    for column in non_numeric_columns:
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column].astype(str))

    X_train, X_test, y_train, y_test, y_bin_train, y_bin_test = train_test_split(
        X, y, y_bin, test_size=0.2, random_state=42
    )

    model = LogisticRegression(solver='lbfgs', max_iter=2000) 
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    acc = report['accuracy']
    f1_macro = report['macro avg']['f1-score']
    precision_macro = report['macro avg']['precision']
    recall_macro = report['macro avg']['recall']

    try:
        auc_macro = roc_auc_score(y_bin_test, y_pred_proba, average='macro', multi_class='ovr')
    except ValueError:
        auc_macro = np.nan

    results.append({
        'dataset': file,
        'accuracy': round(acc, 4),
        'f1_macro': round(f1_macro, 4),
        'precision_macro': round(precision_macro, 4),
        'recall_macro': round(recall_macro, 4),
        'auc_macro': round(auc_macro, 4) if not np.isnan(auc_macro) else 'N/A'
    })

    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {file}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(f'conf_matrix_{file.replace(".xlsx", "")}_LR.png')
    plt.close()

    plt.figure(figsize=(8, 6))
    for i in range(len(class_labels)):
        fpr, tpr, _ = roc_curve(y_bin_test[:, i], y_pred_proba[:, i])
        roc_auc = auc(fpr, tpr) 
        plt.plot(fpr, tpr, lw=2, label=f'Class {class_labels[i]} (AUC = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {file}')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig(f'roc_curve_{file.replace(".xlsx", "")}_LR.png')
    plt.close()

results_df = pd.DataFrame(results)
results_df.to_csv('results_LR.csv', index=False)

print("Výsledky uložené do 'results_LR.csv' a obrázky do PNG súborov.")

Non-numeric columns in wave1_imputed.xlsx: Index(['Meno', 'Pohlavie', 'Dátum príjmu', 'Dátum prepustenia', 'HLN Dg.',
       'Diagnózy', 'DRG výkony', 'Liečba', 'SVLZ správy', 'Mikrobiológia ',
       'Epikríza', 'Terajšie ochorenie', 'Dôvod hospitalizácie',
       'Objektívny nález', 'Osobná anamnéza', 'Lieková anamnéza',
       'Návyková anamnéza', 'Epidemiologická anamnéza'],
      dtype='object')


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Non-numeric columns in wave2_imputed.xlsx: Index(['Meno', 'Pohlavie', 'Dátum príjmu', 'Dátum prepustenia', 'HLN Dg.',
       'Diagnózy', 'DRG výkony', 'Liečba', 'SVLZ správy', 'Mikrobiológia ',
       'Epikríza', 'Terajšie ochorenie', 'Dôvod hospitalizácie',
       'Objektívny nález', 'Osobná anamnéza', 'Lieková anamnéza',
       'Návyková anamnéza', 'Epidemiologická anamnéza'],
      dtype='object')


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Non-numeric columns in wave3_imputed.xlsx: Index(['Meno', 'Pohlavie', 'Dátum príjmu', 'Dátum prepustenia', 'HLN Dg.',
       'Diagnózy', 'DRG výkony', 'Liečba', 'SVLZ správy', 'Mikrobiológia ',
       'Epikríza', 'Terajšie ochorenie', 'Dôvod hospitalizácie',
       'Objektívny nález', 'Osobná anamnéza', 'Lieková anamnéza',
       'Návyková anamnéza', 'Epidemiologická anamnéza'],
      dtype='object')


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Non-numeric columns in wave4_imputed.xlsx: Index(['Meno', 'Pohlavie', 'Dátum príjmu', 'Dátum prepustenia', 'HLN Dg.',
       'Diagnózy', 'DRG výkony', 'Liečba', 'SVLZ správy', 'Mikrobiológia ',
       'Epikríza', 'Terajšie ochorenie', 'Dôvod hospitalizácie',
       'Objektívny nález', 'Osobná anamnéza', 'Lieková anamnéza',
       'Návyková anamnéza', 'Epidemiologická anamnéza'],
      dtype='object')


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


✅ Hotovo! Výsledky uložené do 'results_LR.csv' a obrázky do PNG súborov.
