In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import (train_test_split, StratifiedKFold, cross_validate)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, classification_report
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Silence overflow noise: RuntimeWarnings raised through the warnings module,
# plus numpy's own floating-point overflow reporting.
warnings.simplefilter("ignore", category=RuntimeWarning)
np.seterr(over='ignore')

# Plot defaults shared by every figure in the notebook.
sns.set(context='notebook', style='whitegrid')

# Run the neural network on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Location of the ECG feature table (TODO: replace with the path to your own copy).
csv_path = 'ECGCvdata 2.csv'
df = pd.read_csv(csv_path)
df.shape

In [None]:
# Preview the first five rows of the raw table.
df.head(5)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# There are a lot of NULLs: some entries do not have complete data. We will check how
# reliable those columns are and decide whether we can drop them.

In [None]:
# 1) Drop every column in which more than half of the rows are missing.
null_counts = df.isnull().sum()
missing_pct = (null_counts / len(df)) * 100
too_sparse = missing_pct > 50
high_missing_cols = missing_pct.loc[too_sparse].index.tolist()
df_clean = df.drop(columns=high_missing_cols)

In [None]:
# Count outliers per numeric column using the 1.5 * IQR rule: a value is an
# outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
num_features = df.select_dtypes(include='number').columns
outliers_list = []

for col in num_features:
    values = df[col]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - (1.5 * IQR)
    upper_bound = Q3 + (1.5 * IQR)

    # Count straight from the boolean mask instead of copying the whole frame;
    # NaNs compare False on both sides, so they are never counted as outliers.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outliers_list.append([col, int(is_outlier.sum())])

for col_name, outlier_count in outliers_list:
    print(f"{col_name}: {outlier_count} outliers")

In [None]:
# Given the outlier counts above, fill the remaining NaNs with the median rather
# than the mean.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

In [None]:
# List the distinct class labels found in ECG_signal, in sorted order.
ecg_classes = df['ECG_signal'].unique().tolist()
ecg_classes.sort()
print(ecg_classes)
# AFF = Atrial Fibrillation: rapid, irregular atrial activity
# ARR = Arrhythmia: abnormal rhythms that are not specifically labeled
# CHF = Congestive Heart Failure: not a rhythm, but heart failure
# NSR = Normal Sinus Rhythm: healthy, regular heartbeat

In [None]:
# One-hot encode the class label into ECG_signal_<class> indicator columns.
# Start from df_clean so the >50%-missing columns dropped earlier stay dropped;
# encoding the raw df silently discarded that cleaning step.
df_encoded = pd.get_dummies(df_clean, columns=['ECG_signal'], dtype=int)
df_encoded.sum()

In [None]:
# Preview the encoded table (the indicator columns are appended at the end).
df_encoded.head(5)

In [None]:
# Hand-picked candidate features; only those actually present in df_encoded are kept.
candidate_features = (
    'RRmean', 'PPmean', 'RTdis', 'PonTdis', 'PonToffdis', 'QTdis',
    'IBIM', 'NN50', 'hbpermin', 'RSslope', 'RRTot', 'NNTot', 'STslope',
)

feature_list = [name for name in candidate_features if name in df_encoded.columns]

In [None]:
# Linear-regression R^2 of EACH one-hot class indicator on feature_list.
# The score is computed on the same rows the model was fitted on (in-sample).
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

X = df_encoded[feature_list]
label_cols = [c for c in df_encoded.columns if c.startswith('ECG_signal_')]

# Median-impute missing feature values, then fit ordinary least squares.
pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    LinearRegression()
)

r2_table = []

for lab in label_cols:
    y = df_encoded[lab].astype(float)
    mask = y.notna()               # y should already be clean; kept for consistency
    X_use = X.loc[mask]
    y_use = y.loc[mask]
    if y_use.nunique() <= 1:       # constant target => R^2 undefined
        r2 = np.nan
    else:
        pipe.fit(X_use, y_use)
        r2 = pipe.score(X_use, y_use)
    r2_table.append((lab, r2))

# Show R^2 sorted (highest first). NaN does not order against numbers, so map it
# to -inf to guarantee undefined scores land at the end of the table.
r2_table = sorted(
    r2_table,
    key=lambda item: -np.inf if np.isnan(item[1]) else item[1],
    reverse=True,
)
r2_table

In [None]:
# As we can tell, the regular one is MUCH higher than the rest.
# Next step: add a proper classifier + CV AUC.

In [None]:
# A notebook cell only displays its LAST expression, so the first two checks are
# printed explicitly; previously their results were computed and thrown away.

# Candidate features that are absent from df_encoded (i.e. were dropped upstream).
requested_features = ['RRmean', 'PPmean', 'RTdis', 'PonTdis', 'PonToffdis', 'QTdis',
                      'IBIM', 'NN50', 'hbpermin', 'RSslope', 'RRTot', 'NNTot', 'STslope']
missing_features = set(requested_features) - set(df_encoded.columns)
print("Features missing from df_encoded:", sorted(missing_features))

# NaNs still present in the raw feature columns. Nothing is imputed here: each
# model pipeline performs its own imputation.
print("\nNaN count per feature (top 10):")
print(df_encoded[feature_list].isna().sum().sort_values(ascending=False).head(10))

# Check label balance
df_encoded[label_cols].sum()


In [None]:
# Pairwise correlations between the candidate features, shown as a heatmap.
corr_matrix = df_encoded[feature_list].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Walk the strict upper triangle (each unordered pair once, row by row) and keep
# the pairs whose absolute correlation exceeds 0.9.
names = corr_matrix.columns
upper_rows, upper_cols = np.triu_indices(len(names), k=1)
high_corr = [
    (names[i], names[j], corr_matrix.iloc[i, j])
    for i, j in zip(upper_rows, upper_cols)
    if abs(corr_matrix.iloc[i, j]) > 0.9
]

print("Highly correlated features (>0.9):")
for feat1, feat2, corr in high_corr:
    print(f"{feat1} <-> {feat2}: {corr:.3f}")

In [None]:
# Drop PPmean: it carries the same information as RRmean.
dropped_features = ['PPmean']
features_final = [f for f in feature_list if f not in dropped_features]

In [None]:
# Re-check the correlations on the reduced feature set.
corr_matrix = df_encoded[features_final].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Collect each unordered feature pair whose |correlation| is above 0.9.
high_corr = []
column_names = list(corr_matrix.columns)
for i, left in enumerate(column_names):
    for j, right in enumerate(column_names[i + 1:], start=i + 1):
        value = corr_matrix.iloc[i, j]
        if abs(value) > 0.9:
            high_corr.append((left, right, value))

print("Highly correlated features (>0.9):")
for feat1, feat2, corr in high_corr:
    print(f"{feat1} <-> {feat2}: {corr:.3f}")

In [None]:
# Train/Val/Test split

X = df_encoded[features_final]
y = df_encoded[label_cols].values  # one-hot matrix; the split below uses y_labels instead

# Collapse the one-hot columns to one class label per row. The indicator columns
# are named 'ECG_signal_<class>' (lower-case 's'); the earlier 'ECG_Signal_'
# pattern never matched, so every label kept its prefix.
y_labels = (
    df_encoded[label_cols]
    .idxmax(axis=1)
    .str.replace('ECG_signal_', '', regex=False)
)

# Splitting: 60% train, 20% val, 20% test (stratified so class ratios are preserved)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_labels, test_size=0.4, stratify=y_labels, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)
print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
print(f"Train distribution:\n{y_train.value_counts()}")

In [None]:
# Note: RRmean and PPmean having a correlation of 1 is not a red flag, but the two
# should not both be used to train the model.

In [None]:
## Evaluation helper: reports accuracy and precision/recall/F1 scores -> useful during model training


In [None]:
def evaluate_model(name, pipeline, X_tr, y_tr, X_va, y_va):
    """Fit ``pipeline`` on the training split and print validation metrics.

    Prints train/validation accuracy, macro- and weighted-average
    precision/recall/F1 on the validation split, the validation confusion
    matrix and the per-class classification report.

    Parameters
    ----------
    name : str
        Title used in the printed report header.
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_tr, y_tr
        Training features and labels.
    X_va, y_va
        Validation features and labels (Series, array or list of labels).

    Returns
    -------
    The same ``pipeline`` object, now fitted on ``(X_tr, y_tr)``.
    """
    pipeline.fit(X_tr, y_tr)

    # Predictions
    y_tr_pred = pipeline.predict(X_tr)
    y_va_pred = pipeline.predict(X_va)

    # Accuracy
    acc_tr = accuracy_score(y_tr, y_tr_pred)
    acc_va = accuracy_score(y_va, y_va_pred)

    # Macro P/R/F1 (every class weighted equally)
    p_va, r_va, f1_va, _ = precision_recall_fscore_support(
        y_va, y_va_pred, average='macro', zero_division=0
    )

    # Weighted P/R/F1 (accounts for class support)
    p_weighted, r_weighted, f1_weighted, _ = precision_recall_fscore_support(
        y_va, y_va_pred, average='weighted', zero_division=0
    )

    print(f"\n=== {name} ===")
    print(f"Train accuracy: {acc_tr:.3f}")
    print(f"Val accuracy:   {acc_va:.3f}")
    print(f"Val macro P/R/F1: {p_va:.3f} / {r_va:.3f} / {f1_va:.3f}")
    print(f"Val weighted P/R/F1: {p_weighted:.3f} / {r_weighted:.3f} / {f1_weighted:.3f}")

    # Confusion matrix, with rows/columns in sorted label order. np.unique
    # (rather than Series.unique) also accepts plain arrays and lists.
    labels_sorted = np.unique(np.asarray(y_va)).tolist()
    print("Labels order:", labels_sorted)
    print("Val confusion matrix:\n",
          confusion_matrix(y_va, y_va_pred, labels=labels_sorted))

    print("\nPer-class classification report:\n",
          classification_report(y_va, y_va_pred, digits=3, zero_division=0))

    return pipeline

In [None]:
# Model 1
# Logistic Regression: baseline linear classifier
# Classic ML baseline: median imputation + standardization + logistic regression

log_reg_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    # The default lbfgs solver already fits a multinomial model when there are
    # more than two classes. The explicit `multi_class` argument is deprecated
    # since scikit-learn 1.5, so it is no longer passed.
    ('clf', LogisticRegression(max_iter=1000))
])

log_reg_pipeline = evaluate_model(
    "Logistic Regression",
    log_reg_pipeline,
    X_train, y_train,
    X_val, y_val
)

In [None]:
## Interpreting the confusion matrix
# 60 samples per class in validation (240 total)
# AFF: 48/60 correct (80%)
# ARR: 60/60 correct (100%)
# CHF: 57/60 correct (95%)
# NSR: 60/60 correct (100%)

# Not bad, but when it comes to life-or-death situations we don't want any level of risk.
# Let's see if we can do better.


## Note
# The linear model is unable to reliably distinguish between AFF and CHF, misclassifying 20% of AFF cases?


In [None]:
# Random Forest

# No scaler needed for trees, so the pipeline only imputes before the classifier.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('clf', rf_classifier),
])

rf_pipeline = evaluate_model(
    name="Random Forest",
    pipeline=rf_pipeline,
    X_tr=X_train, y_tr=y_train,
    X_va=X_val, y_va=y_val,
)

In [None]:
# k-NN
def build_knn_pipeline(k):
    """Return an unfitted impute -> standardize -> k-NN pipeline with ``k`` neighbours."""
    steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('clf', KNeighborsClassifier(n_neighbors=k)),
    ]
    return Pipeline(steps)

# Sweep a few neighbourhood sizes and record validation metrics for each one.
ks = [3, 5, 7, 9]
knn_results = {}

for k in ks:
    knn = build_knn_pipeline(k)
    knn.fit(X_train, y_train)
    val_pred = knn.predict(X_val)

    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
        y_val, val_pred, average='macro'
    )
    # Stored as (accuracy, macro precision, macro recall, macro F1).
    knn_results[k] = (accuracy_score(y_val, val_pred), macro_p, macro_r, macro_f1)

print("k-NN validation results:")
for k, scores in knn_results.items():
    print(f"k={k}: acc={scores[0]:.3f}, macro F1={scores[3]:.3f}")

In [None]:
# Select the k with the highest validation accuracy (the original notes report k=3);
# on a tie the first k that was tried wins.
best_k, _ = max(knn_results.items(), key=lambda item: item[1][0])
print("Best k on validation:", best_k)

knn_best_pipeline = evaluate_model(
    f"k-NN (k={best_k})",
    build_knn_pipeline(best_k),
    X_train, y_train,
    X_val, y_val
)

In [None]:
## With this model, even though train and val accuracy are very high, there are still 5 false-negative predictions:
# 5 cases of (true AFF, predicted CHF)

In [None]:
# Decision Tree Classifier

# Depth is capped at 6 (still open to tuning); None would grow the tree to full
# depth, which will overfit.
dt_classifier = DecisionTreeClassifier(max_depth=6, random_state=42)
dt_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('clf', dt_classifier),
])

dt_pipeline = evaluate_model(
    name="Decision Tree (max_depth=6)",
    pipeline=dt_pipeline,
    X_tr=X_train, y_tr=y_train,
    X_va=X_val, y_va=y_val,
)

In [None]:
## Showcasing overfitting for Decision Tree
# max_depth=None lets the tree grow fully.
full_depth_tree = DecisionTreeClassifier(max_depth=None, random_state=42)
dt_overfit_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('clf', full_depth_tree),
])

dt_overfit_pipeline = evaluate_model(
    name="Decision Tree (max_depth=None)",
    pipeline=dt_overfit_pipeline,
    X_tr=X_train, y_tr=y_train,
    X_va=X_val, y_va=y_val,
)

In [None]:
# Gradient Boosting

# Library-default hyper-parameters; only the seed is pinned.
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('clf', gb_classifier),
])

gb_pipeline = evaluate_model(
    name="Gradient Boosting",
    pipeline=gb_pipeline,
    X_tr=X_train, y_tr=y_train,
    X_va=X_val, y_va=y_val,
)

In [None]:
# 5-fold stratified CV over the full dataset

X_all = df_encoded[features_final]
y_all = y_labels
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Each model is a pipeline (imputer + optional scaler + classifier).
models = {
    "LogisticRegression": log_reg_pipeline,
    "RandomForestClassifier": rf_pipeline,
    "KNeighbourClassifier": knn_best_pipeline,  # k=3
    "DecisionTreeClassifier": dt_pipeline,
    "GradientBoostingClassifier": gb_pipeline,
}

# Run CV and collect the fold-averaged macro P/R/F1 (in percent) for each model.
macro_scorers = ['precision_macro', 'recall_macro', 'f1_macro']
results = []

for name, pipe in models.items():
    cv_scores = cross_validate(pipe, X_all, y_all, cv=skf, scoring=macro_scorers)
    row = {"Model": name}
    row["Precision"] = cv_scores['test_precision_macro'].mean() * 100
    row["Recall"] = cv_scores['test_recall_macro'].mean() * 100
    row["F1 Score"] = cv_scores['test_f1_macro'].mean() * 100
    results.append(row)

results_df = pd.DataFrame(results)
results_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Models ordered from best to worst CV recall; used as the x-axis order.
order = results_df.sort_values('Recall', ascending=False)['Model']

# Long format for seaborn: one row per (model, metric) pair.
results_long = pd.melt(
    results_df,
    id_vars='Model',
    value_vars=['Precision', 'Recall', 'F1 Score'],
    var_name='Metric',
    value_name='Score',
)

fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=results_long, x='Model', y='Score', hue='Metric', order=order, ax=ax)

ax.set_ylabel('Percentage %')
ax.set_xlabel('Model Name')
ax.set_title('Model Performance Metrics (5-fold CV, Macro Averages)\nSorted by Recall')
ax.set_ylim(0, 100)
plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
ax.legend(title='Metric')
fig.tight_layout()
plt.show()


In [None]:
## Regularized Neural Network (fixed)


import torch.optim as optim

# Integer class index for every row (position of the 1 within label_cols).
y_indices = df_encoded[label_cols].values.argmax(axis=1)
X_raw = df_encoded[features_final].to_numpy(dtype=float)

# Re-split independently of the sklearn models: 70% train, 15% val, 15% test.
# The RAW features are split first so preprocessing can be fitted on training
# rows only.
X_tr_raw, X_temp_raw, y_tr, y_temp = train_test_split(
    X_raw, y_indices,
    test_size=0.3,
    random_state=42,
    stratify=y_indices
)
X_va_raw, X_te_raw, y_va, y_te = train_test_split(
    X_temp_raw, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp
)

# Fit the imputer and scaler on the training split only, then apply them to
# val/test. Fitting on all rows before splitting (as was done previously) leaks
# validation/test statistics into training. The strategy string is also fixed:
# it read 'ßmedian', which SimpleImputer rejects.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_tr = scaler.fit_transform(imputer.fit_transform(X_tr_raw))
X_va = scaler.transform(imputer.transform(X_va_raw))
X_te = scaler.transform(imputer.transform(X_te_raw))

# Convert to tensors (float32 features, int64 class indices)
train_data = TensorDataset(torch.as_tensor(X_tr, dtype=torch.float32),
                           torch.as_tensor(y_tr, dtype=torch.long))
val_data   = TensorDataset(torch.as_tensor(X_va, dtype=torch.float32),
                           torch.as_tensor(y_va, dtype=torch.long))
test_data  = TensorDataset(torch.as_tensor(X_te, dtype=torch.float32),
                           torch.as_tensor(y_te, dtype=torch.long))


# Only the training loader is shuffled; evaluation order does not matter.
batch_size = 32
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_data, batch_size=batch_size)
test_loader  = DataLoader(test_data, batch_size=batch_size)

#regularization
class ECGClassifier(nn.Module):
    """Feed-forward classifier: input_dim -> 64 -> 32 -> num_classes.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout (p=0.4 for
    the first, p=0.3 for the second). The output layer returns raw scores; no
    softmax is applied inside the module.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()

        # Hidden block 1
        self.layer1   = nn.Linear(input_dim, 64)
        self.bn1      = nn.BatchNorm1d(64)
        self.relu     = nn.ReLU()
        self.dropout1 = nn.Dropout(0.4)

        # Hidden block 2 (reuses the stateless ReLU defined above)
        self.layer2   = nn.Linear(64, 32)
        self.bn2      = nn.BatchNorm1d(32)
        self.dropout2 = nn.Dropout(0.3)

        # Output scores, one per class
        self.output   = nn.Linear(32, num_classes)

    def forward(self, x):
        """Map a (batch, input_dim) float tensor to (batch, num_classes) scores."""
        hidden = self.dropout1(self.relu(self.bn1(self.layer1(x))))
        hidden = self.dropout2(self.relu(self.bn2(self.layer2(hidden))))
        return self.output(hidden)

# Seed torch's global RNG so weight initialisation, dropout masks and the
# training DataLoader's shuffling are repeatable across runs, matching the
# random_state=42 used by the sklearn models. (Some GPU kernels may still be
# non-deterministic.)
torch.manual_seed(42)

# Initialize Model
input_dim   = X_tr.shape[1]
num_classes = len(label_cols)  # 4
model = ECGClassifier(input_dim, num_classes).to(device)

# Cross-entropy over the raw class scores produced by the model.
criterion = nn.CrossEntropyLoss()

# Adam with a small L2 penalty (weight_decay) as extra regularization.
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-4
)

# Early-stopping bookkeeping: stop after `patience` consecutive epochs in which
# the validation loss did not improve.
num_epochs    = 100
best_val_loss = float('inf')
patience      = 10
trigger_times = 0

# Per-epoch average losses, kept for the learning-curve plot.
train_losses = []
val_losses   = []

# Weights of the best epoch seen so far (None until the first improvement).
best_state_dict = None

print("Starting Neural Network Training...")

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        # Batches must be on the same device as the model; without this move
        # the loop fails as soon as `device` is a GPU.
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss    = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # ---- validation pass (no gradients, eval-mode BatchNorm/Dropout) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss    = criterion(outputs, labels)
            val_loss += loss.item()

    # Average of the per-batch mean losses.
    avg_train_loss = running_loss / len(train_loader)
    avg_val_loss   = val_loss / len(val_loader)

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        trigger_times = 0
        # state_dict() hands back references to the live tensors, which keep
        # changing as training continues. Clone them so the snapshot really
        # holds this epoch's weights.
        best_state_dict = {
            key: value.detach().clone()
            for key, value in model.state_dict().items()
        }
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{num_epochs} | "
              f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

# Restore the best-validation weights before the final evaluation.
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
else:
    # Only reachable when no epoch ever improved on the initial best_val_loss
    # (e.g. the validation loss was NaN throughout).
    print("WARNING: validation loss never improved; keeping the final weights")

# Final evaluation on the held-out test split, in inference mode.
model.eval()

y_true_test, y_pred_test = [], []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        scores = model(inputs)
        # Predicted class = index of the largest score in each row.
        predicted = torch.max(scores, 1).indices

        y_true_test.extend(labels.cpu().numpy())
        y_pred_test.extend(predicted.cpu().numpy())

print("\n=== Neural Network Final Test Results ===")
print("Accuracy:", accuracy_score(y_true_test, y_pred_test))
test_report = classification_report(
    y_true_test,
    y_pred_test,
    target_names=label_cols
)
print("\nClassification Report:\n", test_report)


# Learning curves: per-epoch average training vs validation loss.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses, label='Train Loss')
ax.plot(val_losses, label='Validation Loss')
ax.set_title('Neural Network Training vs Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


Moral of the story: a neural network is not needed.


In [None]:
# 5-fold stratified CV including Neural Network
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import precision_recall_fscore_support

# Prepare data: full feature table, string labels and the shared 5-fold splitter
X_all = df_encoded[features_final]
y_all = y_labels
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Traditional models, each a pipeline (imputer + optional scaler + classifier)
models = {
    "LogisticRegression": log_reg_pipeline,
    "RandomForestClassifier": rf_pipeline,
    "KNeighbourClassifier": knn_best_pipeline,
    "DecisionTreeClassifier": dt_pipeline,
    "GradientBoostingClassifier": gb_pipeline,
}

# Result column -> key under which cross_validate reports that macro metric.
metric_keys = {
    "Precision": 'test_precision_macro',
    "Recall": 'test_recall_macro',
    "F1 Score": 'test_f1_macro',
}

# Run CV for traditional models; scores are fold means expressed in percent.
results = []

for name, pipe in models.items():
    cv_scores = cross_validate(
        pipe, X_all, y_all, cv=skf,
        scoring=['precision_macro', 'recall_macro', 'f1_macro']
    )
    results.append({
        "Model": name,
        **{column: cv_scores[key].mean() * 100 for column, key in metric_keys.items()},
    })

# Neural Network CV (manual implementation)
print("Running Neural Network 5-fold CV...")

# Raw (un-imputed, un-scaled) features and integer class indices. Preprocessing
# is fitted inside each fold on the training rows only. Fitting the imputer and
# scaler on all rows up front, as was done previously, leaks validation-fold
# statistics into training.
X_raw_cv = X_all.to_numpy(dtype=float)
y_indices_cv = df_encoded[label_cols].values.argmax(axis=1)

nn_precisions = []
nn_recalls = []
nn_f1s = []

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_raw_cv, y_indices_cv)):
    print(f"  Fold {fold_idx + 1}/5", end='... ')

    # Seed per fold so initialisation, dropout and shuffling are repeatable.
    torch.manual_seed(42 + fold_idx)

    # Split data, then fit preprocessing on the training fold only
    imputer_cv = SimpleImputer(strategy='median')
    scaler_cv = StandardScaler()
    X_tr_fold = scaler_cv.fit_transform(imputer_cv.fit_transform(X_raw_cv[train_idx]))
    X_va_fold = scaler_cv.transform(imputer_cv.transform(X_raw_cv[val_idx]))
    y_tr_fold = y_indices_cv[train_idx]
    y_va_fold = y_indices_cv[val_idx]

    # Create dataloaders
    train_data_fold = TensorDataset(torch.FloatTensor(X_tr_fold), torch.LongTensor(y_tr_fold))
    val_data_fold = TensorDataset(torch.FloatTensor(X_va_fold), torch.LongTensor(y_va_fold))

    train_loader_fold = DataLoader(train_data_fold, batch_size=32, shuffle=True)
    val_loader_fold = DataLoader(val_data_fold, batch_size=32)

    # Initialize a fresh model/optimizer for this fold
    model_fold = ECGClassifier(X_tr_fold.shape[1], len(label_cols)).to(device)
    criterion_fold = nn.CrossEntropyLoss()
    optimizer_fold = optim.Adam(model_fold.parameters(), lr=0.001, weight_decay=1e-4)

    # Training with early stopping on the fold's validation loss.
    # NOTE(review): the same fold is used for early stopping and for the final
    # metrics, so the reported scores are probably slightly optimistic.
    best_val_loss_fold = float('inf')
    patience_fold = 10
    trigger_times_fold = 0
    best_state_dict_fold = None

    for epoch in range(100):
        model_fold.train()
        for inputs, labels in train_loader_fold:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer_fold.zero_grad()
            outputs = model_fold(inputs)
            loss = criterion_fold(outputs, labels)
            loss.backward()
            optimizer_fold.step()

        # Validation
        model_fold.eval()
        val_loss_fold = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader_fold:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model_fold(inputs)
                loss = criterion_fold(outputs, labels)
                val_loss_fold += loss.item()

        avg_val_loss_fold = val_loss_fold / len(val_loader_fold)

        if avg_val_loss_fold < best_val_loss_fold:
            best_val_loss_fold = avg_val_loss_fold
            trigger_times_fold = 0
            # state_dict() returns references to the live tensors; clone them so
            # later epochs cannot overwrite the saved best weights.
            best_state_dict_fold = {
                key: value.detach().clone()
                for key, value in model_fold.state_dict().items()
            }
        else:
            trigger_times_fold += 1
            if trigger_times_fold >= patience_fold:
                break

    # Load best model and evaluate
    if best_state_dict_fold is not None:
        model_fold.load_state_dict(best_state_dict_fold)

    model_fold.eval()
    y_true_fold = []
    y_pred_fold = []

    with torch.no_grad():
        for inputs, labels in val_loader_fold:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model_fold(inputs)
            _, predicted = torch.max(outputs, 1)
            y_true_fold.extend(labels.cpu().numpy())
            y_pred_fold.extend(predicted.cpu().numpy())

    # Calculate macro-averaged metrics for this fold
    p, r, f1, _ = precision_recall_fscore_support(
        y_true_fold, y_pred_fold, average='macro', zero_division=0
    )

    nn_precisions.append(p * 100)
    nn_recalls.append(r * 100)
    nn_f1s.append(f1 * 100)

    print(f"P: {p*100:.2f}%, R: {r*100:.2f}%, F1: {f1*100:.2f}%")

# Add NN results (mean over folds, in percent) next to the sklearn models
results.append({
    "Model": "NeuralNetwork",
    "Precision": np.mean(nn_precisions),
    "Recall": np.mean(nn_recalls),
    "F1 Score": np.mean(nn_f1s)
})

# Tabulate the CV scores of every model (sklearn pipelines + neural network).
results_df = pd.DataFrame(results)
print("\n=== 5-Fold CV Results (All Models) ===")
print(results_df)

# Visualization: models ordered from best to worst recall on the x-axis.
order = results_df.sort_values('Recall', ascending=False)['Model']

# One row per (model, metric) pair, as seaborn expects for grouped bars.
results_long = pd.melt(
    results_df,
    id_vars='Model',
    value_vars=['Precision', 'Recall', 'F1 Score'],
    var_name='Metric',
    value_name='Score',
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=results_long, x='Model', y='Score', hue='Metric', order=order, ax=ax)

ax.set_ylabel('Percentage %')
ax.set_xlabel('Model Name')
ax.set_title('Model Performance Metrics (5-fold CV, Macro Averages)\nSorted by Recall')
ax.set_ylim(90, 100)  # zoomed in so the differences are visible; bars below 90% are cut off
plt.setp(ax.get_xticklabels(), rotation=25, ha='right')
ax.legend(title='Metric')
fig.tight_layout()
plt.show()