In [None]:
import json
import matplotlib.pyplot as plt

metrics_file_path = "gv_experiments/finetune/ResMLP_MAE_fingerprint_molformer_github_pretrained_on:ABCD_finetuned_on:ABCD_for_11_epochs_weight_decay:0.01_lr:0.5/metrics_seed0.json"

# Read the metric histories stored for this single run.
with open(metrics_file_path, "r") as f:
    metrics = json.load(f)

# A key that is absent from the file falls back to an empty series,
# which simply draws nothing.
train_loss, val_loss, mcc = (
    metrics.get(key, []) for key in ("train_loss", "val_loss", "mcc_val")
)

# All three curves share one axis; the x position is the list index.
# NOTE(review): the y-axis is labelled "Loss" although MCC is drawn on the
# same axis — consider a neutral label or a secondary axis.
plt.figure(figsize=(10, 6))
for series, legend_name in (
    (train_loss, "Train Loss"),
    (val_loss, "Validation Loss"),
    (mcc, "MCC"),
):
    plt.plot(series, label=legend_name, marker="o")

plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training - Validation Losses and MCC over Epochs")
plt.legend()
plt.grid(True)
plt.tight_layout()

# To also write the figure to disk, enable: plt.savefig("loss_plot.png")
plt.show()

In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from glob import glob

# Root folder that contains the fine-tuning run directories
finetune_root = "gv_experiments/finetune"

# Collect every "metrics_seed0.json" below the root. The "**" path component
# is what makes recursive=True take effect (it matches zero or more nested
# directories); with a plain "*" only direct children are matched and the
# flag is ignored. sorted() keeps the load order stable between executions.
metrics_files = sorted(
    glob(os.path.join(finetune_root, "**", "metrics_seed0.json"), recursive=True)
)
print(f"Found {len(metrics_files)} metric files.")

# One entry per run; each entry is that run's list of per-epoch values
train_losses = []
val_losses = []
mcc_values = []

# Longest history seen so far, across all runs and all three metrics
max_epochs = 0

for file in metrics_files:
    # The parent folder name identifies the run in the log output
    folder_name = os.path.basename(os.path.dirname(file))

    print(f"Loading file: {file} (Run: {folder_name})")
    with open(file, "r") as f:
        metrics = json.load(f)

    # Missing keys become empty histories so the three lists stay aligned
    train_losses.append(metrics.get("train_loss", []))
    val_losses.append(metrics.get("val_loss", []))
    mcc_values.append(metrics.get("mcc_val", []))

    # Consider every metric, not only train_loss: otherwise a longer
    # val_loss / mcc_val history would be cut off when the series are later
    # brought to max_epochs entries.
    max_epochs = max(
        max_epochs,
        len(train_losses[-1]),
        len(val_losses[-1]),
        len(mcc_values[-1]),
    )

# Raise instead of calling exit(): exit() would end the session (in a notebook
# it shuts the kernel down and discards loaded state), whereas an exception
# stops the run and leaves a traceback pointing at the cause.
if not train_losses:
    raise FileNotFoundError(
        "No valid metric data found. Please check the metrics files."
    )

# Pad all metrics to the maximum epoch length with NaNs
def pad_to_max_length(data, max_length):
    """Return a copy of ``data`` in which every run has ``max_length`` entries.

    Args:
        data: Iterable of lists, one list of values per run.
        max_length: Number of entries each returned run should have.

    Returns:
        A new list of lists. Runs shorter than ``max_length`` are extended
        with ``np.nan``; all other runs are cut to their first ``max_length``
        entries.
    """
    def _fit(series):
        shortfall = max_length - len(series)
        if shortfall > 0:
            # Too short: fill the tail with NaN placeholders
            return series + [np.nan] * shortfall
        # Long enough: keep only the leading max_length values
        return series[:max_length]

    return [_fit(series) for series in data]

# Bring every run to max_epochs entries and stack the runs into float arrays
# (float dtype is required so the NaN padding can be represented).
train_losses = np.array(pad_to_max_length(train_losses, max_epochs), dtype=float)
val_losses = np.array(pad_to_max_length(val_losses, max_epochs), dtype=float)
mcc_values = np.array(pad_to_max_length(mcc_values, max_epochs), dtype=float)

# Nothing to aggregate or plot if any array has no elements. Raise instead of
# calling exit(): exit() would end the session (in a notebook it shuts the
# kernel down), while an exception stops the run with a clear traceback.
if not train_losses.size or not val_losses.size or not mcc_values.size:
    raise ValueError("One or more metric arrays are empty.")

# Calculate the mean and standard deviation while ignoring NaNs
def calculate_mean_and_std(data):
    """Compute NaN-ignoring statistics along axis 0 of ``data``.

    Args:
        data: Array-like passed unchanged to ``np.nanmean`` / ``np.nanstd``.

    Returns:
        Tuple ``(mean, std)`` holding the results of ``np.nanmean`` and
        ``np.nanstd`` evaluated with ``axis=0``.
    """
    return np.nanmean(data, axis=0), np.nanstd(data, axis=0)

# Shared x-axis: one position per epoch index
epochs = np.arange(max_epochs)

# Mean and standard deviation over runs (axis 0) for each tracked metric
train_mean, train_std = calculate_mean_and_std(train_losses)
val_mean, val_std = calculate_mean_and_std(val_losses)
mcc_mean, mcc_std = calculate_mean_and_std(mcc_values)

# One panel per metric, side by side
fig, axs = plt.subplots(1, 3, figsize=(20, 6))

# (mean, std, colour, line label, panel title, y-axis label) for each panel,
# in left-to-right order
panel_specs = (
    (train_mean, train_std, 'blue', 'Train Loss Mean', "Training Loss Trend", "Loss"),
    (val_mean, val_std, 'green', 'Validation Loss Mean', "Validation Loss Trend", "Loss"),
    (mcc_mean, mcc_std, 'purple', 'MCC Mean', "MCC Trend", "MCC"),
)

for ax, (center, spread, colour, line_label, panel_title, y_label) in zip(axs, panel_specs):
    # Mean curve with a translucent band one standard deviation wide each way
    ax.plot(epochs, center, color=colour, label=line_label)
    ax.fill_between(epochs, center - spread, center + spread,
                    color=colour, alpha=0.2, label='±1 Std Dev')
    ax.set_title(panel_title)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(y_label)
    ax.legend()

# Figure-level title; the rect leaves the top 5% of the canvas free for it
fig.suptitle("Metric Trends with Variance Across Fine-tuning Runs", fontsize=16)
fig.tight_layout(rect=[0, 0, 1, 0.95])

# Write the figure next to the run folders, then display it
output_path = os.path.join(finetune_root, "metric_trend_waves.png")
plt.savefig(output_path)
print(f"Metric trend plot saved to {output_path}")
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import matthews_corrcoef

file_path = "ABCD/ResultsAndCheckpoints/ABCD/MAE_Mol/new_loader_MAE_Mol_ABCD_DRIAMS-any_specific_results/test_set_seed0.csv"
df = pd.read_csv(file_path)

# Keep only the Klebsiella pneumoniae rows. .copy() detaches the subset from
# the frame read from disk, so the column assignments below write to an
# independent DataFrame instead of a filtered slice (the pattern that triggers
# pandas' SettingWithCopyWarning).
df = df[df["species"] == "Klebsiella pneumoniae"].copy()

# Without any matching rows there is nothing to score; say so explicitly
# rather than handing empty inputs to the metric.
if df.empty:
    raise ValueError(
        f"No rows with species 'Klebsiella pneumoniae' found in {file_path}"
    )

# Binarise the "Predictions" column: values >= 0.5 become class 1, others 0
df["Predicted_Binary"] = (df["Predictions"] >= 0.5).astype(int)

# Cast the ground-truth column to int so both inputs to the metric are integers
df["response"] = df["response"].astype(int)

mcc = matthews_corrcoef(df["response"], df["Predicted_Binary"])

print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")