In [None]:
import json
import matplotlib.pyplot as plt

# Metrics JSON of one run stored under gv_experiments/finetune.
metrics_file_path = "gv_experiments/finetune/ResMLP_MAE_fingerprint_molformer_github_pretrained_on:ABCD_finetuned_on:ABCD_for_11_epochs_weight_decay:0.01_lr:0.5/metrics_seed0.json"

with open(metrics_file_path, "r") as f:
    metrics = json.load(f)

# A key that is absent from the JSON falls back to an empty series,
# which simply draws nothing for that curve.
train_loss = metrics.get("train_loss", [])
val_loss = metrics.get("val_loss", [])
mcc = metrics.get("mcc_val", [])

# Explicit figure/axes handles instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_loss, label="Train Loss", marker="o")
ax.plot(val_loss, label="Validation Loss", marker="o")
ax.plot(mcc, label="MCC", marker="o")
ax.set_xlabel("Epochs")
# Both loss curves and the MCC curve share this axis, so the label must not
# claim that every plotted value is a loss.
ax.set_ylabel("Loss / MCC")
ax.set_title("Training - Validation Losses and MCC over Epochs")
ax.legend()
ax.grid(True)
fig.tight_layout()

# Uncomment to also write the figure to disk.
# fig.savefig("loss_plot.png")
plt.show()

In [None]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from glob import glob

# Root folder that contains the fine-tuning run directories.
finetune_root = "gv_experiments/finetune"

# "**" combined with recursive=True descends through every nesting level below
# the root (including the root itself). A single "*" only looks one directory
# deep, and recursive=True has no effect on it.
# sorted() makes the load order, and therefore the log below, reproducible.
metrics_files = sorted(
    glob(os.path.join(finetune_root, "**", "metrics_seed0.json"), recursive=True)
)
print(f"Found {len(metrics_files)} metric files.")

# One entry per run; each entry is that run's list of per-epoch values.
train_losses = []
val_losses = []
mcc_values = []

# Longest series seen in any run, over all three metrics, so that no series
# is silently truncated when the arrays are padded to a common length later.
max_epochs = 0

for file in metrics_files:
    # The parent folder name identifies the run in the log output.
    folder_name = os.path.basename(os.path.dirname(file))

    print(f"Loading file: {file} (Run: {folder_name})")
    with open(file, "r") as f:
        metrics = json.load(f)

    # Missing keys are treated as an empty series for that run.
    run_train = metrics.get("train_loss", [])
    run_val = metrics.get("val_loss", [])
    run_mcc = metrics.get("mcc_val", [])

    train_losses.append(run_train)
    val_losses.append(run_val)
    mcc_values.append(run_mcc)
    max_epochs = max(max_epochs, len(run_train), len(run_val), len(run_mcc))

# Abort when no file was found or every file was empty. Raising SystemExit
# stops execution immediately; inside a Jupyter kernel a bare exit() call does
# not raise, so the statements that follow would still run on empty data.
if not metrics_files or max_epochs == 0:
    raise SystemExit("Error: No valid metric data found. Please check the metrics files.")

# Pad all metrics to the maximum epoch length with NaNs
def pad_to_max_length(data, max_length):
    """Bring every run in ``data`` to exactly ``max_length`` entries.

    Runs shorter than ``max_length`` are extended with ``np.nan``; all other
    runs are cut down to their first ``max_length`` items. The input runs are
    not modified.

    Args:
        data: Iterable of per-run lists.
        max_length (int): Target number of entries per run.

    Returns:
        list: One new sequence per input run, in the same order.
    """
    def _fit(run):
        missing = max_length - len(run)
        if missing > 0:
            # Too short: fill the tail with NaN placeholders.
            return run + [np.nan] * missing
        # Long enough: keep only the leading max_length entries.
        return run[:max_length]

    return [_fit(run) for run in data]

# Pad every run to the common length and convert to 2-D arrays (runs x epochs).
# dtype=float is required so the NaN padding can be represented.
train_losses = np.array(pad_to_max_length(train_losses, max_epochs), dtype=float)
val_losses = np.array(pad_to_max_length(val_losses, max_epochs), dtype=float)
mcc_values = np.array(pad_to_max_length(mcc_values, max_epochs), dtype=float)

# Nothing to plot if any array has zero elements. SystemExit is raised instead
# of calling exit(): inside a Jupyter kernel exit() does not raise, so the rest
# of the cell would still execute on the empty arrays.
if not train_losses.size or not val_losses.size or not mcc_values.size:
    raise SystemExit("Error: One or more metric arrays are empty.")

# Calculate the mean and standard deviation while ignoring NaNs
def calculate_mean_and_std(data):
    """Column-wise mean and standard deviation that skip NaN entries.

    Args:
        data: 2-D array-like; statistics are taken along axis 0.

    Returns:
        tuple: ``(mean, std)`` as computed by ``np.nanmean`` and ``np.nanstd``.
    """
    return np.nanmean(data, axis=0), np.nanstd(data, axis=0)

# Shared x-axis: epoch indices 0 .. max_epochs - 1.
epochs = np.arange(max_epochs)

# Per-epoch mean and spread across runs for each metric.
train_mean, train_std = calculate_mean_and_std(train_losses)
val_mean, val_std = calculate_mean_and_std(val_losses)
mcc_mean, mcc_std = calculate_mean_and_std(mcc_values)

# One panel per metric: mean line plus a shaded band of one standard deviation.
fig, axs = plt.subplots(1, 3, figsize=(20, 6))

# (mean, std, colour, line label, panel title, y-axis label), left to right.
panels = [
    (train_mean, train_std, 'blue', 'Train Loss Mean', "Training Loss Trend", "Loss"),
    (val_mean, val_std, 'green', 'Validation Loss Mean', "Validation Loss Trend", "Loss"),
    (mcc_mean, mcc_std, 'purple', 'MCC Mean', "MCC Trend", "MCC"),
]

for ax, (mean, std, color, line_label, title, y_label) in zip(axs, panels):
    ax.plot(epochs, mean, color=color, label=line_label)
    ax.fill_between(epochs, mean - std, mean + std,
                    color=color, alpha=0.2, label='±1 Std Dev')
    ax.set_title(title)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(y_label)
    ax.legend()

# Figure-level title; the rect leaves the top 5% free so it does not overlap the panels.
plt.suptitle("Metric Trends with Variance Across Fine-tuning Runs", fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])

# Write the figure next to the run folders, then display it.
output_path = os.path.join(finetune_root, "metric_trend_waves.png")
plt.savefig(output_path)
print(f"Metric trend plot saved to {output_path}")
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import matthews_corrcoef

file_path = "ABCD/ResultsAndCheckpoints/ABCD/MAE_Mol/new_loader_MAE_Mol_ABCD_DRIAMS-any_specific_results/test_set_seed0.csv"
df = pd.read_csv(file_path)

# Keep only the Klebsiella pneumoniae rows. The explicit .copy() detaches the
# subset from the full table, so the column assignments below write to an
# independent frame and do not trigger pandas' SettingWithCopyWarning.
df = df[(df["species"] == "Klebsiella pneumoniae")].copy()

# Fail with a clear message instead of an opaque error from the metric call.
if df.empty:
    raise ValueError(f"No 'Klebsiella pneumoniae' rows found in {file_path}")

# Binarise the predictions at a 0.5 threshold.
df["Predicted_Binary"] = (df["Predictions"] >= 0.5).astype(int)

# Labels must be integers to be compared against the binarised predictions.
df["response"] = df["response"].astype(int)

mcc = matthews_corrcoef(df["response"], df["Predicted_Binary"])

print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")

In [None]:
import pandas as pd

csv_file = "processed_data/DRIAMS_combined_long_table.csv"
data = pd.read_csv(csv_file)

# Filter to Klebsiella pneumoniae and rename 'drug' -> 'Name' in one chain.
# rename() without inplace=True returns a new frame, so nothing is written
# into the filtered slice of `data` (the in-place variant raises
# SettingWithCopyWarning).
klebsiella_data = (
    data[data['species'] == "Klebsiella pneumoniae"]
    .rename(columns={'drug': 'Name'})
)

# Mean of 'response' per drug, scaled to percent
# (assumes 'response' is coded 0/1 - TODO confirm).
percentages = klebsiella_data.groupby('Name')['response'].mean() * 100

# Number of distinct sample IDs per drug.
sample_counts = klebsiella_data.groupby('Name')['sample_id'].nunique()

# Both series are indexed by 'Name'; reset_index turns it back into a column.
result = pd.DataFrame({
    'percentage_1_responses': percentages,
    'distinct_sample_count': sample_counts
}).reset_index()

result.to_csv("klebsiella_data/1klebsiella_response_stats.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  klebsiella_data.rename(columns={'drug': 'Name'}, inplace=True)


In [None]:
import pandas as pd

csv_file = "processed_data/DRIAMS_combined_long_table.csv"
data = pd.read_csv(csv_file)

# Every drug name that occurs anywhere in the dataset, regardless of species.
all_drugs = data['drug'].unique()

# Filter Klebsiella pneumoniae rows and rename 'drug' -> 'Name' for the merge
# below. rename() is used without inplace=True so that a new frame is returned
# rather than mutating a filtered slice of `data` (which raises
# SettingWithCopyWarning).
klebsiella_data = (
    data[data['species'] == "Klebsiella pneumoniae"]
    .rename(columns={'drug': 'Name'})
)

# Mean of 'response' per drug, scaled to percent
# (assumes 'response' is coded 0/1 - TODO confirm).
percentages = klebsiella_data.groupby('Name')['response'].mean() * 100

# Number of distinct sample IDs per drug.
sample_counts = klebsiella_data.groupby('Name')['sample_id'].nunique()

# Statistics for the drugs that are present in the Klebsiella subset.
result = pd.DataFrame({
    'percentage_1_responses': percentages,
    'distinct_sample_count': sample_counts
}).reset_index()

# One row per drug of the full dataset.
all_drugs_df = pd.DataFrame({'Name': all_drugs})

# Left merge keeps every drug; drugs without Klebsiella data get NaN statistics.
final_result = all_drugs_df.merge(result, on='Name', how='left')

# Save to CSV
final_result.to_csv("klebsiella_full_response_stats.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  klebsiella_data.rename(columns={'drug': 'Name'}, inplace=True)


In [None]:
## KNOWN DRUG SPEARMAN CORRELATION COMPUTATION ## WINDSURFING

import pandas as pd

input_type = "raw_morgan"

# Inputs: per-drug response statistics and the known-drug ranking produced
# for the selected input type.
percentage = pd.read_csv("klebsiella_data/klebsiella_response_stats.csv")
proba = pd.read_csv(f"rankings/known_drug_ranks_{input_type}_windsurfing.csv")

# Headers are assigned by position, so the ranking file must hold exactly
# three columns in this order.
proba.columns = ["Name", "Placement_Score", "Rank"]

# Inner join on the drug name: only drugs present in both tables remain.
data = percentage.merge(proba, on="Name")

# Quality control: keep drugs measured on more than 100 distinct samples.
data = data.loc[data["distinct_sample_count"] > 100]

# Spearman rank correlation between the observed response percentage and the rank.
correlation_rank = data["percentage_1_responses"].corr(data["Rank"], method="spearman")
#correlation_probability = data["percentage_1_responses"].corr(data["Average_Probability"], method="spearman")

# Report the result.
print(f"Spearman correlation between percentage_1_responses and Rank: {correlation_rank}")
#print(f"Spearman correlation between percentage_1_responses and Average_Probability: {correlation_probability}")


Spearman correlation between percentage_1_responses and Rank: 0.7447769621682666


In [None]:
## AVERAGE PROBABILITY RANKING METHOD ##

def calculate_probability_ranking(
    input_directory,
    exclusion_file,
    output_file,
    compound_list_file="compound_lists/Enamine_Hit_Locator_with_fingerprints.csv",
):
    """
    Rank compounds by their average predicted probability across samples.

    Every ``.csv`` file in ``input_directory`` whose name does not contain an
    excluded sample ID is read for its ``SMILES`` and ``Predictions`` columns.
    Within one file only the first row per SMILES is used. Predictions are
    averaged per SMILES over all files, joined with the compound list on
    ``SMILES`` and written to ``output_file`` in ascending order of
    ``Average_Probability``.

    Args:
        input_directory (str): Directory containing prediction files.
        exclusion_file: Forwarded to ``load_excluded_samples`` (presumably the
            path of a file listing sample IDs - confirm against that helper).
            The returned IDs are matched as substrings of each file name.
        output_file (str): Path to save the ranked results.
        compound_list_file (str): CSV with the compound metadata; it must
            provide the columns selected for the output below. Defaults to
            the path that was previously hard-coded.

    Returns:
        None. The ranking is written to ``output_file``; nothing is written
        when no prediction file yields data.
    """
    excluded_samples = load_excluded_samples(exclusion_file)
    aggregate_dict = {}

    # os.listdir returns entries in arbitrary order; sorting fixes the order in
    # which floats are summed, so repeated runs produce identical averages.
    for file_name in sorted(os.listdir(input_directory)):
        # Skip files containing excluded sample IDs
        if any(sample in file_name for sample in excluded_samples):
            print(f"Excluding file: {file_name}")
            continue

        # Process only CSV files
        if not file_name.endswith(".csv"):
            continue

        file_path = os.path.join(input_directory, file_name)
        print(f"Processing file: {file_path}")

        try:
            df = pd.read_csv(file_path, usecols=["SMILES", "Predictions"])
        except ValueError:
            # read_csv raises ValueError when a requested column is absent.
            print(f"Skipping file {file_path}: Missing required columns.")
            continue

        # Each compound contributes at most once per file.
        df = df.drop_duplicates(subset=["SMILES"])

        # Iterate over the two columns directly; iterrows() would build a
        # Series object for every row just to read two values.
        for smiles, prediction in zip(df["SMILES"], df["Predictions"]):
            entry = aggregate_dict.get(smiles)
            if entry is None:
                aggregate_dict[smiles] = {"total": prediction, "count": 1}
            else:
                entry["total"] += prediction
                entry["count"] += 1

    if not aggregate_dict:
        print("No valid data was processed.")
        return

    # Compute the average probability for each compound
    aggregate_df = pd.DataFrame([
        {"SMILES": smiles, "Average_Probability": data["total"] / data["count"]}
        for smiles, data in aggregate_dict.items()
    ])

    # Attach the compound metadata (inner join: compounds missing from the
    # compound list are dropped) and keep the reported columns.
    compounds = pd.read_csv(compound_list_file)
    ranked = pd.merge(aggregate_df, compounds, on="SMILES")
    ranked = ranked[["Name", "SMILES", "MW", "ClogP", "HBD", "TPSA", "RotBonds",
                     "Morgan_Fingerprint", "Average_Probability"]]

    # A single sort after the merge is sufficient; mergesort is stable, so
    # compounds with equal averages keep a reproducible relative order.
    ranked = ranked.sort_values(by="Average_Probability", ascending=True, kind="mergesort")

    # Save ranked results
    ranked.to_csv(output_file, index=False)
    print(f"Ranking complete. Results saved to {output_file}")


# "probability" ranking method:
#   - Aggregates resistance probability predictions across samples.
#   - Computes the average probability for each compound.
#   - Sorts compounds by increasing resistance probability.
if method == "probability":
    calculate_probability_ranking(input_directory, excluded_samples_file, output_file)

In [None]:
## KNOWN DRUG PROBABILITY AGGREGATION FUNCTION ##

def calculate_known_drug_probabilities(input_directory, exclusion_file, output_file):
    """
    Aggregate per-sample predictions into one average probability per known drug.

    Every ``.csv`` file in ``input_directory`` whose name does not contain an
    excluded sample ID is read for its ``Drugs`` and ``Predictions`` columns.
    Within one file only the first row per drug is used. Predictions are
    averaged per drug over all files and written to ``output_file`` with the
    columns ``Name`` and ``Average_Probability``, in ascending order of the
    average.

    Args:
        input_directory (str): Directory containing prediction files.
        exclusion_file: Forwarded to ``load_excluded_samples`` (presumably the
            path of a file listing sample IDs - confirm against that helper).
            The returned IDs are matched as substrings of each file name.
        output_file (str): Path to the output file where results will be saved.

    Returns:
        None. Nothing is written when no prediction file yields data.
    """
    excluded_samples = load_excluded_samples(exclusion_file)
    aggregate_dict = {}

    # os.listdir returns entries in arbitrary order; sorting fixes the order in
    # which floats are summed, so repeated runs produce identical averages.
    for file_name in sorted(os.listdir(input_directory)):
        # Skip files containing excluded sample IDs
        if any(sample in file_name for sample in excluded_samples):
            print(f"Excluding file: {file_name}")
            continue

        # Process only CSV files
        if not file_name.endswith(".csv"):
            continue

        file_path = os.path.join(input_directory, file_name)
        print(f"Processing file: {file_path}")

        try:
            # Load predictions from CSV file
            df = pd.read_csv(file_path, usecols=["Drugs", "Predictions"])
        except ValueError:
            # read_csv raises ValueError when a requested column is absent.
            print(f"Skipping file {file_path}: Missing required columns.")
            continue

        # Remove duplicate drug entries within the file
        df = df.drop_duplicates(subset=["Drugs"])

        # Iterate over the two columns directly; iterrows() would build a
        # Series object for every row just to read two values.
        for name, prediction in zip(df["Drugs"], df["Predictions"]):
            entry = aggregate_dict.get(name)
            if entry is None:
                aggregate_dict[name] = {"total": prediction, "count": 1}
            else:
                entry["total"] += prediction
                entry["count"] += 1

    if not aggregate_dict:
        print("No valid data was processed.")
        return

    # Convert aggregated data into a DataFrame
    aggregate_df = pd.DataFrame([
        {"Name": name, "Average_Probability": data["total"] / data["count"]}
        for name, data in aggregate_dict.items()
    ])

    # Sort predictions in ascending order of probability
    aggregate_df = aggregate_df.sort_values(by="Average_Probability", ascending=True)

    # Save to CSV file
    aggregate_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")