### Compound Ranking Notebook  

This notebook contains all the code related to the **compound ranking** step of the pipeline. Each cell includes a detailed description of its function.  

#### Key Functions:
1. **Ranking Vendor Compounds Based on Resistance Probability Predictions:**  
   - Uses different ranking strategies to evaluate the likelihood of vendor compounds exhibiting resistance.  
   - Three ranking methods are implemented:  
     - **Probability-based ranking:** Computes the **average resistance probability** for each compound across test samples.  
     - **Windsurfing placement ranking:** Ranks the compounds **within each test sample** and **sums** each compound's placements across samples (equivalent to averaging when every compound appears in every sample).  
     - **Log probability sum ranking:** Computes the **sum of log resistance probabilities** to emphasize consistently high predictions.  

2. **Generating Ranked Lists of Vendor Compounds:**  
   - Saves ranked compounds as **CSV files** in the `rankings/` directory.  
   - Each ranking method produces a separate list of ranked compounds.  

3. **Combining Ranking Strategies:**  
   - Identifies **common top "n" compounds** from both the **Log-Probability-Sum** and the **Placement-based (windsurfing)** rankings.  
   - Merges rankings and **sorts by the log-probability-sum rank** for final analysis.  
   - Saves a **combined ranking** of compounds to `rankings/combined_ranking_top{n}_{input_type}_log.csv`.  

4. **Identification of Known Drug Ranks:**  
   - Determines how **known antibiotics rank among vendor compounds** based on predicted resistance probabilities.  

5. **Spearman Correlation Analysis:**  
   - Computes the **Spearman correlation** between:  
     - `known_drug_ranks` (ranks of known drugs among vendor compounds).  
     - `percentage_of_1_responses` (percentage of resistant test samples to a specific known antibiotic). 


In [1]:
import os
import pandas as pd
import numpy as np
import polars as pl

In [2]:
## HELPER FUNCTION ##

def load_excluded_samples(file_path):
    """Load excluded sample IDs from a text file.

    Each line of the file is treated as one sample ID; surrounding whitespace
    is stripped. Blank (or whitespace-only) lines are ignored, because an empty
    ID is a substring of every file name and would make substring-based
    exclusion checks reject every prediction file.

    Args:
        file_path (str): Path to a text file with one sample ID per line.

    Returns:
        set: The non-empty, stripped sample IDs found in the file.
    """
    with open(file_path, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return {sample_id for sample_id in stripped_lines if sample_id}

## WINDSURFING PLACEMENT RANKING METHOD ##

def calculate_windsurfing_ranking(
    input_directory,
    exclusion_file,
    output_file,
    compound_list_file="compound_lists/Enamine_Hit_Locator_with_fingerprints.csv",
):
    """
    Rank vendor compounds by their summed placement across sample prediction files.

    Within every processed prediction file the compounds are de-duplicated by
    SMILES, sorted by ascending "Predictions" and given a placement
    (1 = lowest prediction). The placements are summed per SMILES over all
    processed files, joined with the vendor compound list and written to
    ``output_file`` sorted by ascending "Placement_Score".

    Note that scores are sums, not averages: a compound that is missing from
    some files accumulates fewer placements than one present in all of them.

    Args:
        input_directory (str): Directory containing prediction files.
        exclusion_file (str): File listing sample IDs to be excluded. A file is
            skipped when any listed ID occurs as a substring of its name.
        output_file (str): Path to save the ranking results.
        compound_list_file (str): CSV with the vendor compounds ("Name",
            "SMILES", descriptor and fingerprint columns). Defaults to the
            Enamine Hit Locator list used by the pipeline.
    """
    # An empty ID is a substring of every file name and would exclude all files.
    excluded_samples = {sample for sample in load_excluded_samples(exclusion_file) if sample}
    per_file_placements = []

    for file_name in os.listdir(input_directory):
        if not file_name.endswith(".csv"):
            continue
        if any(excluded in file_name for excluded in excluded_samples):
            continue

        file_path = os.path.join(input_directory, file_name)
        print(f"Processing file: {file_path}")

        try:
            df = pd.read_csv(file_path, usecols=["SMILES", "Predictions"])
        except ValueError:
            print(f"Skipping file {file_path}: Missing required columns.")
            continue

        df = df.drop_duplicates(subset=["SMILES"])
        df = df.sort_values(by="Predictions", ascending=True).reset_index(drop=True)

        # Placement is the 1-based position in the sorted list.
        per_file_placements.append(
            df[["SMILES"]].assign(Placement_Score=np.arange(1, len(df) + 1))
        )

    if per_file_placements:
        # sort=False keeps compounds in first-seen order; rows whose SMILES is
        # missing are dropped by groupby.
        placement_df = (
            pd.concat(per_file_placements, ignore_index=True)
            .groupby("SMILES", sort=False)["Placement_Score"]
            .sum()
            .reset_index()
        )
    else:
        placement_df = pd.DataFrame(columns=["SMILES", "Placement_Score"])

    if placement_df.empty:
        print("No valid data was processed.")
        return

    placement_df = placement_df.sort_values(by="Placement_Score", ascending=True)

    # Attach names, descriptors and fingerprints from the vendor compound list.
    compound_df = pd.read_csv(compound_list_file)
    ranked_df = pd.merge(placement_df, compound_df, on="SMILES")
    ranked_df = ranked_df[
        ["Name", "SMILES", "MW", "ClogP", "HBD", "TPSA", "RotBonds", "Morgan_Fingerprint", "Placement_Score"]
    ]
    ranked_df = ranked_df.sort_values(by="Placement_Score", ascending=True)

    # Save ranked results
    ranked_df.to_csv(output_file, index=False)
    print(f"Ranking complete. Results saved to {output_file}")


## LOG PROBABILITY SUM METHOD ##

def calculate_log_prob_sum_ranking(
    input_directory,
    exclusion_file,
    output_file,
    compound_list_file="compound_lists/Enamine_Hit_Locator_with_fingerprints.csv",
):
    """
    Rank vendor compounds by the sum of the logs of their predicted probabilities.

    For every processed prediction file the compounds are de-duplicated by
    SMILES and ``log(Predictions)`` is accumulated per SMILES. The sums are
    joined with the vendor compound list and written to ``output_file`` sorted
    by ascending "LogSumScore" (lowest summed log-probability first).

    Predictions that are not strictly positive, including missing (NaN)
    values, cannot be log-transformed meaningfully; they are reported and
    left out of the sum.

    Args:
        input_directory (str): Directory containing prediction files.
        exclusion_file (str): File listing sample IDs to be excluded. A file is
            skipped when any listed ID occurs as a substring of its name.
        output_file (str): Path to save the ranking results.
        compound_list_file (str): CSV with the vendor compounds ("Name",
            "SMILES", descriptor and fingerprint columns). Defaults to the
            Enamine Hit Locator list used by the pipeline.
    """
    # An empty ID is a substring of every file name and would exclude all files.
    excluded_samples = {sample for sample in load_excluded_samples(exclusion_file) if sample}
    per_file_logs = []

    for file_name in os.listdir(input_directory):
        if any(sample in file_name for sample in excluded_samples):
            print(f"Excluding file: {file_name}")
            continue

        if not file_name.endswith(".csv"):
            continue

        file_path = os.path.join(input_directory, file_name)
        print(f"Processing file: {file_path}")

        try:
            df = pd.read_csv(file_path, usecols=["SMILES", "Predictions"])
        except ValueError:
            print(f"Skipping file {file_path}: Missing required columns.")
            continue

        df = df.drop_duplicates(subset=["SMILES"])

        # "> 0" is False for NaN, so missing predictions are rejected as well.
        is_valid = df["Predictions"] > 0
        invalid_rows = df.loc[~is_valid, ["SMILES", "Predictions"]]
        for smiles, prediction in invalid_rows.itertuples(index=False, name=None):
            print(f"Skipping invalid prediction value for {smiles}: {prediction}")

        per_file_logs.append(
            df.loc[is_valid, ["SMILES"]].assign(
                LogSumScore=np.log(df.loc[is_valid, "Predictions"])
            )
        )

    if per_file_logs:
        # sort=False keeps compounds in first-seen order; rows whose SMILES is
        # missing are dropped by groupby.
        aggregate_df = (
            pd.concat(per_file_logs, ignore_index=True)
            .groupby("SMILES", sort=False)["LogSumScore"]
            .sum()
            .reset_index()
        )
    else:
        aggregate_df = pd.DataFrame(columns=["SMILES", "LogSumScore"])

    if aggregate_df.empty:
        print("No valid data was processed.")
        return

    aggregate_df = aggregate_df.sort_values(by="LogSumScore", ascending=True)

    # Attach names, descriptors and fingerprints from the vendor compound list.
    compound_df = pd.read_csv(compound_list_file)
    ranked_df = pd.merge(aggregate_df, compound_df, on="SMILES")
    ranked_df = ranked_df[
        ["Name", "SMILES", "MW", "ClogP", "HBD", "TPSA", "RotBonds", "Morgan_Fingerprint", "LogSumScore"]
    ]
    ranked_df = ranked_df.sort_values(by="LogSumScore", ascending=True)

    ranked_df.to_csv(output_file, index=False)
    print(f"Ranking complete. Results saved to {output_file}")

## KNOWN DRUG PROBABILITY AGGREGATION FUNCTIONS ##

def calculate_known_drug_log_probability_sum_ranks(input_directory, exclusion_file, output_file):
    """
    Aggregate known-drug predictions with the log probability sum method.

    For every processed prediction file the drugs are de-duplicated by the
    "Drugs" column and ``log(Predictions)`` is accumulated per drug. The result
    is written to ``output_file`` with the columns "Name" and "LogProbSum",
    sorted by ascending "LogProbSum".

    Predictions that are not strictly positive, including missing (NaN)
    values, are reported and left out of the sum.

    Args:
        input_directory (str): Directory containing prediction files.
        exclusion_file (str): File listing sample IDs to be excluded. A file is
            skipped when any listed ID occurs as a substring of its name.
        output_file (str): Path to the output file where results will be saved.
    """
    # An empty ID is a substring of every file name and would exclude all files.
    excluded_samples = {sample for sample in load_excluded_samples(exclusion_file) if sample}
    per_file_logs = []

    for file_name in os.listdir(input_directory):
        # Skip files containing excluded sample IDs
        if any(sample in file_name for sample in excluded_samples):
            print(f"Excluding file: {file_name}")
            continue

        # Process only CSV files
        if not file_name.endswith(".csv"):
            continue

        file_path = os.path.join(input_directory, file_name)
        print(f"Processing file: {file_path}")

        try:
            df = pd.read_csv(file_path, usecols=["Drugs", "Predictions"])
        except ValueError:
            print(f"Skipping file {file_path}: Missing required columns.")
            continue

        # Remove duplicate drug entries within the file
        df = df.drop_duplicates(subset=["Drugs"])

        # "> 0" is False for NaN, so missing predictions are rejected as well.
        is_valid = df["Predictions"] > 0
        invalid_rows = df.loc[~is_valid, ["Drugs", "Predictions"]]
        for name, prediction in invalid_rows.itertuples(index=False, name=None):
            print(f"Skipping invalid prediction value for {name}: {prediction}")

        per_file_logs.append(
            df.loc[is_valid, ["Drugs"]]
            .rename(columns={"Drugs": "Name"})
            .assign(LogProbSum=np.log(df.loc[is_valid, "Predictions"]))
        )

    if per_file_logs:
        # sort=False keeps drugs in first-seen order; rows whose drug name is
        # missing are dropped by groupby.
        aggregate_df = (
            pd.concat(per_file_logs, ignore_index=True)
            .groupby("Name", sort=False)["LogProbSum"]
            .sum()
            .reset_index()
        )
    else:
        aggregate_df = pd.DataFrame(columns=["Name", "LogProbSum"])

    if aggregate_df.empty:
        print("No valid data was processed.")
        return

    # Sort in ascending order of the summed log-probability
    aggregate_df = aggregate_df.sort_values(by="LogProbSum", ascending=True)

    # Save to CSV file
    aggregate_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

def calculate_known_drug_windsurfing_ranks(prefiltering_folder, full_predictions_folder, exclusion_file, output_file):
    """
    Compute summed placement scores of known drugs among the vendor compounds.

    For every sample that has a prediction file in both folders, the known-drug
    predictions and the vendor-compound predictions are concatenated and sorted
    by ascending "Predictions". The 1-based position of each known drug in that
    list is its placement for the sample; placements are summed per drug over
    all processed samples and written to ``output_file`` with the columns
    "Name" and "Placement_Score", sorted by ascending score.

    Duplicate entries are removed before ranking (by "Drugs" in the
    prefiltering file, by "SMILES" in the full prediction file) so that a drug
    is counted once per sample and is ranked against the same de-duplicated
    compound list that the vendor placement ranking uses.

    Args:
        prefiltering_folder (str): Directory containing prefiltered prediction files
            (columns "Drugs" and "Predictions").
        full_predictions_folder (str): Directory containing full prediction files
            (columns "SMILES" and "Predictions").
        exclusion_file (str): Path to the file containing excluded sample IDs.
            A file is skipped when the sample ID extracted from its name is listed.
        output_file (str): Path to the output file where placement scores will be saved.
    """
    excluded_samples = load_excluded_samples(exclusion_file)
    per_file_ranks = []

    processed_files = 0
    excluded_files = 0

    full_folder_files = set(os.listdir(full_predictions_folder))

    for file_name in os.listdir(prefiltering_folder):
        # Skip non-CSV files explicitly
        if not file_name.endswith(".csv"):
            print(f"Skipping non-CSV file: {file_name}")
            continue

        # Extract sample ID from filename
        sample_id = file_name.replace("predictions_sample_", "").split(".csv")[0]

        # Check if sample ID is in the excluded list
        if sample_id in excluded_samples:
            print(f"Excluding file: {file_name}")
            excluded_files += 1
            continue

        print(f"Checking file: {file_name} in {prefiltering_folder}")

        # The full prediction file must have exactly the same file name
        if file_name not in full_folder_files:
            print(f"Skipping {file_name}: No matching file in full_predictions_folder.\n")
            continue

        try:
            prefilter_df = pd.read_csv(
                os.path.join(prefiltering_folder, file_name), usecols=["Drugs", "Predictions"]
            )
            full_df = pd.read_csv(
                os.path.join(full_predictions_folder, file_name), usecols=["SMILES", "Predictions"]
            )
        except ValueError:
            print(f"Skipping {file_name}: Missing required columns.")
            continue

        # Give both sources a common identifier column and remember which rows
        # are known drugs, so their positions can be read off after sorting.
        known_df = (
            prefilter_df.drop_duplicates(subset=["Drugs"])
            .rename(columns={"Drugs": "Name"})
            .assign(is_known_drug=True)
        )
        vendor_df = (
            full_df.drop_duplicates(subset=["SMILES"])
            .rename(columns={"SMILES": "Name"})
            .assign(is_known_drug=False)
        )

        # Sort all predictions by probability (smaller = higher ranking)
        combined_df = pd.concat([known_df, vendor_df], ignore_index=True)
        combined_df = combined_df.sort_values(by="Predictions", ascending=True).reset_index(drop=True)

        known_rows = combined_df[combined_df["is_known_drug"].astype(bool)]
        per_file_ranks.append(
            pd.DataFrame(
                {
                    "Name": known_rows["Name"].to_numpy(),
                    "Placement_Score": known_rows.index.to_numpy() + 1,  # Rank starts at 1
                }
            )
        )

        processed_files += 1

    print(f"\nProcessed {processed_files} files.")
    print(f"Excluded {excluded_files} files.")

    if per_file_ranks:
        # sort=False keeps drugs in first-seen order; rows whose drug name is
        # missing are dropped by groupby.
        placement_df = (
            pd.concat(per_file_ranks, ignore_index=True)
            .groupby("Name", sort=False)["Placement_Score"]
            .sum()
            .reset_index()
        )
    else:
        placement_df = pd.DataFrame(columns=["Name", "Placement_Score"])

    # Without this guard an empty result would fail in sort_values below.
    if placement_df.empty:
        print("No valid data was processed.")
        return

    placement_df = placement_df.sort_values(by="Placement_Score", ascending=True)

    # Save the placement results
    placement_df.to_csv(output_file, index=False)
    print(f"Placement scores for known drugs saved to {output_file}")


In [None]:
## SELECT AND EXECUTE A RANKING METHOD BASED ON THE CHOSEN STRATEGY ##

# Define input parameters
input_type = "mae_molformer"  # Type of input data (e.g., Morgan fingerprints - Raw Spectra)
method = "log_prob_sum"  # Ranking method to use; options: ["windsurfing", "log_prob_sum"]

# Define file paths for input data, excluded samples, and output ranking file
input_directory = f"klebsiella_resistance_predictions/{input_type}"
excluded_samples_file = f"sample_lists/excluded_samples_{input_type}.txt"
output_file = f"rankings/{method}_ranked_compounds_{input_type}.csv"

# Execute the selected ranking method
if method == "windsurfing":
    # - Ranks the compounds within every sample prediction file
    #   (1 = lowest prediction) and sums each compound's placements.
    # - The output is sorted by ascending placement score, so compounds
    #   with the lowest summed placement come first.
    calculate_windsurfing_ranking(input_directory, excluded_samples_file, output_file)

elif method == "log_prob_sum":
    # - Sums the log of the predicted probabilities of each compound
    #   across the sample prediction files.
    # - The output is sorted by ascending log-probability sum, so compounds
    #   with the lowest summed log-probability come first.
    calculate_log_prob_sum_ranking(input_directory, excluded_samples_file, output_file)

else:
    print("Invalid method specified. Please choose from ['windsurfing', 'log_prob_sum'].")


In [None]:
## COMBINES THE RANKINGS FROM TWO METHODS USING THE COMMON TOP "n" CANDIDATES FROM BOTH ##

input_type = "raw_morgan"

# Define the number of top candidates to consider from each ranking method
n = 1000  # Select the top "n" compounds from both rankings

# Load ranked compound lists from both the log-probability-sum and windsurfing methods
log_prob_comp_df = pd.read_csv(f"rankings/log_prob_sum_ranked_compounds_{input_type}.csv")
wind_comp_df = pd.read_csv(f"rankings/windsurfing_ranked_compounds_{input_type}.csv")

# Retain only the top "n" ranked compounds in both lists.
# .copy() makes the slices independent frames, so adding the ranking columns
# below does not trigger pandas' SettingWithCopyWarning.
log_prob_comp_df = log_prob_comp_df.head(n).copy()
wind_comp_df = wind_comp_df.head(n).copy()

# Assign ranking positions (1 = best) to the selected top "n" compounds in each list
log_prob_comp_df["Log Sum Probability Ranking"] = log_prob_comp_df.index + 1
wind_comp_df["Point Placement Ranking"] = wind_comp_df.index + 1

# Merge both rankings on the compound "Name", keeping only the common compounds.
# Only the rank is taken from the log-probability-sum list; SMILES, descriptors
# and fingerprint come from the windsurfing list, so no duplicated columns arise.
descriptor_cols = ["MW", "ClogP", "HBD", "TPSA", "RotBonds", "Morgan_Fingerprint"]
merged_df = pd.merge(
    log_prob_comp_df[["Name", "Log Sum Probability Ranking"]],
    wind_comp_df[["Name", "SMILES", *descriptor_cols, "Point Placement Ranking"]],
    on="Name",
    how="inner",
)

# Place SMILES as the second column for readability
merged_df = merged_df[
    ["Name", "SMILES", "Log Sum Probability Ranking", *descriptor_cols, "Point Placement Ranking"]
]

# Sort the merged ranking by "Log Sum Probability Ranking" in ascending order
merged_df_sorted = merged_df.sort_values(by="Log Sum Probability Ranking", ascending=True)

# Save the final combined ranking to a CSV file
combined_output_file = f"rankings/combined_ranking_top{n}_{input_type}_log.csv"
merged_df_sorted.to_csv(combined_output_file, index=False)

print(f"Combined ranking of top {n} compounds saved to {combined_output_file}")


In [9]:
## KNOWN DRUG AGGREGATE RESISTANCE PROBABILITY COMPUTATION ##

# Define input type and file paths
input_type = "mae_molformer"
method = "windsurfing"  # Aggregation method to use; options: ["log_prob_sum", "windsurfing"]
input_directory = f"prefiltering_{input_type}"  # Directory containing the known-drug predictions of each sample
full_predictions_folder = f"klebsiella_resistance_predictions/{input_type}"  # Directory containing the vendor-compound predictions for this input_type
output_file = f"klebsiella_resistance_predictions/DRIAMS_drugs_predictions_{input_type}_{method}.csv"  # Output file for aggregated results
excluded_samples_file = f"sample_lists/excluded_samples_{input_type}.txt"  # List of excluded samples

# Aggregate the known-drug predictions with the selected method and save them to a CSV file
if method == "log_prob_sum":
    calculate_known_drug_log_probability_sum_ranks(input_directory, excluded_samples_file, output_file)
elif method == "windsurfing":
    calculate_known_drug_windsurfing_ranks(input_directory, full_predictions_folder, excluded_samples_file, output_file)
else:
    # Report a mistyped method instead of silently doing nothing
    print("Invalid method specified. Please choose from ['log_prob_sum', 'windsurfing'].")

Excluding file: predictions_sample_6c6e9033-8df6-48fa-8651-9efe85fd5a1b_3312.csv
Excluding file: predictions_sample_3b011528-0cfc-4253-84c7-db44375a5aec.csv
Excluding file: predictions_sample_8732d661-3e41-461e-a4b7-20ee29ef3bec_3312.csv
Checking file: predictions_sample_84cabaa1-9c48-4e36-b468-fceeb23f1661.csv in prefiltering_mae_molformer
Checking file: predictions_sample_007893a6-6ff9-4fe9-9b4b-d65fb4f181ca_MALDI2.csv in prefiltering_mae_molformer
Checking file: predictions_sample_2f3275a8-d073-440a-a7c3-dc37ab37a6e6_3313.csv in prefiltering_mae_molformer
Checking file: predictions_sample_070045c0-1cab-43d2-83ac-09a2c5d7dbf8_MALDI1.csv in prefiltering_mae_molformer
Checking file: predictions_sample_10090ede-c4d4-414d-b304-b604642554c0.csv in prefiltering_mae_molformer
Excluding file: predictions_sample_6fe5176f-00df-46c2-91de-93e3ac9c7ae3_MALDI1.csv
Checking file: predictions_sample_42cc25be-48da-42ea-80a4-6664b96e0663_MALDI2.csv in prefiltering_mae_molformer
Excluding file: predict

In [11]:
## RANK IDENTIFICATION FOR THE KNOWN DRIAMS DRUGS ##

# Define input type, ranking method and file paths for the aggregated DRIAMS drug predictions and the ranked compounds
input_type = "mae_molformer"
method = "windsurfing"
file_1 = f'klebsiella_resistance_predictions/DRIAMS_drugs_predictions_{input_type}_{method}.csv'
file_2 = f'rankings/{method}_ranked_compounds_{input_type}.csv'

# Score column of the known-drug file for each supported method
score_columns = {"windsurfing": "Placement_Score", "log_prob_sum": "LogProbSum"}

# Load the two CSV files into DataFrames
df1 = pd.read_csv(file_1)
df2 = pd.read_csv(file_2)

# The vendor log-probability ranking stores its score as "LogSumScore" while the
# known-drug file uses "LogProbSum". Align the names so that both sources share
# one score column; otherwise every vendor row would have a missing score, sort
# last, and the known drugs would only be ranked among themselves.
df2 = df2.rename(columns={"LogSumScore": "LogProbSum"})

# Combine the two DataFrames into one
combined_df = pd.concat([df1, df2], ignore_index=True)

if method in score_columns:
    score_column = score_columns[method]

    # Sort the combined DataFrame by the method's score in ascending order and reset the index
    combined_df = combined_df.sort_values(by=score_column, ascending=True).reset_index(drop=True)

    # Create a new column 'Rank' based on the sorted index (starting at 1)
    combined_df['Rank'] = combined_df.index + 1

    # Filter to retain only the rows corresponding to the drugs in the first file
    df1_with_ranks = combined_df[combined_df['Name'].isin(df1['Name'])]

    # Save the drug names, scores, and ranks to a CSV file
    df1_with_ranks[['Name', score_column, 'Rank']].to_csv(
        f'rankings/known_drug_ranks_{input_type}_{method}.csv', index=False
    )
else:
    print("Invalid method specified. Please choose from ['windsurfing', 'log_prob_sum'].")


In [13]:
## KNOWN DRUG SPEARMAN CORRELATION COMPUTATION ##

import pandas as pd

input_type = "mae_molformer"
method = "windsurfing"

# Per-drug response statistics and the ranks of the known drugs
percentage = pd.read_csv("klebsiella_data/klebsiella_response_stats.csv")
proba = pd.read_csv(f"rankings/known_drug_ranks_{input_type}_{method}.csv")

# Use method-independent column names (the score column is named differently per method)
proba.columns = ["Name", "Score", "Rank"]

# Join the two tables on the drug name
data = percentage.merge(proba, on="Name")

# Quality control: keep only drugs with more than 100 distinct samples
has_enough_samples = data["distinct_sample_count"] > 100
data = data[has_enough_samples]

# Spearman rank correlation between the resistant-sample percentage and the drug rank
correlation_rank = data["percentage_1_responses"].corr(data["Rank"], method="spearman")
print(f"Spearman correlation between percentage_1_responses and Rank: {correlation_rank}")

Spearman correlation between percentage_1_responses and Rank: 0.6939582156973463
