This script was written for Python 3.13.
Matching script
✅Match the experimental spectra to the in-silico generated spectra (1:m, i.e. one-to-many matching)
✅Score every pair using cosine similarity
✅Apply a dual mass-tolerance window to accommodate the higher mass error of fragments below 100 Da (10 ppm), while 5 ppm is used for higher masses
✅Exclude peaks matching the precursor m/z within 10 ppm from similarity scoring to avoid false positives
✅Keep only hits above threshold (e.g. > 0.5)
✅Output results to CSV for inspection & prioritization


In [None]:
from matchms.importing import load_from_msp
from matchms.similarity import CosineGreedy
import pandas as pd
import os
import re

In [None]:
# === File paths (positive mode) ===
# Experimental MS2 spectra (MSP); loaded into `experimental` by the load cell.
exp_file = "Extracted_MS2_Spectra_PosMode.msp"
# Predicted spectra (MSP); loaded into `predicted` by the load cell.
cfmid_file = "cfmid_combined_all_energies.msp"
# Destination CSV for every match that passes the score threshold.
output_csv = "Spectral_Matches_PosMode_vs_online_predicted_allenergies.csv"

In [None]:
# === File paths, negative mode (NI) — uncomment to use instead of the positive-mode paths ===
#exp_file = "Extracted_MS2_Spectra_NegMode.msp"
#cfmid_file = "cfmid_predicted_NI_20eV.msp"
#output_csv = "Spectral_Matches_NegMode_NI_20eV.csv"

In [None]:
# === Load spectra ===
# Materialise both MSP readers so the spectra can be counted and iterated
# repeatedly by the matching loop.
experimental = [*load_from_msp(exp_file)]
predicted = [*load_from_msp(cfmid_file)]

print(f"✅ Loaded {len(experimental)} experimental and {len(predicted)} predicted spectra")

In [None]:
def cosine_similarity_ppm(spec1, spec2, ppm_tol=5, ppm_tol_low=10, mz_cutoff=100,
                          precursor_tol=10):
    """Cosine similarity between two spectra using ppm-based peak matching.

    Peaks are paired greedily in one ascending-m/z pass over both spectra.
    The matching tolerance is relative to the m/z of the ``spec1`` peak:
    ``ppm_tol_low`` below ``mz_cutoff`` and ``ppm_tol`` at or above it.
    A matched pair in which either peak lies within ``precursor_tol`` ppm of
    a precursor m/z is consumed but not scored.

    Args:
        spec1: Spectrum-like object exposing ``mz`` and ``intensities``
            sequences and a ``get("precursor_mz")`` accessor.
        spec2: Second spectrum, same interface as ``spec1``.
        ppm_tol: Fragment tolerance in ppm for peaks at or above ``mz_cutoff``.
        ppm_tol_low: Fragment tolerance in ppm for peaks below ``mz_cutoff``.
        mz_cutoff: m/z boundary between the two tolerance regimes.
        precursor_tol: Half-width in ppm of the exclusion window around each
            precursor m/z.

    Returns:
        Tuple ``(score, n_matched)``: the cosine of the matched intensity
        vectors and the number of scored peak pairs. ``(0.0, 0)`` is returned
        when nothing matches or either spectrum has no peaks.

    Note:
        The score is computed over the matched pairs only, so unmatched peaks
        do not lower it and a single matched pair with non-zero intensities
        always scores 1.0.
    """
    import numpy as np  # local import: numpy is not imported at file level

    mz1 = np.asarray(spec1.mz, dtype=float)
    intens1 = np.asarray(spec1.intensities, dtype=float)
    mz2 = np.asarray(spec2.mz, dtype=float)
    intens2 = np.asarray(spec2.intensities, dtype=float)

    # An empty spectrum cannot match anything (and .max() would raise on it).
    if mz1.size == 0 or mz2.size == 0:
        return 0.0, 0

    # Base-peak normalisation; an all-zero spectrum is left untouched.
    if intens1.max() > 0:
        intens1 = intens1 / intens1.max()
    if intens2.max() > 0:
        intens2 = intens2 / intens2.max()

    # The two-pointer walk below is only correct on ascending m/z.
    order1 = np.argsort(mz1, kind="stable")
    order2 = np.argsort(mz2, kind="stable")
    mz1, intens1 = mz1[order1], intens1[order1]
    mz2, intens2 = mz2[order2], intens2[order2]

    # Every precursor m/z that is known; peaks near any of them are excluded.
    precursors = [
        p for p in (spec1.get("precursor_mz"), spec2.get("precursor_mz")) if p
    ]

    def _is_precursor_peak(mz_value):
        return any(
            abs(mz_value - p) / p * 1e6 <= precursor_tol for p in precursors
        )

    i, j = 0, 0
    matched1, matched2 = [], []

    while i < len(mz1) and j < len(mz2):
        mz_val = mz1[i]
        current_tol = ppm_tol_low if mz_val < mz_cutoff else ppm_tol
        ppm_diff = abs(mz_val - mz2[j]) / mz_val * 1e6

        if ppm_diff <= current_tol:
            # Precursor pairs are skipped but still consume both peaks.
            if not (_is_precursor_peak(mz_val) or _is_precursor_peak(mz2[j])):
                matched1.append(intens1[i])
                matched2.append(intens2[j])
            i += 1
            j += 1
        elif mz_val < mz2[j]:
            i += 1
        else:
            j += 1

    if not matched1:
        return 0.0, 0

    norm1 = np.linalg.norm(matched1)
    norm2 = np.linalg.norm(matched2)
    if norm1 == 0 or norm2 == 0:
        return 0.0, len(matched1)

    return float(np.dot(matched1, matched2) / (norm1 * norm2)), len(matched1)

In [None]:
# === Match spectra using PPM-based cosine ===
precursor_ppm = 5  # max precursor m/z difference (ppm) for a pair to be scored
fragment_ppm = 5  # fragment tolerance (ppm), passed to the scorer as ppm_tol
ppm_tol_low = 10  # for fragments < 100 m/z

# Look up each predicted precursor once rather than once per experimental
# spectrum; predicted spectra without a precursor m/z are never candidates.
candidates = [
    (pred_spec.get("precursor_mz"), pred_spec) for pred_spec in predicted
]
candidates = [(mz, spec) for mz, spec in candidates if mz is not None]

results = []

for exp_spec in experimental:
    mz_exp = exp_spec.get("precursor_mz")
    if not mz_exp:  # missing, or 0 (which would divide by zero below)
        continue

    for mz_pred, pred_spec in candidates:
        # Step 1: precursor filter
        ppm_diff = abs(mz_exp - mz_pred) / mz_exp * 1e6
        if ppm_diff > precursor_ppm:
            continue

        # Step 2: compare fragments using cosine with ppm tolerance.
        # Both tolerances are passed explicitly so that editing either
        # setting above actually takes effect.
        score, n_matches = cosine_similarity_ppm(
            exp_spec, pred_spec, ppm_tol=fragment_ppm, ppm_tol_low=ppm_tol_low
        )
        if score > 0.5:
            results.append({
                "Feature_ID": exp_spec.get("feature_id"),
                "Experimental_mz": mz_exp,
                "RT_min": exp_spec.get("retention_time"),
                "Predicted_Name": pred_spec.get("name") or pred_spec.get("compound_name"),
                "Predicted_mz": mz_pred,
                "SMILES": pred_spec.get("smiles"),
                "Cosine_Score": round(score, 4),
                "Num_Matching_Peaks": n_matches
            })


In [None]:
# === Save to CSV ===
# Naming the columns explicitly keeps the frame well-formed when no pair
# passed the threshold; otherwise an empty `results` list gives a frame with
# no columns and the sort below raises KeyError.
result_columns = [
    "Feature_ID", "Experimental_mz", "RT_min", "Predicted_Name",
    "Predicted_mz", "SMILES", "Cosine_Score", "Num_Matching_Peaks",
]
df = pd.DataFrame(results, columns=result_columns)
df.sort_values("Cosine_Score", ascending=False, inplace=True)
df.to_csv(output_csv, index=False)

print(f"✅ Done! Found {len(df)} matches with cosine > 0.5 at {precursor_ppm} ppm tolerance")
print(f"📁 Results saved to: {output_csv}")

In [None]:
# === Keep only the best match per Feature_ID based on NUM_MATCHING_PEAKS ===
# Rank on both keys *before* de-duplicating. Sorting on the peak count alone
# uses pandas' default (unstable) quicksort, so among candidates with the same
# number of matched peaks the one kept per feature was not guaranteed to be
# the one with the highest cosine score.
best_matches_df_online = df.sort_values(
    ["Num_Matching_Peaks", "Cosine_Score"], ascending=[False, False]
).drop_duplicates("Feature_ID")

# ❌ Remove meaningless perfect matches with only 1 peak
best_matches_df_online = best_matches_df_online[
    ~((best_matches_df_online["Num_Matching_Peaks"] == 1) & (best_matches_df_online["Cosine_Score"] == 1.0))
]

# 🧹 Remove duplicate predicted compounds (rows are still in ranked order,
# so the strongest feature for each compound is the one kept)
best_matches_df_online = best_matches_df_online.drop_duplicates(subset="Predicted_Name", keep="first")

# 💾 Save to CSV
best_matches_df_online.to_csv("Top_Matches_By_NumPeaks_PosMode_ppm5_online.csv", index=False)

print(f"✅ Final curated matches: {len(best_matches_df_online)} unique features + predicted compounds retained.")


In [None]:
###### Next step is Retip, which I will be running in R.
# Retip needs a training set of known compounds with their respective retention times (RT) to build the model, plus the suspect-screening results, for which it predicts the RT from the proposed structure.
# Prepare the data for Retip as follows.

In [None]:
import pandas as pd

# === Load files ===
matches_file = "Top_Matches_By_NumPeaks_PosMode_ppm5_online.csv"
cfmid_file = "Matched_Suspects_PositiveMode_with_CFMID_ID.csv"

# Read both join keys as text so numeric-looking IDs are not re-typed by
# pandas (e.g. 12 becoming 12.0 when the column also contains blanks).
matches_df = pd.read_csv(matches_file, dtype={"Predicted_Name": str})
cfmid_df = pd.read_csv(cfmid_file, dtype={"CFMID_ID": str})

# === Standardize key columns ===
# Suspects without a CFMID_ID can never be a valid join target. Dropping them
# first also stops astype(str) from turning the blank into the literal key
# "nan", which would join to any match whose name is missing.
cfmid_df = cfmid_df.dropna(subset=["CFMID_ID"]).copy()
cfmid_df["CFMID_ID"] = cfmid_df["CFMID_ID"].astype(str).str.strip()
matches_df["Predicted_Name"] = matches_df["Predicted_Name"].astype(str).str.strip()

# One row per (ID, SMILES) pair: identical repeated rows in the suspect list
# would otherwise duplicate the matched feature in the output.
smiles_lookup = cfmid_df[["CFMID_ID", "Canonical_SMILES"]].drop_duplicates()

# === Merge on CFMID ID ===
merged = matches_df.merge(
    smiles_lookup,
    left_on="Predicted_Name",
    right_on="CFMID_ID",
    how="left"
)

# Optional: move Canonical_SMILES column next to SMILES for clarity
cols = list(merged.columns)
if "SMILES" in cols and "Canonical_SMILES" in cols:
    smi_idx = cols.index("SMILES")
    cols.insert(smi_idx + 1, cols.pop(cols.index("Canonical_SMILES")))
    merged = merged[cols]

# === Save result ===
merged.to_csv("Top_Matches_Annotated_PosMode.csv", index=False)

print(f"✅ Done! Canonical SMILES added. Final shape: {merged.shape}")
print("📁 Saved to: Top_Matches_Annotated_PosMode.csv")


In [None]:
# Scoring/bookkeeping columns that are not part of the Retip input.
columns_to_drop = [
    "Predicted_Name", "Predicted_mz", "SMILES", "Cosine_Score", "Num_Matching_Peaks"
]

# Read the annotated matches, remove the columns above (any that are absent
# are ignored) and expose the canonical structure under the name "SMILES".
df = (
    pd.read_csv("Top_Matches_Annotated_PosMode.csv")
    .drop(columns=columns_to_drop, errors="ignore")
    .rename(columns={"Canonical_SMILES": "SMILES"})
)

# Write the suspect list consumed by the Retip step.
df.to_csv("suspect_for_retip_online.csv", index=False)

print(f"✅ Saved suspect list for Retip prediction: {df.shape[0]} entries")
print("📁 File: suspect_for_retip_online.csv")


In [None]:
# Move to RStudio to predict retention times. Bring back a CSV file containing only the outliers.

In [None]:
import pandas as pd

# Annotated matches and the RT outliers reported by the R step.
top_matches = pd.read_csv("Top_Matches_Annotated_PosMode.csv")
rt_outliers = pd.read_csv("RT_Outliers_XGB_online.csv")

# Flag every match whose Feature_ID appears in the outlier table, then keep
# the rest.
is_outlier = top_matches["Feature_ID"].isin(rt_outliers["Feature_ID"])
filtered = top_matches.loc[~is_outlier]

# Persist the surviving matches.
filtered.to_csv("Top_Matches_Annotated_PosMode_filtered.csv", index=False)

print(f"✅ Filtered matches saved — {len(filtered)} entries remaining after removing RT outliers.")

In [None]:
# Combine all info
# Load data. CFMID_ID is read as text on both sides so the join keys share one
# dtype; with an inferred numeric dtype on the left and str on the right,
# pandas refuses to merge.
filtered_matches = pd.read_csv(
    "Top_Matches_Annotated_PosMode_filtered.csv", dtype={"CFMID_ID": str}
)
meta = pd.read_csv("Matched_Suspects_PositiveMode_with_CFMID_ID.csv", dtype=str)

# Normalise whitespace on both keys (the stored match keys were stripped when
# the annotated file was built, so raw metadata keys must be stripped too).
filtered_matches["CFMID_ID"] = filtered_matches["CFMID_ID"].str.strip()
meta["CFMID_ID"] = meta["CFMID_ID"].str.strip()

# Metadata rows without an ID are dropped: merge pairs null keys with each
# other, which would attach unrelated suspects to un-annotated matches.
# Exact duplicate rows are dropped so they do not duplicate matches.
suspect_meta = (
    meta[["CFMID_ID", "Suspect Name", "Function", "Source", "Formula"]]
    .dropna(subset=["CFMID_ID"])
    .drop_duplicates()
)

# Merge desired metadata fields
merged = pd.merge(
    filtered_matches,
    suspect_meta,
    on="CFMID_ID",
    how="left"
)

# Save the enriched result
merged.to_csv("Top_Matches_Annotated_PosMode_filtered_enriched.csv", index=False)

print(f"✅ Enriched matches saved — {len(merged)} entries with additional suspect metadata.")


In [None]:
# === File paths NI (negative mode) ===
# Rebinds the path variables set earlier, so the load/match/save cells that
# follow operate on the negative-mode data.
# Experimental MS2 spectra (MSP).
exp_file = "Extracted_MS2_Spectra_NegMode.msp"
# Predicted spectra (MSP).
cfmid_file = "cfmid_predicted_NI_20eV.msp"
# Destination CSV for every match that passes the score threshold.
output_csv = "Spectral_Matches_NegMode_NI_20eV.csv"

In [None]:
# === Load spectra ===
# Materialise both MSP readers so the spectra can be counted and iterated
# repeatedly by the matching loop.
experimental = [*load_from_msp(exp_file)]
predicted = [*load_from_msp(cfmid_file)]

print(f"✅ Loaded {len(experimental)} experimental and {len(predicted)} predicted spectra")

In [None]:
# === Match spectra using PPM-based cosine ===
precursor_ppm = 5  # max precursor m/z difference (ppm) for a pair to be scored
fragment_ppm = 5  # fragment tolerance (ppm), passed to the scorer as ppm_tol
ppm_tol_low = 10  # for fragments < 100 m/z

# Look up each predicted precursor once rather than once per experimental
# spectrum; predicted spectra without a precursor m/z are never candidates.
candidates = [
    (pred_spec.get("precursor_mz"), pred_spec) for pred_spec in predicted
]
candidates = [(mz, spec) for mz, spec in candidates if mz is not None]

results = []

for exp_spec in experimental:
    mz_exp = exp_spec.get("precursor_mz")
    if not mz_exp:  # missing, or 0 (which would divide by zero below)
        continue

    for mz_pred, pred_spec in candidates:
        # Step 1: precursor filter
        ppm_diff = abs(mz_exp - mz_pred) / mz_exp * 1e6
        if ppm_diff > precursor_ppm:
            continue

        # Step 2: compare fragments using cosine with ppm tolerance.
        # Both tolerances are passed explicitly so that editing either
        # setting above actually takes effect.
        score, n_matches = cosine_similarity_ppm(
            exp_spec, pred_spec, ppm_tol=fragment_ppm, ppm_tol_low=ppm_tol_low
        )
        if score > 0.5:
            results.append({
                "Feature_ID": exp_spec.get("feature_id"),
                "Experimental_mz": mz_exp,
                "RT_min": exp_spec.get("retention_time"),
                "Predicted_Name": pred_spec.get("name") or pred_spec.get("compound_name"),
                "Predicted_mz": mz_pred,
                "SMILES": pred_spec.get("smiles"),
                "Cosine_Score": round(score, 4),
                "Num_Matching_Peaks": n_matches
            })


In [None]:
# === Save to CSV ===
# Naming the columns explicitly keeps the frame well-formed when no pair
# passed the threshold; otherwise an empty `results` list gives a frame with
# no columns and the sort below raises KeyError.
result_columns = [
    "Feature_ID", "Experimental_mz", "RT_min", "Predicted_Name",
    "Predicted_mz", "SMILES", "Cosine_Score", "Num_Matching_Peaks",
]
df = pd.DataFrame(results, columns=result_columns)
df.sort_values("Cosine_Score", ascending=False, inplace=True)
df.to_csv(output_csv, index=False)

print(f"✅ Done! Found {len(df)} matches with cosine > 0.5 at {precursor_ppm} ppm tolerance")
print(f"📁 Results saved to: {output_csv}")

In [None]:
# === Keep only the best match per Feature_ID based on NUM_MATCHING_PEAKS ===
# Rank on both keys *before* de-duplicating. Sorting on the peak count alone
# uses pandas' default (unstable) quicksort, so among candidates with the same
# number of matched peaks the one kept per feature was not guaranteed to be
# the one with the highest cosine score.
best_matches_df_online = df.sort_values(
    ["Num_Matching_Peaks", "Cosine_Score"], ascending=[False, False]
).drop_duplicates("Feature_ID")

# ❌ Remove meaningless perfect matches with only 1 peak
best_matches_df_online = best_matches_df_online[
    ~((best_matches_df_online["Num_Matching_Peaks"] == 1) & (best_matches_df_online["Cosine_Score"] == 1.0))
]

# 🧹 Remove duplicate predicted compounds (rows are still in ranked order,
# so the strongest feature for each compound is the one kept)
best_matches_df_online = best_matches_df_online.drop_duplicates(subset="Predicted_Name", keep="first")

# 💾 Save to CSV
best_matches_df_online.to_csv("Top_Matches_By_NumPeaks_NegMode_ppm5.csv", index=False)

print(f"✅ Final curated matches: {len(best_matches_df_online)} unique features + predicted compounds retained.")


In [None]:
import pandas as pd

# === Load files ===
matches_file = "Top_Matches_By_NumPeaks_NegMode_ppm5.csv"
cfmid_file = "Matched_Suspects_NegativeMode_with_CFMID_ID.csv"

# Read both join keys as text so numeric-looking IDs are not re-typed by
# pandas (e.g. 12 becoming 12.0 when the column also contains blanks).
matches_df = pd.read_csv(matches_file, dtype={"Predicted_Name": str})
cfmid_df = pd.read_csv(cfmid_file, dtype={"CFMID_ID": str})

# === Standardize key columns ===
# Suspects without a CFMID_ID can never be a valid join target. Dropping them
# first also stops astype(str) from turning the blank into the literal key
# "nan", which would join to any match whose name is missing.
cfmid_df = cfmid_df.dropna(subset=["CFMID_ID"]).copy()
cfmid_df["CFMID_ID"] = cfmid_df["CFMID_ID"].astype(str).str.strip()
matches_df["Predicted_Name"] = matches_df["Predicted_Name"].astype(str).str.strip()

# One row per (ID, SMILES) pair: identical repeated rows in the suspect list
# would otherwise duplicate the matched feature in the output.
smiles_lookup = cfmid_df[["CFMID_ID", "Canonical_SMILES"]].drop_duplicates()

# === Merge on CFMID ID ===
merged = matches_df.merge(
    smiles_lookup,
    left_on="Predicted_Name",
    right_on="CFMID_ID",
    how="left"
)

# Optional: move Canonical_SMILES column next to SMILES for clarity
cols = list(merged.columns)
if "SMILES" in cols and "Canonical_SMILES" in cols:
    smi_idx = cols.index("SMILES")
    cols.insert(smi_idx + 1, cols.pop(cols.index("Canonical_SMILES")))
    merged = merged[cols]

# === Save result ===
merged.to_csv("Top_Matches_Annotated_NegMode.csv", index=False)

print(f"✅ Done! Canonical SMILES added. Final shape: {merged.shape}")
print("📁 Saved to: Top_Matches_Annotated_NegMode.csv")


In [None]:
# Scoring/bookkeeping columns that are not part of the Retip input.
columns_to_drop = [
    "Predicted_Name", "Predicted_mz", "SMILES", "Cosine_Score", "Num_Matching_Peaks"
]

# Read the annotated matches, remove the columns above (any that are absent
# are ignored) and expose the canonical structure under the name "SMILES".
df = (
    pd.read_csv("Top_Matches_Annotated_NegMode.csv")
    .drop(columns=columns_to_drop, errors="ignore")
    .rename(columns={"Canonical_SMILES": "SMILES"})
)

# Write the suspect list consumed by the Retip step.
df.to_csv("suspect_for_retip_NI.csv", index=False)

print(f"✅ Saved suspect list for Retip prediction: {df.shape[0]} entries")
print("📁 File: suspect_for_retip_NI.csv")


In [None]:
import pandas as pd

# Annotated matches and the RT outliers reported by the R step.
top_matches = pd.read_csv("Top_Matches_Annotated_NegMode.csv")
rt_outliers = pd.read_csv("RT_Outliers_XGB_NI.csv")

# Flag every match whose Feature_ID appears in the outlier table, then keep
# the rest.
is_outlier = top_matches["Feature_ID"].isin(rt_outliers["Feature_ID"])
filtered = top_matches.loc[~is_outlier]

# Persist the surviving matches.
filtered.to_csv("Top_Matches_Annotated_NegMode_filtered.csv", index=False)

print(f"✅ Filtered matches saved — {len(filtered)} entries remaining after removing RT outliers.")

In [None]:
# Combine all info
# Load data. CFMID_ID is read as text on both sides so the join keys share one
# dtype; with an inferred numeric dtype on the left and str on the right,
# pandas refuses to merge.
filtered_matches = pd.read_csv(
    "Top_Matches_Annotated_NegMode_filtered.csv", dtype={"CFMID_ID": str}
)
meta = pd.read_csv("Matched_Suspects_NegativeMode_with_CFMID_ID.csv", dtype=str)

# Normalise whitespace on both keys (the stored match keys were stripped when
# the annotated file was built, so raw metadata keys must be stripped too).
filtered_matches["CFMID_ID"] = filtered_matches["CFMID_ID"].str.strip()
meta["CFMID_ID"] = meta["CFMID_ID"].str.strip()

# Metadata rows without an ID are dropped: merge pairs null keys with each
# other, which would attach unrelated suspects to un-annotated matches.
# Exact duplicate rows are dropped so they do not duplicate matches.
suspect_meta = (
    meta[["CFMID_ID", "Suspect Name", "Function", "Source", "Formula"]]
    .dropna(subset=["CFMID_ID"])
    .drop_duplicates()
)

# Merge desired metadata fields
merged = pd.merge(
    filtered_matches,
    suspect_meta,
    on="CFMID_ID",
    how="left"
)

# Save the enriched result
merged.to_csv("Top_Matches_Annotated_NegMode_filtered_enriched.csv", index=False)

print(f"✅ Enriched matches saved — {len(merged)} entries with additional suspect metadata.")


In [None]:
import pandas as pd

# === Confirmed isotopic-profile IDs, as strings so they compare against the
# string-typed Feature_ID columns below ===
confirmed_df = pd.read_excel("NIAS_2b_IP_confirmed.xlsx")
confirmed_ids = confirmed_df["ID"].astype(str).tolist()

# === Positive- and negative-mode match tables, every column read as text ===
pos_df = pd.read_csv("Top_Matches_Annotated_PosMode_filtered_enriched_deduplicated_ToxTree.csv", dtype=str)
neg_df = pd.read_csv("Top_Matches_Annotated_NegMode_filtered_enriched_deduplicated_ToxTree.csv", dtype=str)

# === Keep only rows whose Feature_ID is in the confirmed list ===
pos_is_confirmed = pos_df["Feature_ID"].isin(confirmed_ids)
neg_is_confirmed = neg_df["Feature_ID"].isin(confirmed_ids)
filtered_pos = pos_df.loc[pos_is_confirmed]
filtered_neg = neg_df.loc[neg_is_confirmed]

# === Stack positive rows above negative rows under a fresh index ===
combined_df = pd.concat([filtered_pos, filtered_neg], ignore_index=True)

# === Write the combined table ===
combined_df.to_csv("NIAS_2b_Confirmed_Annotated.csv", index=False)

print(f"✅ Saved {len(combined_df)} confirmed 2b matches to 'NIAS_2b_Confirmed_Annotated.csv'")


In [None]:

# Confirmed intensity data (from TraceFinder), with the ID column cast to str
# so it can be joined against the text-typed Feature_ID.
area_df = pd.read_excel("NIAS_2b_IP_confirmed.xlsx").astype({"ID": str})

# Annotated matches, all columns read as text; Feature_ID is cast to str again
# so missing values become the same literal on both sides of the join.
annotated_df = pd.read_csv("NIAS_2b_Confirmed_Annotated.csv", dtype=str).astype(
    {"Feature_ID": str}
)

# Left-join the areas onto the annotations (Feature_ID <-> ID), then discard
# the right-hand key column, which only repeats Feature_ID for matched rows.
merged_df = pd.merge(
    annotated_df,
    area_df,
    how="left",
    left_on="Feature_ID",
    right_on="ID",
).drop(columns=["ID"])

# Write the complete table.
merged_df.to_csv("NIAS_2b_Confirmed_Annotated_With_areas.csv", index=False)

print(f"✅ Merged intensities for {len(merged_df)} confirmed 2b compounds.")
