This script is built to run in Python 3.13.
Step 1: Identify Relevant Databases
Since we are working with food packaging materials, we need suspect lists related to:

Plastics and additives

Contaminants and migrants

Food contact chemicals (FCCs)

Here is the list of the compiled databases
 1. CPPdb ListA Mapped 06032019
 2. CPPdb ListB Mapped 06032019
 3. ECHA PlasticAdditivesInitiative 06032019
 4. FOODCONTACTSDB FCCdb FINAL LIST V5 WStructures
 5. FOODPLASTICS 2025 03 31
 6. PLASTICMAP 2025 03 31
 7. S112 FCCMIGEX
 8. S117 PFASFCCDB
 9. S118 PFASFCCMIGEX

Step 2: Curate the Suspect Lists
Standardization pipeline
Input columns needed:

#Compound name

#SMILES

#InChIKey

#CAS number (optional)

#Molecular weight

Remove problematic entries:
  Mixtures (e.g. multiple SMILES per entry)
  Salts or poorly defined substances (you can standardize salts to neutral molecules)
  Very large or inorganic compounds (filter MW to e.g. 100–1000 Da)

Add metadata like functional use (plasticizer, monomer, photoinitiator, etc.) from the original sources.

In [None]:
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, SaltRemover
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import rdMolDescriptors

In [None]:
# === Configure your file paths ===
# Suspect-list source files, one per compiled database listed above.
# The entries are bare file names, so they are resolved against the current
# working directory. The loading cell dispatches on the extension:
# .csv/.txt go through pd.read_csv, .xls/.xlsx through pd.read_excel.
input_files = [
    "CPPdb_ListA_Mapped_06032019.xlsx",
    "CPPdb_ListB_Mapped_06032019.xlsx",
    "ECHA_PlasticAdditivesInitiative_06032019.csv",
    "FOODCONTACTSDB_FCCdb_FINAL_LIST_v5_wStructures.csv",
    "FOODPLASTICS-2025-03-31.csv",
    "PLASTICMAP-2025-03-31.csv",
    "S112_FCCMIGEX.csv",
    "S117_PFASFCCDB.xlsx",
    "S118_PFASFCCMIGEX.csv"
]

In [None]:
# === Load files ===
# Read every configured source into its own DataFrame (all columns as text,
# dtype=str) and tag each row with the file stem it came from ("Source").
all_dfs = []
for file in input_files:
    ext = os.path.splitext(file)[-1].lower()
    if ext in (".csv", ".txt"):
        # "warn" still skips malformed rows, but reports them instead of
        # dropping data without a trace.
        df = pd.read_csv(file, dtype=str, encoding="utf-8", on_bad_lines="warn")
    elif ext in (".xls", ".xlsx"):
        df = pd.read_excel(file, dtype=str)
    else:
        # Make it visible that a configured file contributed nothing.
        print(f"⚠️ Skipping '{file}': unsupported extension '{ext}'")
        continue
    df["Source"] = os.path.splitext(os.path.basename(file))[0]
    all_dfs.append(df)
    print(f"📄 Loaded {len(df)} rows from '{file}'")

In [None]:
# === Merge and extract relevant fields ===
# Flatten all source tables into one list of dicts with a common schema:
# SMILES, Name, Function, Source.

# Columns that may carry functional-use text; whichever are present and
# non-blank in a row are joined with "; ".
FUNCTION_COLUMNS = (
    "Function",
    "FunctionalUse",
    "Description, \nfunction",
    "Other function in plastic",
    "Use \nlevels and migration potential",
)


def _first_text(*candidates):
    """Return the first candidate that is a non-blank string (stripped), else None.

    Missing cells arrive as NaN, which is truthy, so a plain ``a or b`` chain
    would stop at a NaN and never reach the fallback column.
    """
    for value in candidates:
        if isinstance(value, str) and value.strip():
            return value.strip()
    return None


all_entries = []
for df in all_dfs:
    for _, row in df.iterrows():
        # SMILES is read from the "SMILES" column only.
        smiles = row.get("SMILES")
        if not isinstance(smiles, str) or "." in smiles:
            continue  # Skip missing values and multi-fragment SMILES (mixtures/salts)
        smiles = smiles.strip()
        if not smiles:
            continue  # Whitespace-only cell: nothing to parse later

        # Name / Preferred Name (column label differs between sources)
        name = _first_text(row.get("PREFERRED NAME"), row.get("Preferred_Name"))

        # Functional use (if available)
        function = "; ".join(
            text.strip()
            for text in (row.get(column) for column in FUNCTION_COLUMNS)
            if isinstance(text, str) and text.strip()
        )

        all_entries.append({
            "SMILES": smiles,
            "Name": name,
            "Function": function,
            "Source": row.get("Source")
        })

In [None]:
# Snapshot the merged, still uncurated entries to disk.
_raw_snapshot = pd.DataFrame(all_entries)
_raw_snapshot.to_csv("Raw_Suspect_List.csv", index=False)

In [None]:
# === Deduplicate raw SMILES before RDKit processing ===
print(f"🔍 Total entries before raw SMILES deduplication: {len(all_entries)}")

# Round-trip through a DataFrame: keep the first record seen for each raw
# SMILES string, then go back to a list of dicts for the RDKit loop.
df_raw = pd.DataFrame(all_entries).drop_duplicates(subset="SMILES", keep="first")
all_entries = df_raw.to_dict(orient="records")

print(f"✅ Entries after removing duplicate SMILES: {len(all_entries)}")

In [None]:
# Count the entries whose SMILES still contains the "." fragment separator.
raw_mixture_count = len([entry for entry in all_entries if "." in entry["SMILES"]])
print(f"🧼 SMILES containing mixtures (before RDKit): {raw_mixture_count}")

In [None]:
# === RDKit processing ===
# Standardise every raw SMILES (largest fragment, RDKit normalisation,
# neutralised charges), keep structures whose exact mass is within 100-1000,
# and de-duplicate on the canonical SMILES of the standardised structure.
processed = []
unique_smiles = set()
remover = SaltRemover.SaltRemover()  # NOTE(review): instantiated but never applied in this cell
uncharger = rdMolStandardize.Uncharger()
#tautomer_canon = rdMolStandardize.TautomerCanonicalizer()

# Bookkeeping so that dropped entries are reported instead of vanishing silently.
n_unparsable = 0
n_outside_mw = 0
n_duplicate = 0
failed_entries = []  # (SMILES, error text) for entries RDKit raised on

for entry in all_entries:
    smiles = entry["SMILES"]
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            n_unparsable += 1
            continue

        # Handle mixtures: keep only the fragment with the most atoms
        frags = Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=True)
        if len(frags) > 1:
            mol = max(frags, key=lambda m: m.GetNumAtoms())

        # Normalize and neutralize (tautomer canonicalization is disabled)
        mol = rdMolStandardize.Normalize(mol)
        mol = uncharger.uncharge(mol)
        #mol = tautomer_canon.canonicalize(mol)

        # Mass filtering; the value is stored under "MW" and is what the
        # adduct m/z columns are later derived from.
        mw = Descriptors.ExactMolWt(mol)
        if not (100 <= mw <= 1000):
            n_outside_mw += 1
            continue

        # Canonical SMILES is the de-duplication key
        can_smiles = Chem.MolToSmiles(mol, canonical=True)
        if can_smiles in unique_smiles:
            n_duplicate += 1
            continue
        unique_smiles.add(can_smiles)

        # Get molecular formula
        formula = rdMolDescriptors.CalcMolFormula(mol)

        # Build a new record instead of mutating the dict held in all_entries.
        processed.append({
            **entry,
            "Canonical_SMILES": can_smiles,
            "MW": mw,
            "Formula": formula
        })
    except Exception as exc:
        # Best effort: one bad structure must not stop the run, but keep a
        # record of it so the loss is visible afterwards.
        failed_entries.append((smiles, repr(exc)))
        continue

# === Save final output ===
cleaned_df = pd.DataFrame(processed)

# Drop duplicates based on Canonical SMILES, just in case
before_dedup = len(cleaned_df)
if "Canonical_SMILES" in cleaned_df.columns:  # column is absent when nothing survived
    cleaned_df = cleaned_df.drop_duplicates(subset="Canonical_SMILES", keep="first")
after_dedup = len(cleaned_df)

cleaned_df.to_csv("Curated_Suspect_List.csv", index=False)

print(f"✅ Done. Final suspect list contains {after_dedup} unique entries.")
print(f"🧹 Duplicates removed after processing: {before_dedup - after_dedup}")
print(
    f"ℹ️ Dropped during curation: {n_unparsable} unparsable SMILES, "
    f"{n_outside_mw} outside 100–1000 Da, {n_duplicate} duplicate structures, "
    f"{len(failed_entries)} RDKit errors"
)
for bad_smiles, error_text in failed_entries[:10]:
    print(f"   ⚠️ {bad_smiles}: {error_text}")

In [None]:
# Generate accurate mass lists for MS matching

#Add the monoisotopic m/z values for the most common adducts:

#[M+H]+	MW + 1.007276	Positive mode
#[M+Na]+	MW + 22.989218	Positive mode
#[M-H]-	MW - 1.007276	Negative mode

In [None]:
# === Add monoisotopic m/z values for common adducts ===
H_mass = 1.007276
Na_mass = 22.989218

# Column label -> mass shift applied to the neutral mass stored in "MW".
_adduct_shifts = {
    "[M+H]+": H_mass,
    "[M+Na]+": Na_mass,
    "[M-H]-": -H_mass,
}

# One new column per adduct, rounded to 4 decimals
for _label, _shift in _adduct_shifts.items():
    cleaned_df[_label] = (cleaned_df["MW"].astype(float) + _shift).round(4)

# Save final list for MS matching
cleaned_df.to_csv("Curated_Suspect_List_with_Adducts.csv", index=False)
print("✅ Saved: Curated_Suspect_List_with_Adducts.csv")


In [None]:
#Load the MS-DIAL positive-ionization results and the curated suspect list, then:

#   Match suspect [M+H]+ to MS-DIAL Average Mz within ±5 ppm

#   Keep all matches (1:m (one feature may match multiple suspects) or m:1 (many features to match the same suspect))

#   Retain useful metadata from both files (e.g. Alignment ID, MS/MS spectrum, formula, etc.)

In [None]:
# === Step 1: Load files ===
# MS-DIAL aligned feature table (positive mode) and the curated suspect list.
features_file, suspects_file = (
    "Aligned_MS1_Features_PI_MSDIAL_03.csv",
    "Curated_Suspect_List_with_Adducts.csv",
)

features_df = pd.read_csv(features_file, low_memory=False)
suspects_df = pd.read_csv(suspects_file)

In [None]:
# === Step 2: Prepare for matching ===
ppm_tolerance = 5
matches = []

# Coerce each table's m/z column to numbers (unparsable cells become NaN),
# then discard the rows left without a usable m/z.
features_df = features_df.assign(
    **{"Average Mz": pd.to_numeric(features_df["Average Mz"], errors="coerce")}
).dropna(subset=["Average Mz"])

suspects_df = suspects_df.assign(
    **{"[M+H]+": pd.to_numeric(suspects_df["[M+H]+"], errors="coerce")}
).dropna(subset=["[M+H]+"])

In [None]:
# === Step 3: Perform m/z matching ===
# For every MS-DIAL feature, collect all suspects whose [M+H]+ lies inside the
# ±ppm_tolerance window around the feature m/z. Every (feature, suspect) pair
# becomes one row in `matches`, so 1:m and m:1 hits are all kept.
for _, feature in features_df.iterrows():
    feature_mz = feature["Average Mz"]
    mz_min = feature_mz - (feature_mz * ppm_tolerance / 1e6)
    mz_max = feature_mz + (feature_mz * ppm_tolerance / 1e6)

    matches_subset = suspects_df[suspects_df["[M+H]+"].between(mz_min, mz_max)]
    if matches_subset.empty:
        continue

    # Feature-level values are identical for every suspect hit, so they are
    # computed once per feature rather than once per pair.
    feature_info = {
        "Alignment ID": feature.get("Alignment ID"),
        "Average Mz": feature_mz,
        "Average Rt(min)": feature.get("Average Rt(min)"),
        "MS/MS spectrum": feature.get("MS/MS spectrum"),
        # mean over every column whose label contains "Height"
        "Peak Height": feature.filter(like="Height").mean(),
    }

    for _, suspect in matches_subset.iterrows():
        # Mass error is expressed relative to the theoretical (suspect) m/z
        delta_ppm = abs(feature_mz - suspect["[M+H]+"]) / suspect["[M+H]+"] * 1e6
        matches.append({
            **feature_info,
            "Suspect Name": suspect.get("Name"),
            "Function": suspect.get("Function"),
            "Source": suspect.get("Source"),
            "Canonical_SMILES": suspect.get("Canonical_SMILES"),
            "Formula": suspect.get("Formula"),
            "MW": suspect.get("MW"),
            "[M+H]+": suspect.get("[M+H]+"),
            "Δppm": round(delta_ppm, 2)
        })

In [None]:
# === Step 4: Convert to DataFrame and save ===
# One row per (feature, suspect) pair collected by the matching loop.
matches_df = pd.DataFrame(matches)
matches_df.to_csv(path_or_buf="Matched_Suspects_PositiveMode.csv", index=False)

_n_matches = len(matches_df)
print(f"✅ Matching complete: {_n_matches} matches saved to 'Matched_Suspects_PositiveMode.csv'")


Predict MS/MS only for suspects that were matched to real features
Use Matched_Suspects_PositiveMode.csv

🔁 Steps:
#Add a CFMID_ID column (Molecule1, Molecule2, …)
        #Preserve the original row order and mapping
        #Ensure the CFMID_ID stays traceable to the full Matched_Suspects_PositiveMode.csv
#Keep only:
    #CFMID_ID
    #Canonical_SMILES
    #Drop duplicate Canonical_SMILES (keep first)
    #Export as tab-separated file with no header
    #Later: use CFMID_ID to merge predictions back into your full match table

In [None]:
# Steps 1-2: Load the match file and keep only rows that have a SMILES
matches_df = pd.read_csv("Matched_Suspects_PositiveMode.csv").dropna(
    subset=["Canonical_SMILES"]
)

# Step 3: Number every remaining row (Molecule1, Molecule2, ...) before
# deduplication, so the ID refers to a row of the full match table.
matches_df["CFMID_ID"] = [f"Molecule{n}" for n in range(1, len(matches_df) + 1)]

# Step 4: Save full match table with IDs (to reconnect later)
matches_df.to_csv("Matched_Suspects_PositiveMode_with_CFMID_ID.csv", index=False)

# Step 5: One line per unique structure for CFM-ID. Only the first row of
# each Canonical_SMILES is exported, so only that row's CFMID_ID appears in
# the input file.
cfmid_input = matches_df.drop_duplicates(subset=["Canonical_SMILES"], keep="first")[
    ["CFMID_ID", "Canonical_SMILES"]
]

# Step 6: Export tab-separated file (no header)
cfmid_input.to_csv(
    "CFMID_Input_PosMode_FromMatches.txt",
    sep="\t",
    index=False,
    header=False,
)

print(f"✅ Done! Input file for CFM-ID created with {len(cfmid_input)} unique SMILES.")
print("📁 File: CFMID_Input_PosMode_FromMatches.txt")
print("📁 Full match file: Matched_Suspects_PositiveMode_with_CFMID_ID.csv")


In [None]:
# Sanity check: the CFM-ID input holds one structure per line, so the line
# count is the number of SMILES submitted.
with open("CFMID_Input_PosMode_FromMatches.txt", "r") as f:
    lines = list(f)

print(f"📦 Total unique SMILES submitted to CFM-ID: {len(lines)}")

In [None]:
#Remember, you are working just with results from positive mode, you will have to repeat this part for the negative mode as well. 
#To avoid getting this script too long, I moved the part for the negative results into NI.ipynb

In [None]:
#Now download docker, and make sure that you have Windows Subsystem for Linux (wsl) installed and updated for docker to run properly
#Open docker and search wishartlab/cfmid and click on pull
#Then create a folder named cfmid_runner and put the CFMID_Input_PosMode_FromMatches.txt file inside
#Open a command prompt and run the following
#docker run --rm -v "C:/Users/User/Documents/R projects/NIAS/cfmid_runner:/cfmid/public/" -i wishartlab/cfmid:latest sh -c "cd /cfmid/public/ && cfm-predict 'CFMID_Input_PosMode_FromMatches.txt' 0.001 /trained_models_cfmid4.0/[M+H]+/param_output.log /trained_models_cfmid4.0/[M+H]+/param_config.txt 1 myout"

In [None]:
#Convert Your Predicted CFM-ID Spectra to .msp
#The following script will
#Loop through all MoleculeX.txt files in the myout folder and extract:
        #ID, SMILES, InChiKey, PMass
        #Fragment peaks from your selected energy level (e.g. energy1 = 20 eV)
        #Format it into proper .msp
        #Save to a single file like cfmid_predicted_20eV.msp
#You have to run this 3 times, each time for a separate energy level

In [None]:
import os
import re

# Convert the CFM-ID prediction files (one MoleculeX.txt per structure) into a
# single .msp library holding the spectrum of ONE collision-energy block.
input_folder = "cfmid_runner/myout"  # Folder with MoleculeX.txt files
output_msp = "cfmid_predicted_10eV.msp"
energy_level = "energy0"  # block to export: energy0 / energy1 / energy2 = 10 / 20 / 40 eV

# CFM-ID header prefix -> field name used for the .msp record
header_fields = {
    "#ID=": "Name",
    "#SMILES=": "SMILES",
    "#InChiKey=": "InChIKey",
    "#Formula=": "Formula",
    "#PMass=": "PrecursorMZ",
}

entries = []

# Sorted listing keeps the record order reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("Molecule"):
        continue

    with open(os.path.join(input_folder, filename), "r", encoding="utf-8") as f:
        lines = f.readlines()

    metadata = dict.fromkeys(header_fields.values())
    peaks = []
    current_energy = None

    for line in lines:
        line = line.strip()

        if not line:
            # A blank line closes the spectrum section. CFM-ID lists its
            # fragment annotations after it, and those lines also start with
            # a digit, so they must not be read as peaks of the last block.
            current_energy = None
            continue

        prefix = next((p for p in header_fields if line.startswith(p)), None)
        if prefix is not None:
            # Cut off the prefix rather than splitting on "=": SMILES strings
            # use "=" for double bonds and would otherwise be truncated.
            metadata[header_fields[prefix]] = line[len(prefix):].strip()
        elif line.lower().startswith("energy"):
            # Start of an energy block
            current_energy = line.lower()
        elif current_energy == energy_level and re.match(r"^\d", line):
            # Peak line inside the requested block: "<m/z> <intensity> ..."
            parts = line.split()
            try:
                peaks.append((float(parts[0]), float(parts[1])))
            except (ValueError, IndexError):
                continue  # not an "m/z intensity" pair

    if not peaks or metadata["Name"] is None:
        continue

    try:
        precursor_mz = float(metadata["PrecursorMZ"])
    except (TypeError, ValueError):
        print(f"⚠️ Skipping {filename} — missing or invalid #PMass")
        continue

    entry = [
        f"Name: {metadata['Name']}",
        f"PrecursorMZ: {precursor_mz:.5f}",
    ]
    # Optional fields are written only when the file provided them.
    entry.extend(
        f"{field}: {metadata[field]}"
        for field in ("SMILES", "InChIKey", "Formula")
        if metadata[field] is not None
    )
    entry.append("Num Peaks: " + str(len(peaks)))
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)

    entries.append("\n".join(entry))

# Save .msp (records separated by one blank line)
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"✅ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level}")


In [None]:
# Convert the CFM-ID prediction files (one MoleculeX.txt per structure) into a
# single .msp library holding the spectrum of ONE collision-energy block.
input_folder = "cfmid_runner/myout"  # Folder with MoleculeX.txt files
output_msp = "cfmid_predicted_20eV.msp"
energy_level = "energy1"  # block to export: energy0 / energy1 / energy2 = 10 / 20 / 40 eV

# CFM-ID header prefix -> field name used for the .msp record
header_fields = {
    "#ID=": "Name",
    "#SMILES=": "SMILES",
    "#InChiKey=": "InChIKey",
    "#Formula=": "Formula",
    "#PMass=": "PrecursorMZ",
}

entries = []

# Sorted listing keeps the record order reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("Molecule"):
        continue

    with open(os.path.join(input_folder, filename), "r", encoding="utf-8") as f:
        lines = f.readlines()

    metadata = dict.fromkeys(header_fields.values())
    peaks = []
    current_energy = None

    for line in lines:
        line = line.strip()

        if not line:
            # A blank line closes the spectrum section. CFM-ID lists its
            # fragment annotations after it, and those lines also start with
            # a digit, so they must not be read as peaks of the last block.
            current_energy = None
            continue

        prefix = next((p for p in header_fields if line.startswith(p)), None)
        if prefix is not None:
            # Cut off the prefix rather than splitting on "=": SMILES strings
            # use "=" for double bonds and would otherwise be truncated.
            metadata[header_fields[prefix]] = line[len(prefix):].strip()
        elif line.lower().startswith("energy"):
            # Start of an energy block
            current_energy = line.lower()
        elif current_energy == energy_level and re.match(r"^\d", line):
            # Peak line inside the requested block: "<m/z> <intensity> ..."
            parts = line.split()
            try:
                peaks.append((float(parts[0]), float(parts[1])))
            except (ValueError, IndexError):
                continue  # not an "m/z intensity" pair

    if not peaks or metadata["Name"] is None:
        continue

    try:
        precursor_mz = float(metadata["PrecursorMZ"])
    except (TypeError, ValueError):
        print(f"⚠️ Skipping {filename} — missing or invalid #PMass")
        continue

    entry = [
        f"Name: {metadata['Name']}",
        f"PrecursorMZ: {precursor_mz:.5f}",
    ]
    # Optional fields are written only when the file provided them.
    entry.extend(
        f"{field}: {metadata[field]}"
        for field in ("SMILES", "InChIKey", "Formula")
        if metadata[field] is not None
    )
    entry.append("Num Peaks: " + str(len(peaks)))
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)

    entries.append("\n".join(entry))

# Save .msp (records separated by one blank line)
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"✅ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level}")


In [None]:
# Convert the CFM-ID prediction files (one MoleculeX.txt per structure) into a
# single .msp library holding the spectrum of ONE collision-energy block.
input_folder = "cfmid_runner/myout"  # Folder with MoleculeX.txt files
output_msp = "cfmid_predicted_40eV.msp"
energy_level = "energy2"  # block to export: energy0 / energy1 / energy2 = 10 / 20 / 40 eV

# CFM-ID header prefix -> field name used for the .msp record
header_fields = {
    "#ID=": "Name",
    "#SMILES=": "SMILES",
    "#InChiKey=": "InChIKey",
    "#Formula=": "Formula",
    "#PMass=": "PrecursorMZ",
}

entries = []

# Sorted listing keeps the record order reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("Molecule"):
        continue

    with open(os.path.join(input_folder, filename), "r", encoding="utf-8") as f:
        lines = f.readlines()

    metadata = dict.fromkeys(header_fields.values())
    peaks = []
    current_energy = None

    for line in lines:
        line = line.strip()

        if not line:
            # A blank line closes the spectrum section. CFM-ID lists its
            # fragment annotations after it, and those lines also start with
            # a digit; since energy2 is the last block, they would otherwise
            # be appended to this spectrum as bogus peaks.
            current_energy = None
            continue

        prefix = next((p for p in header_fields if line.startswith(p)), None)
        if prefix is not None:
            # Cut off the prefix rather than splitting on "=": SMILES strings
            # use "=" for double bonds and would otherwise be truncated.
            metadata[header_fields[prefix]] = line[len(prefix):].strip()
        elif line.lower().startswith("energy"):
            # Start of an energy block
            current_energy = line.lower()
        elif current_energy == energy_level and re.match(r"^\d", line):
            # Peak line inside the requested block: "<m/z> <intensity> ..."
            parts = line.split()
            try:
                peaks.append((float(parts[0]), float(parts[1])))
            except (ValueError, IndexError):
                continue  # not an "m/z intensity" pair

    if not peaks or metadata["Name"] is None:
        continue

    try:
        precursor_mz = float(metadata["PrecursorMZ"])
    except (TypeError, ValueError):
        print(f"⚠️ Skipping {filename} — missing or invalid #PMass")
        continue

    entry = [
        f"Name: {metadata['Name']}",
        f"PrecursorMZ: {precursor_mz:.5f}",
    ]
    # Optional fields are written only when the file provided them.
    entry.extend(
        f"{field}: {metadata[field]}"
        for field in ("SMILES", "InChIKey", "Formula")
        if metadata[field] is not None
    )
    entry.append("Num Peaks: " + str(len(peaks)))
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)

    entries.append("\n".join(entry))

# Save .msp (records separated by one blank line)
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"✅ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level}")


In [None]:
import os
import re
import pandas as pd

# === Load metadata CSV ===
# The precursor m/z, SMILES and formula written to the .msp come from the
# match table (looked up by CFMID_ID), not from the prediction files.
meta = pd.read_csv("Matched_Suspects_PositiveMode_with_CFMID_ID.csv", dtype=str)
meta = meta.set_index("CFMID_ID")

# === Define columns ===
smiles_col = "Canonical_SMILES"
formula_col = "Formula"
precursor_col = "[M+H]+"

# === File paths ===
input_folder = "C:/download"
output_msp = "cfmid_combined_all_energies.msp"

# === Start processing ===
entries = []
skipped = 0

# Sorted listing keeps the record order reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("Molecule") or not filename.endswith(".txt"):
        continue

    molecule_id = filename.replace(".txt", "")
    filepath = os.path.join(input_folder, filename)

    if molecule_id not in meta.index:
        print(f"⚠️ Skipping {molecule_id} — not in metadata")
        skipped += 1
        continue

    try:
        precursor_mz = float(meta.loc[molecule_id, precursor_col])
    except (KeyError, TypeError, ValueError):
        precursor_mz = float("nan")
    if pd.isna(precursor_mz):
        # An empty cell is read as NaN, which float() accepts without error.
        print(f"⚠️ Skipping {molecule_id} — missing precursor m/z in metadata")
        skipped += 1
        continue

    with open(filepath, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Peaks from every energy block are pooled into one list. When the file
    # has "energy…" headers, only lines inside those blocks are read: the
    # fragment-annotation list that follows the blank line also starts with a
    # digit and must not be mistaken for peaks. Files without any "energy…"
    # header are read as before (every digit-leading line).
    peaks = []
    seen_energy_header = False
    in_energy_block = False
    for line in lines:
        line = line.strip()
        if not line:
            in_energy_block = False
            continue
        if line.lower().startswith("energy"):
            seen_energy_header = True
            in_energy_block = True
            continue
        if seen_energy_header and not in_energy_block:
            continue
        if re.match(r"^\d", line):  # Fragment line
            parts = line.split()
            try:
                peaks.append((float(parts[0]), float(parts[1])))
            except (ValueError, IndexError):
                continue  # not an "m/z intensity" pair

    if not peaks:
        print(f"⚠️ Skipping {filename} — no fragments found")
        skipped += 1
        continue

    smiles = meta.loc[molecule_id, smiles_col]
    formula = meta.loc[molecule_id, formula_col] if formula_col in meta.columns else "N/A"

    entry = [
        f"Name: {molecule_id}",
        f"PrecursorMZ: {precursor_mz:.5f}",
        f"SMILES: {smiles}",
        f"Formula: {formula}",
        f"Num Peaks: {len(peaks)}"
    ]
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)

    entries.append("\n".join(entry))

# === Write combined output ===
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"✅ Combined spectra saved to '{output_msp}' ({len(entries)} entries)")
print(f"⚠️ Skipped {skipped} files due to issues")

In [None]:
#Up until now we were working on the predicted spectra.
#Now move on to Python 3.10 and the script "extract_msms_from_experimental_spectra" to create the experimental spectra .msp, then come back for the matching part

In [None]:
from matchms.importing import load_from_msp
from matchms.similarity import CosineGreedy
import pandas as pd

# === File paths ===
exp_file, cfmid_file, output_csv = (
    "Extracted_MS2_Spectra_PosMode.msp",
    "cfmid_online_predicted_20eV.msp",
    "Spectral_Matches_PosMode_vs_online_predicted_20eV.csv",
)

# === Load spectra ===
# load_from_msp yields spectra lazily; materialise both libraries as lists.
experimental = [*load_from_msp(exp_file)]
predicted = [*load_from_msp(cfmid_file)]

print(f"✅ Loaded {len(experimental)} experimental and {len(predicted)} predicted spectra")



In [None]:
import numpy as np
import pandas as pd
from matchms.importing import load_from_msp

# === Custom cosine similarity function using PPM ===
def cosine_similarity_ppm(spec1, spec2, ppm_tol=5):
    """Score two spectra by the cosine of their ppm-matched fragment intensities.

    Peaks are paired with a two-pointer sweep over both m/z lists; a pair
    matches when the m/z values differ by at most ``ppm_tol`` ppm (relative to
    the ``spec1`` peak). Pairs lying within 10 ppm of a precursor m/z are
    discarded so the precursor ion cannot contribute to the score.

    Args:
        spec1, spec2: objects exposing ``.mz``, ``.intensities`` and
            ``.get("precursor_mz")``.
        ppm_tol: fragment m/z tolerance in ppm.

    Returns:
        ``(cosine_score, n_matched_peaks)``; ``(0.0, 0)`` when either spectrum
        is empty or no peak pair matches.

    Note:
        Both norms are taken over the matched peaks only, so unmatched peaks do
        not lower the score and a single matched pair always scores 1.0.
    """
    precursor_tol = 10  # ppm window around precursor to ignore

    def _near_precursor(mz, precursor):
        # A missing (None/0) precursor never excludes a peak.
        return bool(precursor) and abs(mz - precursor) / precursor * 1e6 <= precursor_tol

    mz1 = np.asarray(spec1.mz, dtype=float)
    intens1 = np.asarray(spec1.intensities, dtype=float)
    mz2 = np.asarray(spec2.mz, dtype=float)
    intens2 = np.asarray(spec2.intensities, dtype=float)

    # .max() on an empty array raises, and there is nothing to match anyway.
    if mz1.size == 0 or mz2.size == 0:
        return 0.0, 0

    precursor1 = spec1.get("precursor_mz")
    # Each spectrum is checked against its own precursor; fall back to the
    # first spectrum's value when the second has none.
    precursor2 = spec2.get("precursor_mz") or precursor1

    # The sweep below is only correct on ascending m/z, so sort explicitly
    # (a no-op for already-sorted input).
    order1 = np.argsort(mz1, kind="stable")
    mz1, intens1 = mz1[order1], intens1[order1]
    order2 = np.argsort(mz2, kind="stable")
    mz2, intens2 = mz2[order2], intens2[order2]

    # Normalize intensities to relative scale (0–1)
    intens1 = intens1 / intens1.max() if intens1.max() > 0 else intens1
    intens2 = intens2 / intens2.max() if intens2.max() > 0 else intens2

    i, j = 0, 0
    matched1, matched2 = [], []

    while i < len(mz1) and j < len(mz2):
        ppm_diff = abs(mz1[i] - mz2[j]) / mz1[i] * 1e6
        if ppm_diff <= ppm_tol:
            is_precursor_pair = (
                _near_precursor(mz1[i], precursor1)
                or _near_precursor(mz2[j], precursor2)
            )
            if not is_precursor_pair:
                matched1.append(intens1[i])
                matched2.append(intens2[j])
            # Matched or skipped, both peaks are consumed.
            i += 1
            j += 1
        elif mz1[i] < mz2[j]:
            i += 1
        else:
            j += 1

    if len(matched1) == 0:
        return 0.0, 0

    # Compute cosine similarity
    dot = np.dot(matched1, matched2)
    norm1 = np.linalg.norm(matched1)
    norm2 = np.linalg.norm(matched2)
    cosine_score = dot / (norm1 * norm2) if norm1 > 0 and norm2 > 0 else 0.0

    return float(cosine_score), len(matched1)

In [None]:
# === Load spectra ===
exp_file, cfmid_file, output_csv = (
    "Extracted_MS2_Spectra_PosMode.msp",
    "cfmid_predicted_20eV.msp",
    "Spectral_Matches_PosMode_vs_20eV_ppm10.csv",
)

# load_from_msp yields spectra lazily; materialise both libraries as lists.
experimental = [*load_from_msp(exp_file)]
predicted = [*load_from_msp(cfmid_file)]

print(f"✅ Loaded {len(experimental)} experimental and {len(predicted)} predicted spectra")

In [None]:
# === Match spectra using PPM-based cosine ===
precursor_ppm = 5  # stricter precursor filter
fragment_ppm = 5  # used in the cosine similarity function

results = []

# Predicted spectra that carry a precursor m/z, paired with that value so it
# is looked up once instead of once per experimental spectrum.
_candidates = [
    (candidate, candidate.get("precursor_mz"))
    for candidate in predicted
]
_candidates = [pair for pair in _candidates if pair[1] is not None]

for exp_spec in experimental:
    mz_exp = exp_spec.get("precursor_mz")
    if mz_exp is None:
        continue

    for pred_spec, mz_pred in _candidates:
        # Step 1: precursor filter (ppm relative to the experimental m/z)
        if abs(mz_exp - mz_pred) / mz_exp * 1e6 > precursor_ppm:
            continue

        # Step 2: compare fragments using cosine with ppm tolerance
        score, n_matches = cosine_similarity_ppm(exp_spec, pred_spec, ppm_tol=fragment_ppm)
        if not score > 0.5:
            continue

        predicted_name = pred_spec.get("name") or pred_spec.get("compound_name")
        results.append({
            "Feature_ID": exp_spec.get("feature_id"),
            "Experimental_mz": mz_exp,
            "RT_min": exp_spec.get("retention_time"),
            "Predicted_Name": predicted_name,
            "Predicted_mz": mz_pred,
            "SMILES": pred_spec.get("smiles"),
            "InChIKey": pred_spec.get("inchikey"),
            "Cosine_Score": round(score, 4),
            "Num_Matching_Peaks": n_matches
        })


In [None]:
# Display the current value of ppm_tolerance. This is the MS1 suspect-matching
# tolerance set in the "Step 2: Prepare for matching" cell, not the
# precursor_ppm / fragment_ppm values used for spectral matching.
ppm_tolerance

In [None]:
# === Save to CSV ===
# Best-scoring spectral matches first.
df = pd.DataFrame(results)
if not df.empty:
    # With zero matches the frame has no columns and sorting would raise.
    df = df.sort_values("Cosine_Score", ascending=False)
df.to_csv(output_csv, index=False)

# Report the tolerances that were actually applied by the spectral matching
# cell (precursor_ppm / fragment_ppm), not the MS1 ppm_tolerance.
print(
    f"✅ Done! Found {len(df)} matches with cosine > 0.5 "
    f"(precursor ±{precursor_ppm} ppm, fragments ±{fragment_ppm} ppm)"
)
print(f"📁 Results saved to: {output_csv}")

In [None]:
# === Keep only the best match per Feature_ID based on NUM_MATCHING_PEAKS ===
# Rank by number of matching peaks, then by cosine score, BEFORE dropping
# duplicates: that way a tie on peak count is resolved in favour of the higher
# cosine score instead of whichever row the sort happened to place first.
best_matches_df = df.sort_values(
    ["Num_Matching_Peaks", "Cosine_Score"], ascending=[False, False]
).drop_duplicates("Feature_ID", keep="first")

# ❌ Remove meaningless perfect matches with only 1 peak
# (a single matched pair always yields a cosine of exactly 1.0)
best_matches_df = best_matches_df[
    ~((best_matches_df["Num_Matching_Peaks"] == 1) & (best_matches_df["Cosine_Score"] == 1.0))
]

# 🧹 Remove duplicate predicted compounds (the best-ranked feature keeps it)
best_matches_df = best_matches_df.drop_duplicates(subset="Predicted_Name", keep="first")

# 💾 Save to CSV
best_matches_df.to_csv("Top_Matches_By_NumPeaks_PosMode_ppm5ppm10.csv", index=False)

print(f"✅ Final curated matches: {len(best_matches_df)} unique features + predicted compounds retained.")


In [None]:
######Next step is Retip, which I will be running in R. 
#Retip needs training set with known compounds and their respective Rt to create the model and the suspect screening results to predict the rt based on the proposed structure
#Prepare the data for Retip as follows

In [None]:
import pandas as pd

# === Load files ===
matches_file = "Top_Matches_By_NumPeaks_PosMode_ppm5ppm10.csv"
cfmid_file = "Matched_Suspects_PositiveMode_with_CFMID_ID.csv"

matches_df = pd.read_csv(matches_file)
cfmid_df = pd.read_csv(cfmid_file)

# === Standardize key columns ===
# Both join keys become whitespace-trimmed strings so they compare equal.
for _table, _key in ((cfmid_df, "CFMID_ID"), (matches_df, "Predicted_Name")):
    _table[_key] = _table[_key].astype(str).str.strip()

# === Merge on CFMID ID ===
# Left join: every top match is kept, with the canonical SMILES attached
# where its Predicted_Name equals a CFMID_ID.
_smiles_lookup = cfmid_df[["CFMID_ID", "Canonical_SMILES"]]
merged = matches_df.merge(
    _smiles_lookup,
    how="left",
    left_on="Predicted_Name",
    right_on="CFMID_ID",
)

# Optional: move Canonical_SMILES column next to SMILES for clarity
cols = list(merged.columns)
if "SMILES" in cols and "Canonical_SMILES" in cols:
    smi_idx = cols.index("SMILES")
    cols.remove("Canonical_SMILES")
    cols.insert(smi_idx + 1, "Canonical_SMILES")
    merged = merged[cols]

# === Save result ===
merged.to_csv("Top_Matches_Annotated_PosMode.csv", index=False)

print(f"✅ Done! Canonical SMILES added. Final shape: {merged.shape}")
print("📁 Saved to: Top_Matches_Annotated_PosMode.csv")


In [None]:
# Load annotated matches
df = pd.read_csv("Top_Matches_Annotated_PosMode.csv")

# Columns not carried into the Retip export (absent ones are ignored)
columns_to_drop = [
    "Predicted_Name", "Predicted_mz", "SMILES", "Cosine_Score", "Num_Matching_Peaks"
]

# Drop them, then rename Canonical_SMILES → SMILES
df = df.drop(columns=columns_to_drop, errors="ignore").rename(
    columns={"Canonical_SMILES": "SMILES"}
)

# Save to new CSV
df.to_csv("suspect_for_retip.csv", index=False)

_n_suspects = df.shape[0]
print(f"✅ Saved suspect list for Retip prediction: {_n_suspects} entries")
print("📁 File: suspect_for_retip.csv")


In [None]:
import sys
# Report which interpreter is running and its version string.
for _detail in (sys.executable, sys.version):
    print(_detail)
