Match: loads the MS-DIAL feature table and the suspect list for the negative ionization results

   Match suspect [M-H]- to MS-DIAL Average Mz within ±5 ppm

   Keep all matches, whether 1:m (one feature may match multiple suspects) or m:1 (many features may match the same suspect)

   Retain useful metadata from both files (e.g. Alignment ID, MS/MS spectrum, formula, etc.)

In [None]:
import os
import pandas as pd

In [None]:
# === Step 1: Load files ===
# MS-DIAL aligned feature table. low_memory=False makes pandas read the file
# in one pass instead of inferring column dtypes chunk by chunk.
features_file = "Aligned_MS1_Features_NI_MSDIAL.csv"
features_df = pd.read_csv(filepath_or_buffer=features_file, low_memory=False)

# Curated suspect list with its adduct m/z columns.
suspects_file = "Curated_Suspect_List_with_Adducts.csv"
suspects_df = pd.read_csv(filepath_or_buffer=suspects_file)

In [None]:
# === Step 2: Prepare for matching ===
ppm_tolerance = 5  # half-width of the matching window, in ppm
matches = []       # one dict per feature/suspect pair found by the matching step

# Names of the two m/z columns that are compared against each other.
feature_mz_column = "Average Mz"
suspect_mz_column = "[M-H]-"

# MS-DIAL features: coerce m/z to a number (unparseable values become NaN),
# then discard every row that has no usable m/z.
features_df[feature_mz_column] = pd.to_numeric(
    features_df[feature_mz_column], errors="coerce"
)
features_df = features_df.dropna(subset=[feature_mz_column])

# Suspect list: same treatment for the [M-H]- column.
suspects_df[suspect_mz_column] = pd.to_numeric(
    suspects_df[suspect_mz_column], errors="coerce"
)
suspects_df = suspects_df.dropna(subset=[suspect_mz_column])

In [None]:
# === Step 3: Perform m/z matching ===
# For every MS-DIAL feature, collect every suspect whose [M-H]- m/z falls
# inside the feature's +/- ppm_tolerance window. Every pair is kept, so one
# feature may match several suspects and one suspect may match several features.

# Start from an empty list so that re-running this cell does not append a
# second copy of every match on top of the previous run's results.
matches = []

suspect_mz = suspects_df["[M-H]-"]

for _, feature in features_df.iterrows():
    feature_mz = feature["Average Mz"]

    # The window is computed relative to the measured (feature) m/z.
    mz_window = feature_mz * ppm_tolerance / 1e6
    in_window = suspect_mz.between(feature_mz - mz_window, feature_mz + mz_window)
    if not in_window.any():
        continue

    # Mean over every column whose name contains "Height"; it is the same for
    # all suspects matched to this feature, so compute it once per feature.
    peak_height = feature.filter(like="Height").mean()

    for _, suspect in suspects_df[in_window].iterrows():
        # The reported error is relative to the suspect (theoretical) m/z.
        delta_ppm = abs(feature_mz - suspect["[M-H]-"]) / suspect["[M-H]-"] * 1e6
        matches.append({
            "Alignment ID": feature.get("Alignment ID"),
            "Average Mz": feature_mz,
            "Average Rt(min)": feature.get("Average Rt(min)"),
            "MS/MS spectrum": feature.get("MS/MS spectrum"),
            "Peak Height": peak_height,  # average across sample columns
            "Suspect Name": suspect.get("Name"),
            "Function": suspect.get("Function"),
            "Source": suspect.get("Source"),
            "Canonical_SMILES": suspect.get("Canonical_SMILES"),
            "Formula": suspect.get("Formula"),
            "MW": suspect.get("MW"),
            "[M-H]-": suspect.get("[M-H]-"),
            "Δppm": round(delta_ppm, 2)
        })

In [None]:
# === Step 4: Convert to DataFrame and save ===
# One row per collected feature/suspect pair; the row index is not written.
matches_df = pd.DataFrame(data=matches)
matches_df.to_csv(path_or_buf="Matched_Suspects_NegativeMode.csv", index=False)

print(f"✅ Matching complete: {len(matches_df)} matches saved to 'Matched_Suspects_NegativeMode.csv'")


In [None]:
# Build the CFM-ID input (one line per unique SMILES) together with a full
# match table that carries the CFM-ID identifier of every row.

# Step 1: Load match file
matches_df = pd.read_csv("Matched_Suspects_NegativeMode.csv")

# Step 2: Drop rows without SMILES
matches_df = matches_df.dropna(subset=["Canonical_SMILES"])

# Step 3: Add CFM-ID identifiers before deduplication
matches_df["CFMID_ID"] = [f"Molecule{i}" for i in range(1, len(matches_df) + 1)]

# Only the first row of each SMILES is sent to CFM-ID (Step 5). Give every
# other row with the same SMILES that first row's ID, so that each row of the
# full table points at an identifier that is present in the CFM-ID input.
# The IDs written to the CFM-ID input file are unchanged by this.
matches_df["CFMID_ID"] = (
    matches_df.groupby("Canonical_SMILES")["CFMID_ID"].transform("first")
)

# Step 4: Save full match table with IDs (to reconnect later)
matches_df.to_csv("Matched_Suspects_NegativeMode_with_CFMID_ID.csv", index=False)

# Step 5: Deduplicate by Canonical_SMILES for CFM-ID prediction
cfmid_input = matches_df.drop_duplicates(subset=["Canonical_SMILES"], keep="first")
cfmid_input = cfmid_input[["CFMID_ID", "Canonical_SMILES"]]

# Step 6: Export tab-separated file (no header)
cfmid_input.to_csv("CFMID_Input_NegMode_FromMatches.txt", sep="\t", index=False, header=False)

print(f"✅ Done! Input file for CFM-ID created with {len(cfmid_input)} unique SMILES.")
print("📁 File: CFMID_Input_NegMode_FromMatches.txt")
print("📁 Full match file: Matched_Suspects_NegativeMode_with_CFMID_ID.csv")


In [None]:
# Re-read the exported CFM-ID input file and report how many lines it holds
# (the file is written without a header, one record per line).
with open("CFMID_Input_NegMode_FromMatches.txt", "r") as cfmid_file:
    lines = list(cfmid_file)

print(f"📦 Total unique SMILES submitted to CFM-ID: {len(lines)}")

In [None]:
# Convert the CFM-ID prediction files in `input_folder` into a single MSP
# library that contains the peaks of one energy block per molecule.
import os
import re

input_folder = "cfmid_runner/NI/myout_neg"  # Folder with MoleculeX.txt files
output_msp = "cfmid_predicted_NI_10eV.msp"
energy_level = "energy0"  # block to export: energy0 / energy1 / energy2 = 10 / 20 / 40 eV

entries = []

# os.listdir returns names in arbitrary order; sort them so the MSP file is
# written in the same order on every run.
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("Molecule"):
        continue

    with open(os.path.join(input_folder, filename), "r") as f:
        lines = f.readlines()

    metadata = {
        "Name": None,
        "SMILES": None,
        "InChIKey": None,
        "Formula": None,
        "PrecursorMZ": None
    }

    peaks = []
    current_energy = None

    for line in lines:
        line = line.strip()

        if not line:
            # A blank line closes the current energy block, so digit-leading
            # lines that follow it are not read as peaks.
            # NOTE(review): assumes CFM-ID puts a blank line between the last
            # energy block and any trailing fragment table, and none inside a
            # block - confirm against the output files.
            current_energy = None

        # Extract metadata. Split on the first "=" only: the value itself may
        # contain "=" (SMILES double bonds) and must not be truncated.
        elif line.startswith("#ID="):
            metadata["Name"] = line.split("=", 1)[1].strip()
        elif line.startswith("#SMILES="):
            metadata["SMILES"] = line.split("=", 1)[1].strip()
        elif line.startswith("#InChiKey="):
            metadata["InChIKey"] = line.split("=", 1)[1].strip()
        elif line.startswith("#Formula="):
            metadata["Formula"] = line.split("=", 1)[1].strip()
        elif line.startswith("#PMass="):
            metadata["PrecursorMZ"] = float(line.split("=", 1)[1].strip())

        # Set energy block (e.g. energy1 = 20 eV)
        elif line.lower().startswith("energy"):
            current_energy = line.lower()

        # Read peaks if we're in the right energy section
        elif current_energy == energy_level and re.match(r"^\d", line):
            parts = line.split()
            try:
                peaks.append((float(parts[0]), float(parts[1])))
            except (IndexError, ValueError):
                # Fewer than two fields, or a non-numeric field: not a peak line.
                continue

    # Skip files without an ID or without peaks at the requested energy.
    if not peaks or metadata["Name"] is None:
        continue

    entry = [f"Name: {metadata['Name']}"]
    # Formatting None with ":.5f" raises TypeError, so the line is only
    # written when the file actually provided a #PMass value.
    if metadata["PrecursorMZ"] is not None:
        entry.append(f"PrecursorMZ: {metadata['PrecursorMZ']:.5f}")
    entry.append(f"SMILES: {metadata['SMILES']}")
    entry.append(f"InChIKey: {metadata['InChIKey']}")
    entry.append(f"Formula: {metadata['Formula']}")
    entry.append("Num Peaks: " + str(len(peaks)))
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)

    entries.append("\n".join(entry))

# Save .msp (entries separated by one blank line)
with open(output_msp, "w") as out_f:
    out_f.write("\n\n".join(entries))

print(f"✅ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level}")


In [None]:
# Convert the CFM-ID prediction files in `input_folder` into a single MSP
# library that contains the peaks of one energy block per molecule.
import os
import re

input_folder = "cfmid_runner/NI/myout_neg"  # Folder with MoleculeX.txt files
output_msp = "cfmid_predicted_NI_20eV.msp"
energy_level = "energy1"  # block to export: energy0 / energy1 / energy2 = 10 / 20 / 40 eV

entries = []

# os.listdir returns names in arbitrary order; sort them so the MSP file is
# written in the same order on every run.
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("Molecule"):
        continue

    with open(os.path.join(input_folder, filename), "r") as f:
        lines = f.readlines()

    metadata = {
        "Name": None,
        "SMILES": None,
        "InChIKey": None,
        "Formula": None,
        "PrecursorMZ": None
    }

    peaks = []
    current_energy = None

    for line in lines:
        line = line.strip()

        if not line:
            # A blank line closes the current energy block, so digit-leading
            # lines that follow it are not read as peaks.
            # NOTE(review): assumes CFM-ID puts a blank line between the last
            # energy block and any trailing fragment table, and none inside a
            # block - confirm against the output files.
            current_energy = None

        # Extract metadata. Split on the first "=" only: the value itself may
        # contain "=" (SMILES double bonds) and must not be truncated.
        elif line.startswith("#ID="):
            metadata["Name"] = line.split("=", 1)[1].strip()
        elif line.startswith("#SMILES="):
            metadata["SMILES"] = line.split("=", 1)[1].strip()
        elif line.startswith("#InChiKey="):
            metadata["InChIKey"] = line.split("=", 1)[1].strip()
        elif line.startswith("#Formula="):
            metadata["Formula"] = line.split("=", 1)[1].strip()
        elif line.startswith("#PMass="):
            metadata["PrecursorMZ"] = float(line.split("=", 1)[1].strip())

        # Set energy block (e.g. energy1 = 20 eV)
        elif line.lower().startswith("energy"):
            current_energy = line.lower()

        # Read peaks if we're in the right energy section
        elif current_energy == energy_level and re.match(r"^\d", line):
            parts = line.split()
            try:
                peaks.append((float(parts[0]), float(parts[1])))
            except (IndexError, ValueError):
                # Fewer than two fields, or a non-numeric field: not a peak line.
                continue

    # Skip files without an ID or without peaks at the requested energy.
    if not peaks or metadata["Name"] is None:
        continue

    entry = [f"Name: {metadata['Name']}"]
    # Formatting None with ":.5f" raises TypeError, so the line is only
    # written when the file actually provided a #PMass value.
    if metadata["PrecursorMZ"] is not None:
        entry.append(f"PrecursorMZ: {metadata['PrecursorMZ']:.5f}")
    entry.append(f"SMILES: {metadata['SMILES']}")
    entry.append(f"InChIKey: {metadata['InChIKey']}")
    entry.append(f"Formula: {metadata['Formula']}")
    entry.append("Num Peaks: " + str(len(peaks)))
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)

    entries.append("\n".join(entry))

# Save .msp (entries separated by one blank line)
with open(output_msp, "w") as out_f:
    out_f.write("\n\n".join(entries))

print(f"✅ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level}")


In [None]:
# Convert the CFM-ID prediction files in `input_folder` into a single MSP
# library that contains the peaks of one energy block per molecule.
import os
import re

input_folder = "cfmid_runner/NI/myout_neg"  # Folder with MoleculeX.txt files
output_msp = "cfmid_predicted_NI_40eV.msp"
energy_level = "energy2"  # block to export: energy0 / energy1 / energy2 = 10 / 20 / 40 eV

entries = []

# os.listdir returns names in arbitrary order; sort them so the MSP file is
# written in the same order on every run.
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("Molecule"):
        continue

    with open(os.path.join(input_folder, filename), "r") as f:
        lines = f.readlines()

    metadata = {
        "Name": None,
        "SMILES": None,
        "InChIKey": None,
        "Formula": None,
        "PrecursorMZ": None
    }

    peaks = []
    current_energy = None

    for line in lines:
        line = line.strip()

        if not line:
            # A blank line closes the current energy block, so digit-leading
            # lines that follow it are not read as peaks. This matters most
            # for the last energy block, which no later "energy" header ends.
            # NOTE(review): assumes CFM-ID puts a blank line between the last
            # energy block and any trailing fragment table, and none inside a
            # block - confirm against the output files.
            current_energy = None

        # Extract metadata. Split on the first "=" only: the value itself may
        # contain "=" (SMILES double bonds) and must not be truncated.
        elif line.startswith("#ID="):
            metadata["Name"] = line.split("=", 1)[1].strip()
        elif line.startswith("#SMILES="):
            metadata["SMILES"] = line.split("=", 1)[1].strip()
        elif line.startswith("#InChiKey="):
            metadata["InChIKey"] = line.split("=", 1)[1].strip()
        elif line.startswith("#Formula="):
            metadata["Formula"] = line.split("=", 1)[1].strip()
        elif line.startswith("#PMass="):
            metadata["PrecursorMZ"] = float(line.split("=", 1)[1].strip())

        # Set energy block (e.g. energy1 = 20 eV)
        elif line.lower().startswith("energy"):
            current_energy = line.lower()

        # Read peaks if we're in the right energy section
        elif current_energy == energy_level and re.match(r"^\d", line):
            parts = line.split()
            try:
                peaks.append((float(parts[0]), float(parts[1])))
            except (IndexError, ValueError):
                # Fewer than two fields, or a non-numeric field: not a peak line.
                continue

    # Skip files without an ID or without peaks at the requested energy.
    if not peaks or metadata["Name"] is None:
        continue

    entry = [f"Name: {metadata['Name']}"]
    # Formatting None with ":.5f" raises TypeError, so the line is only
    # written when the file actually provided a #PMass value.
    if metadata["PrecursorMZ"] is not None:
        entry.append(f"PrecursorMZ: {metadata['PrecursorMZ']:.5f}")
    entry.append(f"SMILES: {metadata['SMILES']}")
    entry.append(f"InChIKey: {metadata['InChIKey']}")
    entry.append(f"Formula: {metadata['Formula']}")
    entry.append("Num Peaks: " + str(len(peaks)))
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)

    entries.append("\n".join(entry))

# Save .msp (entries separated by one blank line)
with open(output_msp, "w") as out_f:
    out_f.write("\n\n".join(entries))

print(f"✅ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level}")


In [None]:
# Report which Python interpreter runs this notebook and its version string,
# one value per line.
import sys
print(sys.executable, sys.version, sep="\n")
