✅ Scans all .mzML files in a folder
✅ Extracts all MS2 precursor m/z values
✅ Stores:
    the original m/z
    the rounded m/z (to 4 decimals)
    the filename it came from
    polarity of the precursor (plus the scan filter string it was read from)
✅ Sorts by rounded m/z
✅ Removes duplicate rounded m/z values within each polarity, keeping the first occurrence
✅ Saves to a CSV: All_MSMS_Precursors.csv

In [None]:
from lxml import etree
from pathlib import Path
import pandas as pd

# Folder that holds the mzML files to scan, and the CSV this cell writes.
mzml_folder = Path(r"D:\Stefania Dalagka\mzml")
out_file = "All_MSMS_Precursors.csv"

# mzML files
mzml_files = sorted(mzml_folder.glob("*.mzML"))
if not mzml_files:
    raise FileNotFoundError("‚ùå No mzML files found in the folder.")

ns = {"ns": "http://psi.hupo.org/ms/mzml"}
# Fixed column layout, so the DataFrame has these columns even when no
# MS2 precursor was found in any file.
record_columns = ["original_mz", "rounded_mz", "polarity", "filter_string", "source_file"]
all_records = []


def _first_cv_value(spectrum, accession):
    """Return the ``value`` attribute of the first cvParam with *accession*.

    The search covers all descendants of *spectrum*. Returns ``None`` when no
    such cvParam exists.
    """
    hits = spectrum.xpath(f'.//ns:cvParam[@accession="{accession}"]', namespaces=ns)
    return hits[0].attrib["value"] if hits else None


def _scan_polarity(spectrum, filter_string):
    """Return "POS", "NEG" or "UNKNOWN" for one spectrum.

    The "FTMS + " / "FTMS - " markers in the filter string are checked first.
    When neither is present, the PSI-MS scan polarity terms are used instead:
    MS:1000130 (positive scan) and MS:1000129 (negative scan).
    """
    if "FTMS + " in filter_string:
        return "POS"
    if "FTMS - " in filter_string:
        return "NEG"
    if spectrum.xpath('.//ns:cvParam[@accession="MS:1000130"]', namespaces=ns):
        return "POS"
    if spectrum.xpath('.//ns:cvParam[@accession="MS:1000129"]', namespaces=ns):
        return "NEG"
    return "UNKNOWN"


# === Process each mzML file ===
for mzml_path in mzml_files:
    print(f"üîç Parsing: {mzml_path.name}")
    try:
        tree = etree.parse(str(mzml_path))

        for spectrum in tree.xpath("//ns:spectrum", namespaces=ns):
            # Keep only MS2 spectra (MS:1000511 carries the MS level)
            if _first_cv_value(spectrum, "MS:1000511") != "2":
                continue

            # Filter string (MS:1000512); empty when the file does not provide one
            filter_string = _first_cv_value(spectrum, "MS:1000512") or ""

            # Precursor m/z (MS:1000744); spectra without it are skipped
            precursor_value = _first_cv_value(spectrum, "MS:1000744")
            if precursor_value is None:
                continue

            mz = float(precursor_value)
            all_records.append({
                "original_mz": mz,
                "rounded_mz": round(mz, 4),
                "polarity": _scan_polarity(spectrum, filter_string),
                "filter_string": filter_string,
                "source_file": mzml_path.name,
            })

    except Exception as e:
        # Best effort: report the file and move on. Records collected from the
        # file before the failure are kept.
        print(f"‚ö†Ô∏è Failed to process {mzml_path.name}: {e}")
        continue

# === Save to DataFrame ===
# Passing the columns explicitly keeps drop_duplicates/sort_values from
# raising KeyError when all_records is empty.
df = pd.DataFrame(all_records, columns=record_columns)
df_unique = df.drop_duplicates(subset=["rounded_mz", "polarity"]).sort_values("rounded_mz")

# Save final CSV
df_unique.to_csv(out_file, index=False)

print(f"\n‚úÖ Done. Extracted {len(df_unique)} unique precursor m/z values from {len(mzml_files)} files.")
print(f"üìÑ Saved to: {out_file}")


In [None]:
import pandas as pd

# === Load data ===
# The suspect list is read as text; only its adduct m/z columns are made numeric below.
norman_df = pd.read_csv("Curated_Suspect_List_mz.csv", dtype=str)
precursor_df = pd.read_csv("All_MSMS_Precursors_deduplicated_based_on_filter.csv")

# Coerce the m/z columns to numbers; cells that cannot be parsed become NaN
norman_df = norman_df.assign(
    **{
        adduct: pd.to_numeric(norman_df[adduct], errors="coerce")
        for adduct in ("M+H", "M-H")
    }
)

# Same coercion for the precursor table, then discard rows left without an m/z
precursor_df = precursor_df.assign(
    rounded_mz=pd.to_numeric(precursor_df["rounded_mz"], errors="coerce")
).dropna(subset=["rounded_mz"])

# Function to filter NORMAN list based on polarity and m/z tolerance
def filter_by_precursors(precursor_df, norman_df, ppm_tol=5):
    """Return the suspect-list rows whose adduct m/z matches a precursor.

    Each row of ``precursor_df`` (columns ``rounded_mz``, ``polarity``,
    ``source_file``) is compared with the ``M+H`` column of ``norman_df`` when
    its polarity is "POS", or with ``M-H`` when it is "NEG". Rows with any
    other polarity are skipped. The ppm error is computed relative to the
    suspect-list m/z, and a suspect matches when it is ``<= ppm_tol``.

    Returns a DataFrame with one row per (precursor, matching suspect) pair:
    the suspect's columns plus ``mz_diff``, ``ppm``, ``precursor_mz``,
    ``precursor_file`` and ``precursor_polarity``. Returns an empty,
    column-less DataFrame when nothing matches.
    """
    adduct_column = {"POS": "M+H", "NEG": "M-H"}
    # Suspects usable for each polarity. Built on first use and then reused,
    # so the suspect table is not re-filtered and copied for every precursor.
    candidates = {}
    matches = []

    for _, row in precursor_df.iterrows():
        polarity = row["polarity"]
        column = adduct_column.get(polarity)
        if column is None:
            continue

        if polarity not in candidates:
            candidates[polarity] = norman_df.dropna(subset=[column])
        db_subset = candidates[polarity]

        exp_mz = row["rounded_mz"]
        mz_diff = (db_subset[column] - exp_mz).abs()
        ppm = (mz_diff / db_subset[column]) * 1e6
        within_tol = ppm <= ppm_tol
        if not within_tol.any():
            continue

        # Only the matched rows are copied and annotated.
        matched = db_subset[within_tol].copy()
        matched["mz_diff"] = mz_diff[within_tol]
        matched["ppm"] = ppm[within_tol]
        matched["precursor_mz"] = exp_mz
        matched["precursor_file"] = row["source_file"]
        matched["precursor_polarity"] = polarity
        matches.append(matched)

    if matches:
        return pd.concat(matches, ignore_index=True)
    return pd.DataFrame()

# Apply matching
filtered_matches = filter_by_precursors(precursor_df, norman_df)

# Save to CSV before the preview, so the result is persisted even if the
# preview step cannot run.
filtered_matches.to_csv("Filtered_Suspects_For_CFMi.csv", index=False)

# caas_jupyter_tools is not a standard package and may be missing from the
# environment running this notebook; fall back to a plain text preview then.
try:
    import caas_jupyter_tools as tools
except ImportError:
    print(filtered_matches.head())
else:
    tools.display_dataframe_to_user(name="Filtered Suspects", dataframe=filtered_matches)


In [None]:
# Save to CSV
# NOTE(review): writes the same `filtered_matches` frame to the same path as the
# save at the end of the previous cell, so this cell looks redundant — confirm
# before removing.
filtered_matches.to_csv("Filtered_Suspects_For_CFMi.csv", index=False)

In [None]:
# === Step 1: Load and deduplicate ===
df = pd.read_csv("Filtered_Suspects_For_CFMi.csv", dtype=str)

# One row per suspect (first occurrence wins). This file is the metadata
# lookup table for later cells, so it must stay unique on Norman_SusDat_ID.
df_dedup = df.drop_duplicates(subset="Norman_SusDat_ID")

# Save to new CSV
df_dedup.to_csv("Filtered_Suspects_For_CFMID_deduplicated_based_on_normanID.csv", index=False)
print(f"‚úÖ Deduplicated file saved. Rows kept: {len(df_dedup)}")

# The CFM-ID input lists are split by polarity BEFORE removing duplicate IDs.
# Splitting the already-deduplicated table would keep a suspect that matched in
# both polarities only in the polarity of its first row, silently dropping it
# from the other list.
id_and_smiles = ["Norman_SusDat_ID", "MS_Ready_SMILES"]

# === Step 2: Create norman_matched_PI.txt ===
df_pos = df[df["precursor_polarity"] == "POS"].drop_duplicates(subset="Norman_SusDat_ID")
df_pos[id_and_smiles].to_csv(
    "norman_matched_PI.txt", sep="\t", header=False, index=False
)
print(f"‚úÖ POS SMILES file saved. Rows: {len(df_pos)}")

# === Step 3: Create norman_matched_NI.txt ===
df_neg = df[df["precursor_polarity"] == "NEG"].drop_duplicates(subset="Norman_SusDat_ID")
df_neg[id_and_smiles].to_csv(
    "norman_matched_NI.txt", sep="\t", header=False, index=False
)
print(f"‚úÖ NEG SMILES file saved. Rows: {len(df_neg)}")

Use norman_matched_PI.txt and norman_matched_NI.txt as input files to run CFM-ID to predict the spectra.

In [None]:
import os
import re
import pandas as pd

# === Configuration ===
input_folder = "cfmid_runner/PI"  # Folder with CFM-ID predicted spectra (positive mode)
output_msp = "cfmid_predicted_10eV_PI.msp"
energy_level = "energy0"  # "energy0" = 10 eV, "energy1" = 20 eV, "energy2" = 40 eV
curated_csv = "Filtered_Suspects_For_CFMID_deduplicated_based_on_normanID.csv"
ion_mode = "P"  # Value written to the MSP "Ion_mode" field

# === Load NORMAN Curated Data ===
norman_df = pd.read_csv(curated_csv, dtype=str)
norman_df = norman_df.set_index("Norman_SusDat_ID")  # Use correct ID column for matching

# Header key (text before the first "=") -> metadata field it fills.
_HEADER_FIELDS = {"#ID": "Name", "#SMILES": "SMILES", "#InChiKey": "InChIKey", "#Formula": "Formula"}


def _parse_cfmid_file(path, wanted_energy):
    """Read one CFM-ID prediction file.

    Returns ``(meta, peaks)``. ``meta`` holds the header fields (``None`` when
    a field is absent) and ``peaks`` lists the ``(mz, intensity)`` tuples found
    in the ``wanted_energy`` section.
    """
    meta = {"Name": None, "SMILES": None, "InChIKey": None, "Formula": None, "PrecursorMZ": None}
    peaks = []
    current_energy = None

    with open(path, "r", encoding="utf-8") as handle:
        for line in map(str.strip, handle):
            if not line:
                # A blank line closes the current energy section. Without this,
                # digit-leading rows that follow the spectra (CFM-ID's fragment
                # table) would be read as peaks of the last energy level.
                current_energy = None
            elif line.startswith("#PMass="):
                try:
                    meta["PrecursorMZ"] = float(line.split("=", 1)[1])
                except ValueError:
                    pass  # stays None; the MSP PrecursorMZ field is then left empty
            elif line.startswith("#"):
                key, sep, value = line.partition("=")
                if sep and key in _HEADER_FIELDS:
                    meta[_HEADER_FIELDS[key]] = value.strip()
            elif line.lower().startswith("energy"):
                current_energy = line.lower()
            elif current_energy == wanted_energy and re.match(r"^\d", line):
                try:
                    mz, intensity = map(float, line.split()[:2])
                except ValueError:
                    continue  # not an "mz intensity" row
                peaks.append((mz, intensity))

    return meta, peaks


def _curated_or(extra, column, fallback):
    """Return the curated value for *column*, or *fallback* when it is absent or NaN."""
    value = extra.get(column)
    if value is None or pd.isna(value):
        return fallback
    return value


entries = []

# === Parse Each CFMiD Prediction File ===
# sorted() makes the order of spectra in the MSP reproducible between runs.
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("NS"):
        continue

    norman_id = filename.replace(".txt", "")
    meta, peaks = _parse_cfmid_file(os.path.join(input_folder, filename), energy_level)
    if not peaks:
        continue

    # Match metadata from curated list
    extra = norman_df.loc[norman_id] if norman_id in norman_df.index else {}
    if isinstance(extra, pd.DataFrame):
        extra = extra.iloc[0]  # ID occurs more than once in the curated list: use its first row

    precursor_mz = meta["PrecursorMZ"]
    entry = [
        f"Name: {_curated_or(extra, 'Name', meta['Name'] or norman_id)}",
        f"PrecursorMZ: {precursor_mz:.5f}" if precursor_mz else "PrecursorMZ: ",
        f"Formula: {_curated_or(extra, 'Formula', meta['Formula'] or '')}",
        f"SMILES: {_curated_or(extra, 'Canonical_SMILES', meta['SMILES'] or '')}",
        f"InChI: {_curated_or(extra, 'StdInChI', '')}",
        f"InChIKey: {_curated_or(extra, 'StdInChIKey', meta['InChIKey'] or '')}",
        f"ExactMass: {_curated_or(extra, 'ExactMass', '')}",
        f"CAS#: {_curated_or(extra, 'CAS Number', '')}",
        "Spectrum_type: MS2",
        f"Ion_mode: {ion_mode}",
        f"Num Peaks: {len(peaks)}",
    ]
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)
    entries.append("\n".join(entry))

# === Save MSP File ===
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"‚úÖ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level} eV")


In [None]:
import os
import re
import pandas as pd

# === Configuration ===
input_folder = "cfmid_runner/PI"  # Folder with CFM-ID predicted spectra (positive mode)
output_msp = "cfmid_predicted_20eV_PI.msp"
energy_level = "energy1"  # "energy0" = 10 eV, "energy1" = 20 eV, "energy2" = 40 eV
curated_csv = "Filtered_Suspects_For_CFMID_deduplicated_based_on_normanID.csv"
ion_mode = "P"  # Value written to the MSP "Ion_mode" field

# === Load NORMAN Curated Data ===
norman_df = pd.read_csv(curated_csv, dtype=str)
norman_df = norman_df.set_index("Norman_SusDat_ID")  # Use correct ID column for matching

# Header key (text before the first "=") -> metadata field it fills.
_HEADER_FIELDS = {"#ID": "Name", "#SMILES": "SMILES", "#InChiKey": "InChIKey", "#Formula": "Formula"}


def _parse_cfmid_file(path, wanted_energy):
    """Read one CFM-ID prediction file.

    Returns ``(meta, peaks)``. ``meta`` holds the header fields (``None`` when
    a field is absent) and ``peaks`` lists the ``(mz, intensity)`` tuples found
    in the ``wanted_energy`` section.
    """
    meta = {"Name": None, "SMILES": None, "InChIKey": None, "Formula": None, "PrecursorMZ": None}
    peaks = []
    current_energy = None

    with open(path, "r", encoding="utf-8") as handle:
        for line in map(str.strip, handle):
            if not line:
                # A blank line closes the current energy section. Without this,
                # digit-leading rows that follow the spectra (CFM-ID's fragment
                # table) would be read as peaks of the last energy level.
                current_energy = None
            elif line.startswith("#PMass="):
                try:
                    meta["PrecursorMZ"] = float(line.split("=", 1)[1])
                except ValueError:
                    pass  # stays None; the MSP PrecursorMZ field is then left empty
            elif line.startswith("#"):
                key, sep, value = line.partition("=")
                if sep and key in _HEADER_FIELDS:
                    meta[_HEADER_FIELDS[key]] = value.strip()
            elif line.lower().startswith("energy"):
                current_energy = line.lower()
            elif current_energy == wanted_energy and re.match(r"^\d", line):
                try:
                    mz, intensity = map(float, line.split()[:2])
                except ValueError:
                    continue  # not an "mz intensity" row
                peaks.append((mz, intensity))

    return meta, peaks


def _curated_or(extra, column, fallback):
    """Return the curated value for *column*, or *fallback* when it is absent or NaN."""
    value = extra.get(column)
    if value is None or pd.isna(value):
        return fallback
    return value


entries = []

# === Parse Each CFMiD Prediction File ===
# sorted() makes the order of spectra in the MSP reproducible between runs.
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("NS"):
        continue

    norman_id = filename.replace(".txt", "")
    meta, peaks = _parse_cfmid_file(os.path.join(input_folder, filename), energy_level)
    if not peaks:
        continue

    # Match metadata from curated list
    extra = norman_df.loc[norman_id] if norman_id in norman_df.index else {}
    if isinstance(extra, pd.DataFrame):
        extra = extra.iloc[0]  # ID occurs more than once in the curated list: use its first row

    precursor_mz = meta["PrecursorMZ"]
    entry = [
        f"Name: {_curated_or(extra, 'Name', meta['Name'] or norman_id)}",
        f"PrecursorMZ: {precursor_mz:.5f}" if precursor_mz else "PrecursorMZ: ",
        f"Formula: {_curated_or(extra, 'Formula', meta['Formula'] or '')}",
        f"SMILES: {_curated_or(extra, 'Canonical_SMILES', meta['SMILES'] or '')}",
        f"InChI: {_curated_or(extra, 'StdInChI', '')}",
        f"InChIKey: {_curated_or(extra, 'StdInChIKey', meta['InChIKey'] or '')}",
        f"ExactMass: {_curated_or(extra, 'ExactMass', '')}",
        f"CAS#: {_curated_or(extra, 'CAS Number', '')}",
        "Spectrum_type: MS2",
        f"Ion_mode: {ion_mode}",
        f"Num Peaks: {len(peaks)}",
    ]
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)
    entries.append("\n".join(entry))

# === Save MSP File ===
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"‚úÖ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level} eV")


In [None]:
import os
import re
import pandas as pd

# === Configuration ===
input_folder = "cfmid_runner/PI"  # Folder with CFM-ID predicted spectra (positive mode)
output_msp = "cfmid_predicted_40eV_PI.msp"
energy_level = "energy2"  # "energy0" = 10 eV, "energy1" = 20 eV, "energy2" = 40 eV
curated_csv = "Filtered_Suspects_For_CFMID_deduplicated_based_on_normanID.csv"
ion_mode = "P"  # Value written to the MSP "Ion_mode" field

# === Load NORMAN Curated Data ===
norman_df = pd.read_csv(curated_csv, dtype=str)
norman_df = norman_df.set_index("Norman_SusDat_ID")  # Use correct ID column for matching

# Header key (text before the first "=") -> metadata field it fills.
_HEADER_FIELDS = {"#ID": "Name", "#SMILES": "SMILES", "#InChiKey": "InChIKey", "#Formula": "Formula"}


def _parse_cfmid_file(path, wanted_energy):
    """Read one CFM-ID prediction file.

    Returns ``(meta, peaks)``. ``meta`` holds the header fields (``None`` when
    a field is absent) and ``peaks`` lists the ``(mz, intensity)`` tuples found
    in the ``wanted_energy`` section.
    """
    meta = {"Name": None, "SMILES": None, "InChIKey": None, "Formula": None, "PrecursorMZ": None}
    peaks = []
    current_energy = None

    with open(path, "r", encoding="utf-8") as handle:
        for line in map(str.strip, handle):
            if not line:
                # A blank line closes the current energy section. Without this,
                # digit-leading rows that follow the spectra (CFM-ID's fragment
                # table) would be read as peaks of the last energy level, which
                # is the level this cell exports.
                current_energy = None
            elif line.startswith("#PMass="):
                try:
                    meta["PrecursorMZ"] = float(line.split("=", 1)[1])
                except ValueError:
                    pass  # stays None; the MSP PrecursorMZ field is then left empty
            elif line.startswith("#"):
                key, sep, value = line.partition("=")
                if sep and key in _HEADER_FIELDS:
                    meta[_HEADER_FIELDS[key]] = value.strip()
            elif line.lower().startswith("energy"):
                current_energy = line.lower()
            elif current_energy == wanted_energy and re.match(r"^\d", line):
                try:
                    mz, intensity = map(float, line.split()[:2])
                except ValueError:
                    continue  # not an "mz intensity" row
                peaks.append((mz, intensity))

    return meta, peaks


def _curated_or(extra, column, fallback):
    """Return the curated value for *column*, or *fallback* when it is absent or NaN."""
    value = extra.get(column)
    if value is None or pd.isna(value):
        return fallback
    return value


entries = []

# === Parse Each CFMiD Prediction File ===
# sorted() makes the order of spectra in the MSP reproducible between runs.
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("NS"):
        continue

    norman_id = filename.replace(".txt", "")
    meta, peaks = _parse_cfmid_file(os.path.join(input_folder, filename), energy_level)
    if not peaks:
        continue

    # Match metadata from curated list
    extra = norman_df.loc[norman_id] if norman_id in norman_df.index else {}
    if isinstance(extra, pd.DataFrame):
        extra = extra.iloc[0]  # ID occurs more than once in the curated list: use its first row

    precursor_mz = meta["PrecursorMZ"]
    entry = [
        f"Name: {_curated_or(extra, 'Name', meta['Name'] or norman_id)}",
        f"PrecursorMZ: {precursor_mz:.5f}" if precursor_mz else "PrecursorMZ: ",
        f"Formula: {_curated_or(extra, 'Formula', meta['Formula'] or '')}",
        f"SMILES: {_curated_or(extra, 'Canonical_SMILES', meta['SMILES'] or '')}",
        f"InChI: {_curated_or(extra, 'StdInChI', '')}",
        f"InChIKey: {_curated_or(extra, 'StdInChIKey', meta['InChIKey'] or '')}",
        f"ExactMass: {_curated_or(extra, 'ExactMass', '')}",
        f"CAS#: {_curated_or(extra, 'CAS Number', '')}",
        "Spectrum_type: MS2",
        f"Ion_mode: {ion_mode}",
        f"Num Peaks: {len(peaks)}",
    ]
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)
    entries.append("\n".join(entry))

# === Save MSP File ===
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"‚úÖ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level} eV")


In [None]:
import os
import re
import pandas as pd

# === Configuration ===
input_folder = "cfmid_runner/NI"  # Folder with CFM-ID predicted spectra (negative mode)
output_msp = "cfmid_predicted_10eV_NI.msp"
energy_level = "energy0"  # "energy0" = 10 eV, "energy1" = 20 eV, "energy2" = 40 eV
curated_csv = "Filtered_Suspects_For_CFMID_deduplicated_based_on_normanID.csv"
ion_mode = "N"  # Value written to the MSP "Ion_mode" field

# === Load NORMAN Curated Data ===
norman_df = pd.read_csv(curated_csv, dtype=str)
norman_df = norman_df.set_index("Norman_SusDat_ID")  # Use correct ID column for matching

# Header key (text before the first "=") -> metadata field it fills.
_HEADER_FIELDS = {"#ID": "Name", "#SMILES": "SMILES", "#InChiKey": "InChIKey", "#Formula": "Formula"}


def _parse_cfmid_file(path, wanted_energy):
    """Read one CFM-ID prediction file.

    Returns ``(meta, peaks)``. ``meta`` holds the header fields (``None`` when
    a field is absent) and ``peaks`` lists the ``(mz, intensity)`` tuples found
    in the ``wanted_energy`` section.
    """
    meta = {"Name": None, "SMILES": None, "InChIKey": None, "Formula": None, "PrecursorMZ": None}
    peaks = []
    current_energy = None

    with open(path, "r", encoding="utf-8") as handle:
        for line in map(str.strip, handle):
            if not line:
                # A blank line closes the current energy section. Without this,
                # digit-leading rows that follow the spectra (CFM-ID's fragment
                # table) would be read as peaks of the last energy level.
                current_energy = None
            elif line.startswith("#PMass="):
                try:
                    meta["PrecursorMZ"] = float(line.split("=", 1)[1])
                except ValueError:
                    pass  # stays None; the MSP PrecursorMZ field is then left empty
            elif line.startswith("#"):
                key, sep, value = line.partition("=")
                if sep and key in _HEADER_FIELDS:
                    meta[_HEADER_FIELDS[key]] = value.strip()
            elif line.lower().startswith("energy"):
                current_energy = line.lower()
            elif current_energy == wanted_energy and re.match(r"^\d", line):
                try:
                    mz, intensity = map(float, line.split()[:2])
                except ValueError:
                    continue  # not an "mz intensity" row
                peaks.append((mz, intensity))

    return meta, peaks


def _curated_or(extra, column, fallback):
    """Return the curated value for *column*, or *fallback* when it is absent or NaN."""
    value = extra.get(column)
    if value is None or pd.isna(value):
        return fallback
    return value


entries = []

# === Parse Each CFMiD Prediction File ===
# sorted() makes the order of spectra in the MSP reproducible between runs.
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("NS"):
        continue

    norman_id = filename.replace(".txt", "")
    meta, peaks = _parse_cfmid_file(os.path.join(input_folder, filename), energy_level)
    if not peaks:
        continue

    # Match metadata from curated list
    extra = norman_df.loc[norman_id] if norman_id in norman_df.index else {}
    if isinstance(extra, pd.DataFrame):
        extra = extra.iloc[0]  # ID occurs more than once in the curated list: use its first row

    precursor_mz = meta["PrecursorMZ"]
    entry = [
        f"Name: {_curated_or(extra, 'Name', meta['Name'] or norman_id)}",
        f"PrecursorMZ: {precursor_mz:.5f}" if precursor_mz else "PrecursorMZ: ",
        f"Formula: {_curated_or(extra, 'Formula', meta['Formula'] or '')}",
        f"SMILES: {_curated_or(extra, 'Canonical_SMILES', meta['SMILES'] or '')}",
        f"InChI: {_curated_or(extra, 'StdInChI', '')}",
        f"InChIKey: {_curated_or(extra, 'StdInChIKey', meta['InChIKey'] or '')}",
        f"ExactMass: {_curated_or(extra, 'ExactMass', '')}",
        f"CAS#: {_curated_or(extra, 'CAS Number', '')}",
        "Spectrum_type: MS2",
        f"Ion_mode: {ion_mode}",
        f"Num Peaks: {len(peaks)}",
    ]
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)
    entries.append("\n".join(entry))

# === Save MSP File ===
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"‚úÖ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level} eV")


In [None]:
import os
import re
import pandas as pd

# === Configuration ===
input_folder = "cfmid_runner/NI"  # Folder with CFM-ID predicted spectra (negative mode)
output_msp = "cfmid_predicted_20eV_NI.msp"
energy_level = "energy1"  # "energy0" = 10 eV, "energy1" = 20 eV, "energy2" = 40 eV
curated_csv = "Filtered_Suspects_For_CFMID_deduplicated_based_on_normanID.csv"
ion_mode = "N"  # Value written to the MSP "Ion_mode" field

# === Load NORMAN Curated Data ===
norman_df = pd.read_csv(curated_csv, dtype=str)
norman_df = norman_df.set_index("Norman_SusDat_ID")  # Use correct ID column for matching

# Header key (text before the first "=") -> metadata field it fills.
_HEADER_FIELDS = {"#ID": "Name", "#SMILES": "SMILES", "#InChiKey": "InChIKey", "#Formula": "Formula"}


def _parse_cfmid_file(path, wanted_energy):
    """Read one CFM-ID prediction file.

    Returns ``(meta, peaks)``. ``meta`` holds the header fields (``None`` when
    a field is absent) and ``peaks`` lists the ``(mz, intensity)`` tuples found
    in the ``wanted_energy`` section.
    """
    meta = {"Name": None, "SMILES": None, "InChIKey": None, "Formula": None, "PrecursorMZ": None}
    peaks = []
    current_energy = None

    with open(path, "r", encoding="utf-8") as handle:
        for line in map(str.strip, handle):
            if not line:
                # A blank line closes the current energy section. Without this,
                # digit-leading rows that follow the spectra (CFM-ID's fragment
                # table) would be read as peaks of the last energy level.
                current_energy = None
            elif line.startswith("#PMass="):
                try:
                    meta["PrecursorMZ"] = float(line.split("=", 1)[1])
                except ValueError:
                    pass  # stays None; the MSP PrecursorMZ field is then left empty
            elif line.startswith("#"):
                key, sep, value = line.partition("=")
                if sep and key in _HEADER_FIELDS:
                    meta[_HEADER_FIELDS[key]] = value.strip()
            elif line.lower().startswith("energy"):
                current_energy = line.lower()
            elif current_energy == wanted_energy and re.match(r"^\d", line):
                try:
                    mz, intensity = map(float, line.split()[:2])
                except ValueError:
                    continue  # not an "mz intensity" row
                peaks.append((mz, intensity))

    return meta, peaks


def _curated_or(extra, column, fallback):
    """Return the curated value for *column*, or *fallback* when it is absent or NaN."""
    value = extra.get(column)
    if value is None or pd.isna(value):
        return fallback
    return value


entries = []

# === Parse Each CFMiD Prediction File ===
# sorted() makes the order of spectra in the MSP reproducible between runs.
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("NS"):
        continue

    norman_id = filename.replace(".txt", "")
    meta, peaks = _parse_cfmid_file(os.path.join(input_folder, filename), energy_level)
    if not peaks:
        continue

    # Match metadata from curated list
    extra = norman_df.loc[norman_id] if norman_id in norman_df.index else {}
    if isinstance(extra, pd.DataFrame):
        extra = extra.iloc[0]  # ID occurs more than once in the curated list: use its first row

    precursor_mz = meta["PrecursorMZ"]
    entry = [
        f"Name: {_curated_or(extra, 'Name', meta['Name'] or norman_id)}",
        f"PrecursorMZ: {precursor_mz:.5f}" if precursor_mz else "PrecursorMZ: ",
        f"Formula: {_curated_or(extra, 'Formula', meta['Formula'] or '')}",
        f"SMILES: {_curated_or(extra, 'Canonical_SMILES', meta['SMILES'] or '')}",
        f"InChI: {_curated_or(extra, 'StdInChI', '')}",
        f"InChIKey: {_curated_or(extra, 'StdInChIKey', meta['InChIKey'] or '')}",
        f"ExactMass: {_curated_or(extra, 'ExactMass', '')}",
        f"CAS#: {_curated_or(extra, 'CAS Number', '')}",
        "Spectrum_type: MS2",
        f"Ion_mode: {ion_mode}",
        f"Num Peaks: {len(peaks)}",
    ]
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)
    entries.append("\n".join(entry))

# === Save MSP File ===
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"‚úÖ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level} eV")


In [None]:
import os
import re
import pandas as pd

# === Configuration ===
input_folder = "cfmid_runner/NI"  # Folder with CFM-ID predicted spectra (negative mode)
output_msp = "cfmid_predicted_40eV_NI.msp"
energy_level = "energy2"  # "energy0" = 10 eV, "energy1" = 20 eV, "energy2" = 40 eV
curated_csv = "Filtered_Suspects_For_CFMID_deduplicated_based_on_normanID.csv"
ion_mode = "N"  # Value written to the MSP "Ion_mode" field

# === Load NORMAN Curated Data ===
norman_df = pd.read_csv(curated_csv, dtype=str)
norman_df = norman_df.set_index("Norman_SusDat_ID")  # Use correct ID column for matching

# Header key (text before the first "=") -> metadata field it fills.
_HEADER_FIELDS = {"#ID": "Name", "#SMILES": "SMILES", "#InChiKey": "InChIKey", "#Formula": "Formula"}


def _parse_cfmid_file(path, wanted_energy):
    """Read one CFM-ID prediction file.

    Returns ``(meta, peaks)``. ``meta`` holds the header fields (``None`` when
    a field is absent) and ``peaks`` lists the ``(mz, intensity)`` tuples found
    in the ``wanted_energy`` section.
    """
    meta = {"Name": None, "SMILES": None, "InChIKey": None, "Formula": None, "PrecursorMZ": None}
    peaks = []
    current_energy = None

    with open(path, "r", encoding="utf-8") as handle:
        for line in map(str.strip, handle):
            if not line:
                # A blank line closes the current energy section. Without this,
                # digit-leading rows that follow the spectra (CFM-ID's fragment
                # table) would be read as peaks of the last energy level, which
                # is the level this cell exports.
                current_energy = None
            elif line.startswith("#PMass="):
                try:
                    meta["PrecursorMZ"] = float(line.split("=", 1)[1])
                except ValueError:
                    pass  # stays None; the MSP PrecursorMZ field is then left empty
            elif line.startswith("#"):
                key, sep, value = line.partition("=")
                if sep and key in _HEADER_FIELDS:
                    meta[_HEADER_FIELDS[key]] = value.strip()
            elif line.lower().startswith("energy"):
                current_energy = line.lower()
            elif current_energy == wanted_energy and re.match(r"^\d", line):
                try:
                    mz, intensity = map(float, line.split()[:2])
                except ValueError:
                    continue  # not an "mz intensity" row
                peaks.append((mz, intensity))

    return meta, peaks


def _curated_or(extra, column, fallback):
    """Return the curated value for *column*, or *fallback* when it is absent or NaN."""
    value = extra.get(column)
    if value is None or pd.isna(value):
        return fallback
    return value


entries = []

# === Parse Each CFMiD Prediction File ===
# sorted() makes the order of spectra in the MSP reproducible between runs.
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("NS"):
        continue

    norman_id = filename.replace(".txt", "")
    meta, peaks = _parse_cfmid_file(os.path.join(input_folder, filename), energy_level)
    if not peaks:
        continue

    # Match metadata from curated list
    extra = norman_df.loc[norman_id] if norman_id in norman_df.index else {}
    if isinstance(extra, pd.DataFrame):
        extra = extra.iloc[0]  # ID occurs more than once in the curated list: use its first row

    precursor_mz = meta["PrecursorMZ"]
    entry = [
        f"Name: {_curated_or(extra, 'Name', meta['Name'] or norman_id)}",
        f"PrecursorMZ: {precursor_mz:.5f}" if precursor_mz else "PrecursorMZ: ",
        f"Formula: {_curated_or(extra, 'Formula', meta['Formula'] or '')}",
        f"SMILES: {_curated_or(extra, 'Canonical_SMILES', meta['SMILES'] or '')}",
        f"InChI: {_curated_or(extra, 'StdInChI', '')}",
        f"InChIKey: {_curated_or(extra, 'StdInChIKey', meta['InChIKey'] or '')}",
        f"ExactMass: {_curated_or(extra, 'ExactMass', '')}",
        f"CAS#: {_curated_or(extra, 'CAS Number', '')}",
        "Spectrum_type: MS2",
        f"Ion_mode: {ion_mode}",
        f"Num Peaks: {len(peaks)}",
    ]
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)
    entries.append("\n".join(entry))

# === Save MSP File ===
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"‚úÖ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level} eV")


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd

# === Load curated suspect list ===
input_csv = "Filtered_Suspects_For_CFMID_deduplicated_based_on_normanID.csv"
output_csv = "Filtered_Suspects_For_CFMID_deduplicated_based_on_normanID_with_MOL.csv"
df = pd.read_csv(input_csv, dtype=str)

# Make sure SMILES exist
df = df.dropna(subset=["Canonical_SMILES"])


def _smiles_to_mol_block(smi):
    """Return a MOL block with 2D coordinates for *smi*, or "" when conversion fails."""
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            return ""

        # Generate 2D coordinates (needed for visualization)
        AllChem.Compute2DCoords(mol)
        return Chem.MolToMolBlock(mol)
    except Exception:
        # Best effort: one unconvertible structure must not abort the whole
        # table. Unlike a bare except, this lets KeyboardInterrupt through.
        return ""


# === Generate MOL blocks from SMILES ===
mol_blocks = [_smiles_to_mol_block(smi) for smi in df["Canonical_SMILES"]]

# Add new column
df["MOL"] = mol_blocks

# Save new version
df.to_csv(output_csv, index=False)
# Report the file that was actually written.
print(f"‚úÖ MOL structures added and saved to {output_csv}")


In [None]:
import os
import re
import pandas as pd

# === Configuration ===
# NOTE(review): the other PFAS export cell in this notebook reads the same
# input_folder and writes the same output_msp but with Ion_mode N, so whichever
# cell runs last overwrites the other's file. Confirm which polarity the
# predictions in this folder belong to and give the two outputs distinct names.
input_folder = "pfas specific database/myout"  # Folder with CFM-ID predicted spectra
output_msp = "cfmid_predicted_norman_pfas_10ev.msp"
energy_level = "energy0"  # "energy0" = 10 eV, "energy1" = 20 eV, "energy2" = 40 eV
curated_csv = "pfas specific database/norman_curated_pfas.csv"
ion_mode = "P"  # Value written to the MSP "Ion_mode" field

# === Load NORMAN Curated Data ===
norman_df = pd.read_csv(curated_csv, dtype=str)
norman_df = norman_df.set_index("Norman_SusDat_ID")  # Use correct ID column for matching

# Header key (text before the first "=") -> metadata field it fills.
_HEADER_FIELDS = {"#ID": "Name", "#SMILES": "SMILES", "#InChiKey": "InChIKey", "#Formula": "Formula"}


def _parse_cfmid_file(path, wanted_energy):
    """Read one CFM-ID prediction file.

    Returns ``(meta, peaks)``. ``meta`` holds the header fields (``None`` when
    a field is absent) and ``peaks`` lists the ``(mz, intensity)`` tuples found
    in the ``wanted_energy`` section.
    """
    meta = {"Name": None, "SMILES": None, "InChIKey": None, "Formula": None, "PrecursorMZ": None}
    peaks = []
    current_energy = None

    with open(path, "r", encoding="utf-8") as handle:
        for line in map(str.strip, handle):
            if not line:
                # A blank line closes the current energy section. Without this,
                # digit-leading rows that follow the spectra (CFM-ID's fragment
                # table) would be read as peaks of the last energy level.
                current_energy = None
            elif line.startswith("#PMass="):
                try:
                    meta["PrecursorMZ"] = float(line.split("=", 1)[1])
                except ValueError:
                    pass  # stays None; the MSP PrecursorMZ field is then left empty
            elif line.startswith("#"):
                key, sep, value = line.partition("=")
                if sep and key in _HEADER_FIELDS:
                    meta[_HEADER_FIELDS[key]] = value.strip()
            elif line.lower().startswith("energy"):
                current_energy = line.lower()
            elif current_energy == wanted_energy and re.match(r"^\d", line):
                try:
                    mz, intensity = map(float, line.split()[:2])
                except ValueError:
                    continue  # not an "mz intensity" row
                peaks.append((mz, intensity))

    return meta, peaks


def _curated_or(extra, column, fallback):
    """Return the curated value for *column*, or *fallback* when it is absent or NaN."""
    value = extra.get(column)
    if value is None or pd.isna(value):
        return fallback
    return value


entries = []

# === Parse Each CFMiD Prediction File ===
# sorted() makes the order of spectra in the MSP reproducible between runs.
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("NS"):
        continue

    norman_id = filename.replace(".txt", "")
    meta, peaks = _parse_cfmid_file(os.path.join(input_folder, filename), energy_level)
    if not peaks:
        continue

    # Match metadata from curated list
    extra = norman_df.loc[norman_id] if norman_id in norman_df.index else {}
    if isinstance(extra, pd.DataFrame):
        extra = extra.iloc[0]  # ID occurs more than once in the curated list: use its first row

    precursor_mz = meta["PrecursorMZ"]
    entry = [
        f"Name: {_curated_or(extra, 'Name', meta['Name'] or norman_id)}",
        f"PrecursorMZ: {precursor_mz:.5f}" if precursor_mz else "PrecursorMZ: ",
        f"Formula: {_curated_or(extra, 'Formula', meta['Formula'] or '')}",
        f"SMILES: {_curated_or(extra, 'Canonical_SMILES', meta['SMILES'] or '')}",
        f"InChI: {_curated_or(extra, 'StdInChI', '')}",
        f"InChIKey: {_curated_or(extra, 'StdInChIKey', meta['InChIKey'] or '')}",
        f"ExactMass: {_curated_or(extra, 'ExactMass', '')}",
        f"CAS#: {_curated_or(extra, 'CAS Number', '')}",
        "Spectrum_type: MS2",
        f"Ion_mode: {ion_mode}",
        f"Num Peaks: {len(peaks)}",
    ]
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)
    entries.append("\n".join(entry))

# === Save MSP File ===
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"‚úÖ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level} eV")


In [None]:
import os
import re
import pandas as pd

# === Configuration ===
# NOTE(review): the other PFAS export cell in this notebook reads the same
# input_folder and writes the same output_msp but with Ion_mode P, so whichever
# cell runs last overwrites the other's file. Confirm which polarity the
# predictions in this folder belong to and give the two outputs distinct names.
input_folder = "pfas specific database/myout"  # Folder with CFM-ID predicted spectra
output_msp = "cfmid_predicted_norman_pfas_10ev.msp"
energy_level = "energy0"  # "energy0" = 10 eV, "energy1" = 20 eV, "energy2" = 40 eV
curated_csv = "pfas specific database/norman_curated_pfas.csv"
ion_mode = "N"  # Value written to the MSP "Ion_mode" field

# === Load NORMAN Curated Data ===
norman_df = pd.read_csv(curated_csv, dtype=str)
norman_df = norman_df.set_index("Norman_SusDat_ID")  # Use correct ID column for matching

# Header key (text before the first "=") -> metadata field it fills.
_HEADER_FIELDS = {"#ID": "Name", "#SMILES": "SMILES", "#InChiKey": "InChIKey", "#Formula": "Formula"}


def _parse_cfmid_file(path, wanted_energy):
    """Read one CFM-ID prediction file.

    Returns ``(meta, peaks)``. ``meta`` holds the header fields (``None`` when
    a field is absent) and ``peaks`` lists the ``(mz, intensity)`` tuples found
    in the ``wanted_energy`` section.
    """
    meta = {"Name": None, "SMILES": None, "InChIKey": None, "Formula": None, "PrecursorMZ": None}
    peaks = []
    current_energy = None

    with open(path, "r", encoding="utf-8") as handle:
        for line in map(str.strip, handle):
            if not line:
                # A blank line closes the current energy section. Without this,
                # digit-leading rows that follow the spectra (CFM-ID's fragment
                # table) would be read as peaks of the last energy level.
                current_energy = None
            elif line.startswith("#PMass="):
                try:
                    meta["PrecursorMZ"] = float(line.split("=", 1)[1])
                except ValueError:
                    pass  # stays None; the MSP PrecursorMZ field is then left empty
            elif line.startswith("#"):
                key, sep, value = line.partition("=")
                if sep and key in _HEADER_FIELDS:
                    meta[_HEADER_FIELDS[key]] = value.strip()
            elif line.lower().startswith("energy"):
                current_energy = line.lower()
            elif current_energy == wanted_energy and re.match(r"^\d", line):
                try:
                    mz, intensity = map(float, line.split()[:2])
                except ValueError:
                    continue  # not an "mz intensity" row
                peaks.append((mz, intensity))

    return meta, peaks


def _curated_or(extra, column, fallback):
    """Return the curated value for *column*, or *fallback* when it is absent or NaN."""
    value = extra.get(column)
    if value is None or pd.isna(value):
        return fallback
    return value


entries = []

# === Parse Each CFMiD Prediction File ===
# sorted() makes the order of spectra in the MSP reproducible between runs.
for filename in sorted(os.listdir(input_folder)):
    if not filename.startswith("NS"):
        continue

    norman_id = filename.replace(".txt", "")
    meta, peaks = _parse_cfmid_file(os.path.join(input_folder, filename), energy_level)
    if not peaks:
        continue

    # Match metadata from curated list
    extra = norman_df.loc[norman_id] if norman_id in norman_df.index else {}
    if isinstance(extra, pd.DataFrame):
        extra = extra.iloc[0]  # ID occurs more than once in the curated list: use its first row

    precursor_mz = meta["PrecursorMZ"]
    entry = [
        f"Name: {_curated_or(extra, 'Name', meta['Name'] or norman_id)}",
        f"PrecursorMZ: {precursor_mz:.5f}" if precursor_mz else "PrecursorMZ: ",
        f"Formula: {_curated_or(extra, 'Formula', meta['Formula'] or '')}",
        f"SMILES: {_curated_or(extra, 'Canonical_SMILES', meta['SMILES'] or '')}",
        f"InChI: {_curated_or(extra, 'StdInChI', '')}",
        f"InChIKey: {_curated_or(extra, 'StdInChIKey', meta['InChIKey'] or '')}",
        f"ExactMass: {_curated_or(extra, 'ExactMass', '')}",
        f"CAS#: {_curated_or(extra, 'CAS Number', '')}",
        "Spectrum_type: MS2",
        f"Ion_mode: {ion_mode}",
        f"Num Peaks: {len(peaks)}",
    ]
    entry.extend(f"{mz:.5f}\t{intensity:.2f}" for mz, intensity in peaks)
    entries.append("\n".join(entry))

# === Save MSP File ===
with open(output_msp, "w", encoding="utf-8") as out_f:
    out_f.write("\n\n".join(entries))

print(f"‚úÖ Saved {len(entries)} predicted spectra to '{output_msp}' using {energy_level} eV")
