This script is built to run in Python 3.10.

Parse MS-DIAL's embedded MS/MS spectra. These are the experimental spectra that will be compared with the ones generated in silico by CFM-ID.

This script: 

✅ Loads your Aligned_MS1_Features_PI_MSDIAL_03.csv (positive mode) and, in the second half, Aligned_MS1_Features_NI_MSDIAL.csv (negative mode)
✅ Extracts Alignment ID, Precursor m/z, RT, and the MS/MS spectrum
✅ Converts the "mz:intensity mz:intensity ..." strings into usable lists
✅ Structures them into a Python list of dictionaries
✅ Saves it as .json, .csv (top 10 peaks per feature) and .msp files ready for downstream spectral matching

In [None]:
import pandas as pd
import json
import numpy as np
from matchms import Spectrum
from matchms.exporting import save_as_msp

In [None]:
# === Step 1: Load MS-DIAL feature table ===
# Positive-mode aligned feature table; every later step in this half of the
# notebook works on the resulting DataFrame `df`.
input_file = "Aligned_MS1_Features_PI_MSDIAL_03.csv"
df = pd.read_csv(filepath_or_buffer=input_file)

In [None]:
# === Step 2: Extract and parse MS/MS spectra ===
# One collection per output format, filled in lockstep so that all three
# always describe exactly the same set of features:
#   extracted_ms2   -> dicts with the full filtered peak list (written as JSON)
#   matchms_spectra -> matchms Spectrum objects (written as MSP)
#   csv_rows        -> dicts with the 10 most intense peaks (written as CSV)
extracted_ms2 = []
matchms_spectra = []
csv_rows = []

for idx, row in df.iterrows():
    spectrum_str = row.get("MS/MS spectrum")

    # Rows whose spectrum cell is not a string containing ":" carry no
    # parsable peaks and are skipped without a message.
    if not (isinstance(spectrum_str, str) and ":" in spectrum_str):
        continue

    try:
        # "mz:intensity mz:intensity ..." -> [(mz, intensity), ...]
        fragments = []
        for token in spectrum_str.strip().split(" "):
            if ":" not in token:
                continue
            fields = token.split(":")
            fragments.append((float(fields[0]), float(fields[1])))

        if not fragments:
            continue

        # Keep only peaks at or above 1 % of the most intense peak.
        max_int = max(i for _, i in fragments)
        filtered = [(mz, i) for mz, i in fragments if i >= 0.01 * max_int]

        feature_id = str(row["Alignment ID"])
        precursor_mz = float(row["Average Mz"])
        rt = float(row["Average Rt(min)"])

        # Record for JSON
        json_entry = {
            "Feature_ID": feature_id,
            "Precursor_mz": precursor_mz,
            "RT": rt,
            "MS2_peaks": filtered,
        }

        # Record for CSV (only top 10 peaks, most intense first)
        top_peaks = sorted(filtered, key=lambda x: x[1], reverse=True)[:10]
        csv_entry = {
            "Feature_ID": feature_id,
            "Precursor_mz": precursor_mz,
            "RT_min": rt,
            "Top10_MS2": "; ".join(
                f"{mz:.4f}:{intensity:.0f}" for mz, intensity in top_peaks
            ),
        }

        # Record for matchms; peaks are passed sorted by ascending m/z.
        filtered_sorted = sorted(filtered, key=lambda x: x[0])
        mz_array = np.array([mz for mz, _ in filtered_sorted])
        intensity_array = np.array([i for _, i in filtered_sorted])

        # NOTE(review): rt comes from the "Average Rt(min)" column, i.e. it is
        # in minutes -- confirm this is the unit expected for "retention_time".
        spec = Spectrum(
            mz=mz_array,
            intensities=intensity_array,
            metadata={
                "precursor_mz": precursor_mz,
                "feature_id": feature_id,
                "retention_time": rt,
                "ionmode": "positive",
                "compound_name": f"Feature_{feature_id}",
            },
        )

        # Append only after every record was built successfully. Appending
        # earlier would leave a row in the JSON/CSV lists but not in the MSP
        # list whenever Spectrum construction fails.
        extracted_ms2.append(json_entry)
        csv_rows.append(csv_entry)
        matchms_spectra.append(spec)

    except Exception as e:
        # Best-effort parsing: report the bad row and carry on with the rest.
        print(f"⚠️ Skipped row {idx} due to error: {e}")

In [None]:
from pathlib import Path

# === Step 3a: Save JSON ===
# The (mz, intensity) tuples in "MS2_peaks" are written as two-element arrays.
with open("Extracted_MS2_Spectra_PosMode.json", "w") as f:
    json.dump(extracted_ms2, f, indent=2)

# === Step 3b: Save CSV ===
csv_df = pd.DataFrame(csv_rows)
csv_df.to_csv("Extracted_MS2_Spectra_PosMode.csv", index=False)

# === Step 3c: Save MSP ===
# The JSON and CSV above are overwritten on every run, but matchms'
# save_as_msp appends to an existing file by default (NOTE(review): confirm
# for the installed matchms version). Remove any MSP left over from a previous
# run so that re-running this cell does not duplicate every spectrum.
msp_file = "Extracted_MS2_Spectra_PosMode.msp"
Path(msp_file).unlink(missing_ok=True)
save_as_msp(matchms_spectra, msp_file)

print(f"✅ Saved: {len(extracted_ms2)} spectra to:")
print("  - JSON: Extracted_MS2_Spectra_PosMode.json")
print("  - CSV : Extracted_MS2_Spectra_PosMode.csv")
print("  - MSP : Extracted_MS2_Spectra_PosMode.msp")


In [None]:
# Now repeat the same steps for the negative-mode feature table

In [None]:
import pandas as pd
import json
import numpy as np
from matchms import Spectrum
from matchms.exporting import save_as_msp

In [None]:
# === Step 1: Load MS-DIAL feature table ===
# Negative-mode aligned feature table; from here on `df` refers to this table
# and replaces the one loaded earlier in the notebook.
input_file = "Aligned_MS1_Features_NI_MSDIAL.csv"
df = pd.read_csv(filepath_or_buffer=input_file)

In [None]:
# === Step 2: Extract and parse MS/MS spectra ===
# One collection per output format, filled in lockstep so that all three
# always describe exactly the same set of features:
#   extracted_ms2   -> dicts with the full filtered peak list (written as JSON)
#   matchms_spectra -> matchms Spectrum objects (written as MSP)
#   csv_rows        -> dicts with the 10 most intense peaks (written as CSV)
extracted_ms2 = []
matchms_spectra = []
csv_rows = []

for idx, row in df.iterrows():
    spectrum_str = row.get("MS/MS spectrum")

    # Rows whose spectrum cell is not a string containing ":" carry no
    # parsable peaks and are skipped without a message.
    if not (isinstance(spectrum_str, str) and ":" in spectrum_str):
        continue

    try:
        # "mz:intensity mz:intensity ..." -> [(mz, intensity), ...]
        fragments = []
        for token in spectrum_str.strip().split(" "):
            if ":" not in token:
                continue
            fields = token.split(":")
            fragments.append((float(fields[0]), float(fields[1])))

        if not fragments:
            continue

        # Keep only peaks at or above 1 % of the most intense peak.
        max_int = max(i for _, i in fragments)
        filtered = [(mz, i) for mz, i in fragments if i >= 0.01 * max_int]

        feature_id = str(row["Alignment ID"])
        precursor_mz = float(row["Average Mz"])
        rt = float(row["Average Rt(min)"])

        # Record for JSON
        json_entry = {
            "Feature_ID": feature_id,
            "Precursor_mz": precursor_mz,
            "RT": rt,
            "MS2_peaks": filtered,
        }

        # Record for CSV (only top 10 peaks, most intense first)
        top_peaks = sorted(filtered, key=lambda x: x[1], reverse=True)[:10]
        csv_entry = {
            "Feature_ID": feature_id,
            "Precursor_mz": precursor_mz,
            "RT_min": rt,
            "Top10_MS2": "; ".join(
                f"{mz:.4f}:{intensity:.0f}" for mz, intensity in top_peaks
            ),
        }

        # Record for matchms; peaks are passed sorted by ascending m/z.
        filtered_sorted = sorted(filtered, key=lambda x: x[0])
        mz_array = np.array([mz for mz, _ in filtered_sorted])
        intensity_array = np.array([i for _, i in filtered_sorted])

        # NOTE(review): rt comes from the "Average Rt(min)" column, i.e. it is
        # in minutes -- confirm this is the unit expected for "retention_time".
        spec = Spectrum(
            mz=mz_array,
            intensities=intensity_array,
            metadata={
                "precursor_mz": precursor_mz,
                "feature_id": feature_id,
                "retention_time": rt,
                # This half of the notebook processes the negative-mode table
                # and writes *_NegMode.* files, so the spectra must be tagged
                # as negative (the value was previously "positive").
                "ionmode": "negative",
                "compound_name": f"Feature_{feature_id}",
            },
        )

        # Append only after every record was built successfully. Appending
        # earlier would leave a row in the JSON/CSV lists but not in the MSP
        # list whenever Spectrum construction fails.
        extracted_ms2.append(json_entry)
        csv_rows.append(csv_entry)
        matchms_spectra.append(spec)

    except Exception as e:
        # Best-effort parsing: report the bad row and carry on with the rest.
        print(f"⚠️ Skipped row {idx} due to error: {e}")

In [None]:
from pathlib import Path

# === Step 3a: Save JSON ===
# The (mz, intensity) tuples in "MS2_peaks" are written as two-element arrays.
with open("Extracted_MS2_Spectra_NegMode.json", "w") as f:
    json.dump(extracted_ms2, f, indent=2)

# === Step 3b: Save CSV ===
csv_df = pd.DataFrame(csv_rows)
csv_df.to_csv("Extracted_MS2_Spectra_NegMode.csv", index=False)

# === Step 3c: Save MSP ===
# The JSON and CSV above are overwritten on every run, but matchms'
# save_as_msp appends to an existing file by default (NOTE(review): confirm
# for the installed matchms version). Remove any MSP left over from a previous
# run so that re-running this cell does not duplicate every spectrum.
msp_file = "Extracted_MS2_Spectra_NegMode.msp"
Path(msp_file).unlink(missing_ok=True)
save_as_msp(matchms_spectra, msp_file)

print(f"✅ Saved: {len(extracted_ms2)} spectra to:")
print("  - JSON: Extracted_MS2_Spectra_NegMode.json")
print("  - CSV : Extracted_MS2_Spectra_NegMode.csv")
print("  - MSP : Extracted_MS2_Spectra_NegMode.msp")
