<a href="https://colab.research.google.com/github/ismaRP/maldiZooMSconverter/blob/main/convert_zooMSdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Mount drive
# Colab-only helper: exposes the user's Google Drive inside the runtime.
from google.colab import drive
# Mount at /content/drive with readonly=False; the default output_folder of
# this notebook lives under this mount point and has to be writable.
drive.mount('/content/drive', readonly=False)

Mounted at /content/drive


In [None]:
# @title Install dependencies
!pip install pyteomics
!pip install pyopenms

Collecting pyteomics
  Downloading pyteomics-4.7.5-py3-none-any.whl.metadata (6.5 kB)
Downloading pyteomics-4.7.5-py3-none-any.whl (238 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/239.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m235.5/239.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.0/239.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyteomics
Successfully installed pyteomics-4.7.5
Collecting pyopenms
  Downloading pyopenms-3.4.0-cp312-cp312-manylinux_2_34_x86_64.whl.metadata (2.0 kB)
Downloading pyopenms-3.4.0-cp312-cp312-manylinux_2_34_x86_64.whl (59.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.8/59.8 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyopenms
Successfully installed pyopenms-3.4.0


In [None]:
# @title Load packages
from pyteomics import mzxml, mzml
import pyopenms as oms
import os
import sys
import subprocess
import shlex
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from lxml import etree

In [None]:
# @title Input parameters
# @markdown Input folder with data
input_folder = '/content/drive/MyDrive/odense_runs/20251110_Odense_ZooMS' # @param {type: "string"}
output_folder = '/content/drive/MyDrive/odense_runs_test/' # @param {type: "string"}
# @markdown Fix data points or peak counts in mzML or mzXML
fix_peakscount = True # @param {type: "boolean"}
# @markdown Input and output formats: mzML/mzXML
input_format = 'mzXML' # @param ['mzML', 'mzXML']
convert_to = 'mzML' # @param ['none', 'mzML', 'mzXML']
# @markdown Indices of spectra to extract.
# @markdown List of intervals of the form [a,b] or a[-][b].<br>
# @markdown E.g. '[0,2] 5-7' is the set '0 1 2 5 6 7' and
# @markdown '0-' means from 0 to the largest index in the file.<br>
# @markdown By default all spectra are extracted ('0-').
extract_index = '0-' # @param {type: "string"}
# NOTE(review): extract_index is not referenced by any code visible in this
# notebook - confirm whether it is still meant to be used.
# @markdown <br>
# @markdown Whether to include or remove experiment metadata:
# @markdown instrument, software processing,...<br>
# @markdown Some software fails to recognise some of the terms here,
# @markdown particularly those associated with newer instruments.
metadata_options = 'both' # @param ['with experiment', 'without experiment', 'both']
# @markdown <br>
# @markdown Path to the local msconvert folder on the drive, ending with '/'.
# @markdown Leave empty to convert with pyOpenMS instead; msconvert is only
# @markdown used when the metadata option is 'with experiment'.
msconvert_path = '' # @param {type: 'string'}

In [None]:
# @title Load functions
def find_spectra(root):
    """Collect every ``scan`` or ``spectrum`` element below ``root``.

    Elements are matched on their local name only, so any XML namespace
    prefix is ignored.

    Parameters
    ----------
    root : object exposing ``xpath(expr)`` (an lxml element in this notebook)

    Returns
    -------
    Whatever ``root.xpath`` returns for the query; with lxml this is a list
    of elements in document order.
    """
    wanted_names = ('scan', 'spectrum')
    predicate = ' or '.join(
        f"local-name() = '{name}'" for name in wanted_names)
    return root.xpath(f".//*[{predicate}]")


def get_peakscounts(file_input, ms_format='mzXML'):
    """Return the number of m/z values actually stored in each spectrum.

    Parameters
    ----------
    file_input : path of the MS file to read
    ms_format : 'mzXML' (default) or 'mzML'; selects the pyteomics reader

    Returns
    -------
    list of int, one entry per spectrum in the order the reader yields them;
    each entry is ``len(spectrum['m/z array'])``.

    Raises
    ------
    SystemExit if ``ms_format`` is neither 'mzXML' nor 'mzML'.
    """
    if ms_format == 'mzXML':
        open_reader = mzxml.read
    elif ms_format == 'mzML':
        open_reader = mzml.read
    else:
        sys.exit(f'Format {ms_format} not supported')

    # The reader is used as a context manager so the underlying file is
    # closed as soon as the counts are collected; this function is called
    # once per input file and would otherwise keep every file open.
    with open_reader(file_input) as ms_data:
        return [len(s['m/z array']) for s in ms_data]

def correct_peakscount(file_input, output_path, ms_format='mzXML'):
    """Rewrite the per-spectrum peak-count attribute so it matches the data.

    The number of m/z values of every spectrum is obtained with
    ``get_peakscounts``; the XML is then parsed with lxml and the count
    attribute of each scan/spectrum element is overwritten where it differs.
    The corrected document is written to ``output_path``; ``file_input`` is
    never modified.

    Parameters
    ----------
    file_input : path of the MS file to fix
    output_path : path the corrected copy is written to
    ms_format : 'mzXML' (attribute ``peaksCount``, default) or
        'mzML' (attribute ``defaultArrayLength``)

    Returns
    -------
    None. Nothing is written when no spectra are found or when the number of
    counts and the number of XML elements disagree.

    Raises
    ------
    SystemExit if ``ms_format`` is neither 'mzXML' nor 'mzML'.
    """
    if ms_format == 'mzXML':
        peak_count_field = 'peaksCount'
    elif ms_format == 'mzML':
        peak_count_field = 'defaultArrayLength'
    else:
        sys.exit(f'Format {ms_format} not supported')

    peak_counts = get_peakscounts(file_input, ms_format)

    # huge_tree lifts lxml's safety limits on very large documents;
    # remove_blank_text=False keeps the original whitespace untouched.
    parser = etree.XMLParser(remove_blank_text=False, huge_tree=True)
    tree = etree.parse(file_input, parser)
    spectra = find_spectra(tree.getroot())
    n_spectra = len(spectra)
    print(f"\tFound {n_spectra} spectrum/scan elements in '{file_input}'")

    if n_spectra == 0:
        print(
            f"WARNING. No spectra found in {file_input}. "
            "Exiting without changes to this file.")
        return
    if len(peak_counts) != n_spectra:
        # Counts and elements are paired by position, so a length mismatch
        # means the pairing cannot be trusted.
        print(
            f"WARNING. counts length does not match number of spectra in\n"
            f"{file_input}.\n"
            f"  counts: {len(peak_counts)}, spectra: {n_spectra}. "
            "Exiting without changes to this file."
        )
        return

    changed = 0
    for elem, count in zip(spectra, peak_counts):
        new_val = str(int(count))
        # Only touch the attribute when it is missing or wrong.
        if elem.get(peak_count_field) != new_val:
            elem.set(peak_count_field, new_val)
            changed += 1

    # NOTE(review): only attributes are edited; if the file carries a
    # byte-offset index it is written back unchanged - confirm downstream
    # readers tolerate that.
    print(
        f"\tUpdated {peak_count_field} for {changed} spectra "
        f"(out of {n_spectra} matched).")
    print(f"\tSaving modified {ms_format} to '{output_path}'\n")
    tree.write(
        output_path, encoding='utf-8', xml_declaration=True,
        pretty_print=False)



def run_msconvert(input_folder, output_folder, input_format, output_format):
    """Convert every matching file of a folder with an external msconvert.

    A text file ``file_list.txt`` listing the inputs (one path per line) is
    written into ``output_folder`` and handed to msconvert via ``-f``.
    The executable is looked up in the notebook-level ``msconvert_path``.

    Parameters
    ----------
    input_folder : folder scanned for files ending in ``.<input_format>``
    output_folder : folder passed to msconvert as ``-o``; also receives the
        file list
    input_format : file extension (without dot) of the inputs
    output_format : used verbatim as the msconvert flag ``--<output_format>``

    Returns
    -------
    None. A warning is printed if msconvert exits with a non-zero status.
    """
    tmp_filelist = os.path.join(output_folder, 'file_list.txt')
    suffix = f'.{input_format}'

    input_files = [
        os.path.join(input_folder, f) + '\n'
        for f in os.listdir(input_folder)
        if f.endswith(suffix)]
    with open(tmp_filelist, 'w') as outf:
        outf.writelines(input_files)

    # os.path.join works with or without a trailing slash on msconvert_path.
    msconvert_exe = os.path.join(msconvert_path, 'msconvert')

    # Make sure the binary is executable before calling it.
    subprocess.run(['chmod', 'u+x', msconvert_exe])

    # The argument vector is built as a list (no shell-style splitting) so
    # that folders containing spaces are passed through intact.
    msconvert_cmd = [
        msconvert_exe, f'--{output_format}', '-z',
        '-f', tmp_filelist, '-o', output_folder,
    ]
    result = subprocess.run(msconvert_cmd)
    if result.returncode != 0:
        print(f'WARNING. msconvert exited with status {result.returncode}')


class FormatConsumer:
    """Fan-out consumer that forwards spectra to up to two writers.

    ``ms_writer`` receives experimental settings, expected sizes and spectra.
    ``ms_writer_noexp`` receives expected sizes and spectra only, never the
    experimental settings. Either writer may be ``None``, in which case it
    is skipped. Chromatograms are dropped.
    """

    def __init__(self, ms_writer=None, ms_writer_noexp=None):
        self._ms_writer = ms_writer
        self._ms_writer_noexp = ms_writer_noexp

    def _active_writers(self):
        # Writers that were actually supplied, with-experiment one first.
        candidates = (self._ms_writer, self._ms_writer_noexp)
        return [writer for writer in candidates if writer is not None]

    def setExperimentalSettings(self, e):
        """Forward the settings to the with-experiment writer only."""
        if self._ms_writer is None:
            return
        self._ms_writer.setExperimentalSettings(e)

    def setExpectedSize(self, a, b):
        """Forward the expected sizes to every supplied writer."""
        for writer in self._active_writers():
            writer.setExpectedSize(a, b)

    def consumeChromatogram(self, c):
        """Ignore chromatograms; nothing is forwarded."""
        return None

    def consumeSpectrum(self, s):
        """Forward one spectrum to every supplied writer."""
        for writer in self._active_writers():
            writer.consumeSpectrum(s)


def run_openms(input_file, outf_withexp, outf_withoutexp, input_format):
    """Convert one MS file with pyOpenMS, writing up to two outputs in one pass.

    The input is streamed through a ``FormatConsumer`` that feeds one writer
    receiving the experimental settings and/or one writer that does not.

    Parameters
    ----------
    input_file : path of the file to read
    outf_withexp : output path for the copy with experimental metadata,
        or None to skip it
    outf_withoutexp : output path for the copy without experimental
        metadata, or None to skip it
    input_format : 'mzML' or 'mzXML'; selects the pyOpenMS loader

    Raises
    ------
    SystemExit if both output paths are None or ``input_format`` is not
    supported.
    """
    if outf_withexp is None and outf_withoutexp is None:
        # This shouldn't happen, as the user is forced to pick one or both.
        sys.exit(
            'Please indicate whether saving file with or without '
            'experimental metadata or both'
        )

    # Resolve the loader before any writer is created so that an unsupported
    # format does not leave empty output files behind.
    if input_format == 'mzML':
        loader = oms.MzMLFile()
    elif input_format == 'mzXML':
        loader = oms.MzXMLFile()
    else:
        sys.exit(f'Format {input_format} not supported')

    def _open_writer(out_path):
        # Build a compressing writer for out_path, or None when not requested.
        # Each writer configures its OWN options object; previously the
        # no-experiment writer reused the other writer's options, which
        # failed when only the no-experiment output was requested.
        if out_path is None:
            return None
        # NOTE(review): PlainMSDataWritingConsumer appears to emit mzML
        # regardless of the file extension - confirm before relying on
        # convert_to='mzXML' through this code path.
        writer = oms.PlainMSDataWritingConsumer(out_path)
        opts = writer.getOptions()
        opts.setCompression(True)
        writer.setOptions(opts)
        print(
            f'Transforming {os.path.split(input_file)[1]} into '
            f'{os.path.split(out_path)[1]}')
        return writer

    ms_writer = _open_writer(outf_withexp)
    ms_writer_noexp = _open_writer(outf_withoutexp)

    formatter = FormatConsumer(ms_writer, ms_writer_noexp)
    loader.transform(input_file, formatter)
    # Drop every reference to the writers (the consumer holds them too).
    del formatter, ms_writer, ms_writer_noexp




In [None]:
# @title Run program

def _run_in_threads(func, jobs, max_workers=4):
    """Run ``func(*job)`` for every job in a thread pool and report failures.

    ``jobs`` is a list of argument tuples whose first item is the input file.
    Every future is awaited explicitly: results of ``executor.map`` that are
    never consumed would silently hide any exception raised in a worker.
    SystemExit is caught as well because the worker functions use
    ``sys.exit`` to signal unsupported formats.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        submitted = [(job[0], executor.submit(func, *job)) for job in jobs]
        for source, future in submitted:
            try:
                future.result()
            except (Exception, SystemExit) as err:
                print(f'ERROR while processing {source}: {err}')


# normpath drops a trailing slash so the basename is never empty.
basename = os.path.basename(os.path.normpath(input_folder))
input_suffix = f'.{input_format}'

if fix_peakscount:
    print('Fixing peaks counts in MS files')
    outfolder_fixcount = os.path.join(output_folder, basename + '_fixedcounts')
    os.makedirs(outfolder_fixcount, exist_ok=True)
    fix_jobs = []
    for f in os.listdir(input_folder):
        # Only files of the selected input format can be parsed.
        if not f.endswith(input_suffix):
            continue
        stem, ext = os.path.splitext(f)
        fix_jobs.append((
            os.path.join(input_folder, f),
            os.path.join(outfolder_fixcount, stem + '_corrected' + ext),
            # Pass the format explicitly; the function defaults to mzXML.
            input_format,
        ))
    _run_in_threads(correct_peakscount, fix_jobs)

if convert_to != 'none':
    print(f'Converting from {input_format} to {convert_to}')
    # Convert the corrected copies when they were produced above.
    input_convert = outfolder_fixcount if fix_peakscount else input_folder

    if metadata_options not in ('with experiment', 'without experiment', 'both'):
        raise ValueError(f'Unknown metadata option: {metadata_options}')
    with_experiment = metadata_options in ('with experiment', 'both')
    without_experiment = metadata_options in ('without experiment', 'both')

    converted_folder = os.path.join(output_folder, basename + '_' + convert_to)
    os.makedirs(converted_folder, exist_ok=True)

    if with_experiment and not without_experiment and msconvert_path != '':
        # msconvert is only used when a path was given and only the
        # with-experiment output is requested.
        print('Saving converted files with experimental metadata using msconvert')
        run_msconvert(input_convert, converted_folder, input_format, convert_to)
    else:
        convert_jobs = []
        for f in os.listdir(input_convert):
            if not f.endswith(input_suffix):
                continue
            stem = os.path.splitext(f)[0]
            outf_withexp = None
            outf_withoutexp = None
            if with_experiment:
                outf_withexp = os.path.join(
                    converted_folder, stem + '.' + convert_to)
            if without_experiment:
                outf_withoutexp = os.path.join(
                    converted_folder, stem + '_noexp.' + convert_to)
            convert_jobs.append((
                os.path.join(input_convert, f),
                outf_withexp, outf_withoutexp, input_format,
            ))
        _run_in_threads(run_openms, convert_jobs)



Fixing peaks counts in MS files
	Found 288 spectrum/scan elements in '/content/drive/MyDrive/odense_runs/20251110_Odense_ZooMS/20251111-1133_TTF_013337_ONJ_TR_MC_2025_11_11_2101004291.mzXML'
	Updated peaksCount for 288 spectra (out of 288 matched).
	Saving modified mzXML to '/content/drive/MyDrive/odense_runs_test/20251110_Odense_ZooMS_fixedcounts/20251111-1133_TTF_013337_ONJ_TR_MC_2025_11_11_2101004291_corrected.mzXML'

	Found 384 spectrum/scan elements in '/content/drive/MyDrive/odense_runs/20251110_Odense_ZooMS/20251111-1133_TTF_013338_ONJ_TR_MC_2025_11_11_1005528.mzXML'
	Updated peaksCount for 384 spectra (out of 384 matched).
	Saving modified mzXML to '/content/drive/MyDrive/odense_runs_test/20251110_Odense_ZooMS_fixedcounts/20251111-1133_TTF_013338_ONJ_TR_MC_2025_11_11_1005528_corrected.mzXML'

	Found 384 spectrum/scan elements in '/content/drive/MyDrive/odense_runs/20251110_Odense_ZooMS/20251111-1133_TTF_013340_ONJ_TR_MC_2025_11_11_1004289.mzXML'
	Updated peaksCount for 384 spec