In [None]:
# @title Mount drive
# Mount Google Drive read-write (readonly=False) at /content/drive; the
# input/output folders and the msconvert path configured in the parameters
# cell are all located under this mount point.
from google.colab import drive
drive.mount('/content/drive', readonly=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# @title Install dependencies
!pip install pyteomics
!pip install pyopenms

Collecting pyopenms
  Using cached pyopenms-3.4.0-cp312-cp312-manylinux_2_34_x86_64.whl.metadata (2.0 kB)
Downloading pyopenms-3.4.0-cp312-cp312-manylinux_2_34_x86_64.whl (59.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.8/59.8 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyopenms
Successfully installed pyopenms-3.4.0


In [None]:
# @title Load packages
from pyteomics import mzxml, mzml
import pyopenms as oms
import os
import sys
import subprocess
import shlex
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from lxml import etree

In [None]:
# @title Input parameters
# @markdown Input folder with data
input_folder = '/content/drive/MyDrive/odense_runs/20251110_Odense_ZooMS' # @param {type: "string"}
# @markdown Output folder (the result sub-folders are created inside it)
output_folder = '/content/drive/MyDrive/odense_runs' # @param {type: "string"}
# @markdown Recompute the stored peak counts from the actual data points
# @markdown in the mzML or mzXML files before converting
fix_peakscount = True # @param {type: "boolean"}
# @markdown Input and output formats: mzML/mzXML
input_format = 'mzXML' # @param ['mzML', 'mzXML']
convert_to = 'mzML' # @param ['none', 'mzML', 'mzXML']
# @markdown <br>
# @markdown Whether to include or remove experiment metadata
# @markdown (instrument, software processing, ...).<br>
# @markdown Some software fails to recognise some of the terms here,
# @markdown particularly those associated with newer instruments.
remove_exp = 'without experiment' # @param ['with experiment', 'without experiment', 'both']
# @markdown <br>
# @markdown Path to the local msconvert folder on the drive (include the trailing '/')
msconvert_path = '/content/drive/MyDrive/software/pwiz/' # @param {type: 'string'}


In [None]:
# @title Load functions
def find_spectra(root):
    """Return the per-spectrum elements of a parsed mzML/mzXML tree.

    Elements are matched by local name, so the XML namespace is ignored, and
    are returned in document order.

    Args:
        root: Root element of the parsed document; must provide ``xpath()``.

    Returns:
        list: All ``spectrum`` elements if the document contains any,
        otherwise all ``scan`` elements. Empty when neither is present.
    """
    # mzML keeps one <spectrum> per spectrum but also nests <scan> elements
    # inside each of them (under <scanList>). Matching both names at once
    # would therefore count every mzML spectrum more than once, so <scan> is
    # only used as a fallback for documents without <spectrum> (i.e. mzXML).
    spectra = root.xpath(".//*[local-name() = 'spectrum']")
    if spectra:
        return spectra
    return root.xpath(".//*[local-name() = 'scan']")


def get_peakscounts(file_input, ms_format='mzXML'):
    """Return the number of m/z data points of every spectrum in a file.

    Args:
        file_input: Path of the MS file to read.
        ms_format: ``'mzXML'`` or ``'mzML'``; selects the pyteomics reader.

    Returns:
        list[int]: ``len(spectrum['m/z array'])`` for each spectrum, in the
        order the reader yields them.

    Raises:
        SystemExit: If ``ms_format`` is not one of the supported formats.
    """
    if ms_format == 'mzXML':
        open_reader = mzxml.read
    elif ms_format == 'mzML':
        open_reader = mzml.read
    else:
        sys.exit(f'Format {ms_format} not supported')
    # The reader keeps the input file open; the context manager closes it
    # as soon as the counts are collected instead of leaking the handle.
    with open_reader(file_input) as ms_data:
        return [len(spectrum['m/z array']) for spectrum in ms_data]

def correct_peakscount(file_input, output_path, ms_format='mzXML'):
    """Rewrite the per-spectrum peak-count attribute to match the real data.

    The true counts come from ``get_peakscounts``; the XML is then parsed
    with lxml, the count attribute of every spectrum element is set to the
    matching value, and the tree is written to ``output_path``.

    Nothing is written when the file contains no spectra or when the number
    of spectrum elements differs from the number of counts; a warning is
    printed instead.

    NOTE(review): rewritten attribute values may differ in length from the
    originals, so any byte-offset index stored inside the file may no longer
    be accurate afterwards -- confirm that downstream readers tolerate this.

    Args:
        file_input: Path of the MS file to correct.
        output_path: Path the corrected copy is written to.
        ms_format: ``'mzXML'`` (attribute ``peaksCount``) or ``'mzML'``
            (attribute ``defaultArrayLength``).

    Returns:
        None

    Raises:
        SystemExit: If ``ms_format`` is not one of the supported formats.
    """
    if ms_format == 'mzXML':
        peak_count_field = 'peaksCount'
    elif ms_format == 'mzML':
        peak_count_field = 'defaultArrayLength'
    else:
        sys.exit(f'Format {ms_format} not supported')

    peak_counts = get_peakscounts(file_input, ms_format)

    # huge_tree lifts lxml's default size limits; blank text is kept so the
    # rewritten file keeps the original layout.
    parser = etree.XMLParser(remove_blank_text=False, huge_tree=True)
    tree = etree.parse(file_input, parser)

    spectra = find_spectra(tree.getroot())
    n_spectra = len(spectra)
    print(f"\tFound {n_spectra} spectrum/scan elements in '{file_input}'")

    if n_spectra == 0:
        print(
            f"WARNING. No spectra found in {file_input}. "
            "Exiting without changes to this file.")
        return
    if len(peak_counts) != n_spectra:
        print(
            f"WARNING. counts length does not match number of spectra in\n{file_input}.\n"
            f"  counts: {len(peak_counts)}, spectra: {n_spectra}. "
            "Exiting without changes to this file."
        )
        return

    changed = 0
    for elem, count in zip(spectra, peak_counts):
        new_value = str(int(count))
        # elem.get() returns None when the attribute is missing, which also
        # differs from new_value and therefore triggers the update.
        if elem.get(peak_count_field) != new_value:
            elem.set(peak_count_field, new_value)
            changed += 1
    print(f"\tUpdated {peak_count_field} for {changed} spectra (out of {n_spectra} matched).")
    print(f"\tSaving modified {ms_format} to '{output_path}'")
    tree.write(output_path, encoding='utf-8', xml_declaration=True, pretty_print=False)



def run_msconvert(input_folder, output_folder, input_format, output_format):
    """Convert every ``input_format`` file of a folder with msconvert.

    The paths of the matching files are written to ``file_list.txt`` inside
    ``output_folder`` and handed to the ``msconvert`` executable found in the
    notebook-level ``msconvert_path`` folder.

    Args:
        input_folder: Folder scanned (non-recursively) for input files.
        output_folder: Folder receiving the file list and converted files.
        input_format: File extension (without dot) selecting the inputs.
        output_format: Passed to msconvert as ``--<output_format>``.

    Returns:
        None. A warning is printed when there is nothing to convert or when
        msconvert exits with a non-zero status.
    """
    input_files = [
        os.path.join(input_folder, name)
        for name in os.listdir(input_folder)
        if name.endswith(f'.{input_format}')]
    if not input_files:
        print(f'WARNING. No .{input_format} files found in {input_folder}. '
              'Skipping msconvert.')
        return

    tmp_filelist = os.path.join(output_folder, 'file_list.txt')
    with open(tmp_filelist, 'w') as outf:
        outf.writelines(path + '\n' for path in input_files)

    # os.path.join works with or without a trailing separator on
    # msconvert_path, unlike plain string concatenation.
    msconvert_exe = os.path.join(msconvert_path, 'msconvert')

    # Make sure the binary is executable before trying to run it.
    subprocess.run(['chmod', 'u+x', msconvert_exe])

    # Pass the arguments as a list (no shell-style splitting) so that paths
    # containing spaces reach msconvert intact.
    result = subprocess.run([
        msconvert_exe, f'--{output_format}', '-z',
        '-f', tmp_filelist,
        '-o', output_folder,
    ])
    if result.returncode != 0:
        print(f'WARNING. msconvert exited with status {result.returncode}.')


class FormatConsumer:
    """Thin wrapper around another consumer that drops chromatograms.

    Experimental settings, the expected size and every spectrum are handed
    on unchanged to the wrapped object; chromatograms are silently ignored.
    """

    def __init__(self, writer):
        # Object that receives everything this wrapper forwards.
        self._target = writer

    def consumeSpectrum(self, s):
        """Hand spectrum ``s`` to the wrapped consumer."""
        self._target.consumeSpectrum(s)

    def consumeChromatogram(self, c):
        """Discard chromatogram ``c``; it is never forwarded."""

    def setExpectedSize(self, a, b):
        """Forward the expected sizes ``a`` and ``b`` unchanged."""
        self._target.setExpectedSize(a, b)

    def setExperimentalSettings(self, s):
        """Forward the experimental settings ``s`` unchanged."""
        self._target.setExperimentalSettings(s)


def run_openms(input_file, output_file, input_format, remove_exp):
    """Stream one MS file through pyopenms into a compressed output file.

    Args:
        input_file: Path of the file to read.
        output_file: Path handed to ``PlainMSDataWritingConsumer``.
        input_format: ``'mzML'`` or ``'mzXML'``; selects the pyopenms loader.
        remove_exp: When truthy the loader feeds the writer directly;
            otherwise the data goes through ``FormatConsumer`` first.

    Returns:
        None

    Raises:
        SystemExit: If ``input_format`` is not one of the supported formats.
    """
    # Validate before the writer is created, so an unsupported format neither
    # leaves `loader` unbound nor touches the output path.
    if input_format not in ('mzML', 'mzXML'):
        sys.exit(f'Format {input_format} not supported')

    print(f'Transforming {os.path.split(input_file)[1]} into '
            f'{os.path.split(output_file)[1]}')

    # NOTE(review): the same writer class is used whatever extension
    # output_file has; presumably it always emits mzML -- confirm.
    ms_writer = oms.PlainMSDataWritingConsumer(output_file)
    writer_opts = ms_writer.getOptions()
    writer_opts.setCompression(True)
    ms_writer.setOptions(writer_opts)

    if input_format == 'mzML':
        loader = oms.MzMLFile()
    else:
        loader = oms.MzXMLFile()

    if remove_exp:
        loader.transform(input_file, ms_writer)
    else:
        loader.transform(input_file, FormatConsumer(ms_writer))

    # Drop the last reference to the writer right away; presumably this is
    # what finalises the output file -- TODO confirm.
    del ms_writer




In [None]:
# @title Run program

# Normalise first so that a trailing '/' on input_folder does not produce an
# empty basename (which would make every output folder name collide).
_, basename = os.path.split(os.path.normpath(input_folder))

if fix_peakscount:
    print('Fixing peaks counts in MS files')
    outfolder_fixcount = os.path.join(output_folder, basename + 'fixedcounts')
    # exist_ok lets the cell be re-run without failing on the existing folder.
    os.makedirs(outfolder_fixcount, exist_ok=True)
    infile_list = []
    outfile_list = []
    for f in os.listdir(input_folder):
        # Only touch files of the selected input format.
        if not f.endswith(f'.{input_format}'):
            continue
        stem, ext = os.path.splitext(f)
        infile_list.append(os.path.join(input_folder, f))
        outfile_list.append(
            os.path.join(outfolder_fixcount, stem + '_corrected' + ext))
    with ThreadPoolExecutor(max_workers=4) as executor:
        # input_format is passed explicitly; otherwise correct_peakscount
        # falls back to its 'mzXML' default even for mzML input.
        # list() drains the lazy result iterator so that an exception raised
        # in a worker is re-raised here instead of being silently dropped.
        list(executor.map(
            correct_peakscount,
            infile_list,
            outfile_list,
            [input_format] * len(infile_list)))

if convert_to != 'none':
    print(f'Converting from {input_format} to {convert_to}')
    # Convert the corrected copies when they were produced above.
    input_convert = outfolder_fixcount if fix_peakscount else input_folder
    if remove_exp in ['both', 'with experiment']:
        print('Saving converted files with experimental metadata')
        msconvert_folder = os.path.join(output_folder, basename + '_msconvert')
        os.makedirs(msconvert_folder, exist_ok=True)
        run_msconvert(input_convert, msconvert_folder, input_format, convert_to)
    if remove_exp in ['both', 'without experiment']:
        print('Saving converted files without experimental metadata')
        oms_folder = os.path.join(output_folder, basename + '_noexpmetadata')
        os.makedirs(oms_folder, exist_ok=True)
        print(f'Saving files to {oms_folder}')
        infile_list = []
        outfile_list = []
        for f in os.listdir(input_convert):
            if not f.endswith(f'.{input_format}'):
                continue
            stem, ext = os.path.splitext(f)
            infile_list.append(os.path.join(input_convert, f))
            # Files written by this branch always carry the 'noexp' suffix.
            outfile_list.append(
                os.path.join(oms_folder, stem + 'noexp.' + convert_to))

        # NOTE(review): run_openms uses the same writer class whatever
        # convert_to is, so only the file extension follows convert_to here;
        # confirm the written content is what is wanted for 'mzXML'.
        for i, o in zip(infile_list, outfile_list):
            run_openms(i, o, input_format, remove_exp=True)



Fixing peaks counts in MS files
	Found 288 spectrum/scan elements in '/content/drive/MyDrive/odense_runs/20251110_Odense_ZooMS/20251111-1133_TTF_013337_ONJ_TR_MC_2025_11_11_2101004291.mzXML'
	Updated peaksCount for 288 spectra (out of 288 matched).
	Saving modified mzXML to '/content/drive/MyDrive/odense_runs/20251110_Odense_ZooMSfixedcounts/20251111-1133_TTF_013337_ONJ_TR_MC_2025_11_11_2101004291_corrected.mzXML'
	Found 384 spectrum/scan elements in '/content/drive/MyDrive/odense_runs/20251110_Odense_ZooMS/20251111-1133_TTF_013338_ONJ_TR_MC_2025_11_11_1005528.mzXML'
	Updated peaksCount for 384 spectra (out of 384 matched).
	Saving modified mzXML to '/content/drive/MyDrive/odense_runs/20251110_Odense_ZooMSfixedcounts/20251111-1133_TTF_013338_ONJ_TR_MC_2025_11_11_1005528_corrected.mzXML'
	Found 384 spectrum/scan elements in '/content/drive/MyDrive/odense_runs/20251110_Odense_ZooMS/20251111-1133_TTF_013340_ONJ_TR_MC_2025_11_11_1004289.mzXML'
	Updated peaksCount for 384 spectra (out of 38