The goal of this notebook is to create the final mgf file that will be used for training Spec2Vec for the negative ionization mode.

In [2]:
from matchms.importing import load_from_mgf

In [3]:
# Read the merged and cleaned library and materialise it as a list so the
# cells below can measure and iterate over it more than once.
merged_library_path = '/home/ioannis/thesis_data/merged_and_cleaned_libraries_1.mgf'
initial_mgf = [*load_from_mgf(merged_library_path)]

In [4]:
# Report how many spectra were loaded from the library.
n_loaded_spectra = len(initial_mgf)
print(n_loaded_spectra)

1017531


In [7]:
from collections import Counter
# Tally the ionization mode recorded on each spectrum.
ionmode = list(map(lambda spectrum: spectrum.get('ionmode'), initial_mgf))
ionmode_counts = Counter(ionmode)
print(ionmode_counts)

Counter({'positive': 702579, 'negative': 314952})


In [15]:
def check_completeness(spectra, metadata):
    """
    Print how many spectra carry a usable value for each metadata field.

    A value counts as present when ``spectrum.get(field)`` is not None and its
    string form is not blank after stripping whitespace.

    Parameters
    ----------
    spectra : sequence of objects exposing ``.get(key)``
        Spectra whose metadata is inspected.
    metadata : iterable of str
        Metadata field names to report on, in the order they are printed.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    n_spectra = len(spectra)

    def _is_filled(value):
        # None and whitespace-only values both count as "missing".
        return value is not None and str(value).strip() != ""

    # One Counter increment per (spectrum, field) pair that holds a value.
    filled = Counter(
        field
        for spec in spectra
        for field in metadata
        if _is_filled(spec.get(field))
    )

    print(f"Total spectra: {n_spectra}")
    for field in metadata:
        n_filled = filled[field]
        if n_spectra:
            pct = n_filled / n_spectra * 100
        else:
            # No spectra at all: report 0% instead of dividing by zero.
            pct = 0
        print(f"{field:16s}: {n_filled:6d} / {n_spectra}  ({pct:5.2f}%)")

In [16]:
# Metadata fields whose coverage is inspected before any filtering.
metadata = [
    'description',
    'formula',
    'inchi',
    'smiles',
    'adduct',
    'inchikey',
    'collision_energy',
]
check_completeness(spectra=initial_mgf, metadata=metadata)

Total spectra: 1017531
description     : 304872 / 1017531  (29.96%)
formula         : 1017531 / 1017531  (100.00%)
inchi           : 1017531 / 1017531  (100.00%)
smiles          : 1017531 / 1017531  (100.00%)
adduct          : 1017531 / 1017531  (100.00%)
inchikey        : 1017531 / 1017531  (100.00%)
collision_energy: 406846 / 1017531  (39.98%)


In [17]:
# Distribution of the recorded collision energies (None = field absent).
col_energy = list(map(lambda spectrum: spectrum.get('collision_energy'), initial_mgf))
col_energy_counts = Counter(col_energy)
print(col_energy_counts)

Counter({None: 610685, '60.0': 113315, '20.0': 104517, '30.0': 57233, '45.0': 32485, '15.0': 32450, '6.0': 8616, '35.0': 7999, '40.0': 7297, '10.0': 5725, '90.0': 4436, '75.0': 3575, '50.0': 3384, '80.0': 1978, '120.0': 1908, '70.0': 1709, '150.0': 1679, '25.0': 1556, '100.0': 1196, '110.0': 1085, '55.0': 1032, '33.0': 1029, '130.0': 996, '32.0': 987, '140.0': 962, '34.0': 837, '31.0': 836, '29.0': 803, '180.0': 718, '5.0': 677, '28.0': 636, '36.0': 531, '27.0': 449, '21.0': 409, '23.0': 352, '26.0': 338, '65.0': 313, '43.0': 301, '85.0': 207, '41.0': 196, '105.0': 186, '37.0': 185, '95.0': 165, '24.0': 130, '19.0': 125, '115.0': 116, '22.0': 109, '44.0': 106, '125.0': 92, '42.0': 87, '18.0': 86, '38.0': 82, '17.0': 73, '16.0': 67, '39.0': 60, '0.0': 44, '46.0': 42, '160.0': 27, '53.0': 25, '170.0': 21, '13.0': 21, '11.0': 21, '165.0': 18, '14.0': 18, '135.0': 17, '145.0': 17, '155.0': 17, '175.0': 17, '61.0': 17, '47.0': 17, '185.0': 16, '48.0': 11, '12.0': 10, '49.0': 7, '52.0': 6, '

Split the spectra into two lists based on their ionization mode.

In [8]:
# Partition the library by ionization mode; a spectrum whose ionmode is
# neither "positive" nor "negative" ends up in neither list.
cleaned_pos = list(filter(lambda spectrum: spectrum.get("ionmode") == "positive", initial_mgf))
n_positive = len(cleaned_pos)
print('Positive: ', n_positive)

cleaned_neg = list(filter(lambda spectrum: spectrum.get("ionmode") == "negative", initial_mgf))
n_negative = len(cleaned_neg)
print('Negative: ', n_negative)

Positive:  702579
Negative:  314952


In [18]:
# Same coverage report, restricted to the negative-mode spectra.
check_completeness(spectra=cleaned_neg, metadata=metadata)

Total spectra: 314952
description     : 113564 / 314952  (36.06%)
formula         : 314952 / 314952  (100.00%)
inchi           : 314952 / 314952  (100.00%)
smiles          : 314952 / 314952  (100.00%)
adduct          : 314952 / 314952  (100.00%)
inchikey        : 314952 / 314952  (100.00%)
collision_energy: 138912 / 314952  (44.11%)


The completeness check above shows that the collision-energy metadata is sparse (present for only about 44% of the negative-mode spectra). Spec2Vec does not need that metadata, but it can be affected by duplicates, so the goal here is to filter out near-duplicate spectra. Near-duplicates share the same InChIKey, i.e. they represent the same molecule. Nonetheless, their fragmentation can differ because of a different adduct, a different collision energy or other technical reasons. Since the adduct metadata is 100% complete, the spectra are grouped by InChIKey and adduct and then compared within each group using the cosine similarity score. Redundant spectra with a similarity of at least 0.99 are collapsed by keeping only one spectrum (the one with the highest `quality_explained_intensity`). Outliers are kept, which preserves the variance while avoiding redundancy.

In [None]:
import networkx as nx
from collections import defaultdict
from matchms.similarity import CosineGreedy
from matchms import calculate_scores

def _has_value(value):
    """Return True when a metadata value is present and not blank."""
    return value is not None and str(value).strip() != ""


def _explained_intensity(spectrum):
    """Return 'quality_explained_intensity' as a float; 0.0 when missing or unparseable."""
    try:
        return float(spectrum.get("quality_explained_intensity", 0))
    except (TypeError, ValueError):
        # A present-but-unusable value (None, "", "n/a", ...) must not abort the
        # whole run; rank such a spectrum like one without the field.
        return 0.0


def _cluster_representatives(group_spectra, similarity_function, similarity_threshold):
    """Return one representative spectrum per similarity cluster of one group."""
    n_members = len(group_spectra)

    # All-vs-all comparison of the group against itself. Only entries with
    # i < j are read below, so let matchms exploit the symmetry instead of
    # computing every pair twice.
    scores = calculate_scores(
        group_spectra, group_spectra, similarity_function, is_symmetric=True
    )
    scores_array = scores.to_array(name="CosineGreedy_score")

    # Each spectrum is a node; an edge joins two spectra that reach the threshold.
    graph = nx.Graph()
    graph.add_nodes_from(range(n_members))
    graph.add_edges_from(
        (i, j)
        for i in range(n_members)
        for j in range(i + 1, n_members)
        if scores_array[i, j] >= similarity_threshold
    )

    representatives = []
    for cluster in nx.connected_components(graph):
        # sorted() makes ties explicit: max() returns the first maximal item,
        # so the earliest spectrum of the group wins when values are equal.
        best = max(
            sorted(cluster),
            key=lambda idx: _explained_intensity(group_spectra[idx]),
        )
        representatives.append(group_spectra[best])
    return representatives


def filter_redundant_graph(spectra, similarity_threshold=0.99):
    """
    Deduplicate spectra by first grouping on (InChIKey, adduct),
    then clustering within each group using graph-based redundancy filtering.

    Within a group, two spectra are linked when their CosineGreedy score is
    >= ``similarity_threshold``. Clusters are the connected components of that
    graph, so linkage is transitive: if A~B and B~C, then A, B and C form one
    cluster even when A and C themselves score below the threshold. From each
    cluster the spectrum with the highest ``quality_explained_intensity``
    metadata value is kept (missing or unparseable values count as 0; ties go
    to the spectrum that appears first in the input).

    Spectra without an InChIKey (None or blank) are never grouped with each
    other, because their molecular identity is unknown; they are always kept.

    Parameters
    ----------
    spectra : list of matchms.Spectrum
        Input spectra to filter.
    similarity_threshold : float, optional
        Cosine similarity cutoff for redundancy (default = 0.99).

    Returns
    -------
    filtered_spectra : list of matchms.Spectrum
        Deduplicated spectra, ordered by first appearance of their group.
        The original and filtered counts are also printed.
    """

    # group spectra by (inchikey, adduct)
    grouped = defaultdict(list)
    for s in spectra:
        inchikey = s.get("inchikey")
        if _has_value(inchikey):
            key = (inchikey, s.get("adduct"))
        else:
            # Unknown molecule: a fresh unique key yields a singleton group, so
            # the spectrum is passed through instead of being merged with
            # other unidentified spectra.
            key = object()
        grouped[key].append(s)

    filtered = []
    similarity_function = None  # created on first use; singleton groups never need it

    # cluster within each group one by one
    for group_spectra in grouped.values():
        if len(group_spectra) == 1:
            filtered.extend(group_spectra)
            continue

        if similarity_function is None:
            similarity_function = CosineGreedy()

        filtered.extend(
            _cluster_representatives(group_spectra, similarity_function, similarity_threshold)
        )

    print(f"Original spectra: {len(spectra)}")
    print(f"Filtered spectra: {len(filtered)}")
    return filtered


In [25]:
# Collapse near-duplicate negative-mode spectra at the 0.99 cosine cutoff.
filtered_cleaned_neg = filter_redundant_graph(
    spectra=cleaned_neg,
    similarity_threshold=0.99,
)

Original spectra: 314952
Filtered spectra: 193545


In [27]:
from matchms.exporting import save_as_mgf
import os

output_file = "/home/ioannis/thesis_data/s2v_filtered.mgf"

# Start from a clean file so the result holds exactly this run's spectra.
# NOTE(review): save_as_mgf opens the target in append mode in the matchms
# versions I know of - confirm for the installed one. If it does, re-running
# this cell without the cleanup would write every spectrum a second time and
# reintroduce the duplicates that were just filtered out.
if os.path.exists(output_file):
    os.remove(output_file)

save_as_mgf(filtered_cleaned_neg, output_file)

dict_keys(['spectra'])
