The goal of this notebook is to also test the ML model on spectra acquired in negative ionization mode.

In [1]:
from matchms.importing import load_from_mgf

# Merged and cleaned spectral library (MGF) used as input for this notebook.
path_data = "/home/ioannis/thesis_data/MSn_filtering_data/merged_and_cleaned_libraries_1.mgf"

# Materialise everything the loader yields into a list, so the spectra can be
# counted here and filtered repeatedly in the cells below.
spectrums = [spectrum for spectrum in load_from_mgf(path_data)]
print(len(spectrums))

1017531


In [18]:
from collections import Counter

# Tally the "data_collector" metadata field across all loaded spectra.
# NOTE(review): a None entry presumably means the field is absent — confirm.
data_collector = list(map(lambda record: record.get("data_collector"), spectrums))
print(Counter(data_collector))

Counter({None: 712659, 'Corinna Brungs': 304872})


In [19]:
# Keep every spectrum whose data collector is not "Corinna Brungs"
# (this includes spectra where the field is None).
gnps = list(
    filter(
        lambda record: record.get("data_collector") != "Corinna Brungs",
        spectrums,
    )
)
print(len(gnps))

712659


In [20]:
def check_completeness(spectra, metadata):
    """Print how many spectra carry a non-blank value for each metadata field.

    A value counts as present when ``spectrum.get(field)`` is not ``None`` and
    its string form is non-empty after stripping whitespace.

    Parameters
    ----------
    spectra : sized collection of objects exposing ``.get(field)``
        Spectra to inspect; ``len()`` is taken to obtain the total.
    metadata : list of str
        Metadata field names to report on. It is iterated more than once,
        so it must be re-iterable (e.g. a list).

    Returns
    -------
    None
        The report is written to stdout: one header line with the total,
        then one line per field with count, total and percentage.
    """

    def _is_blank(value):
        # Missing (None) and whitespace-only values both count as "not filled in".
        return value is None or str(value).strip() == ""

    n_spectra = len(spectra)

    # One Counter hit per (spectrum, field) pair that holds a usable value.
    filled = Counter(
        field
        for record in spectra
        for field in metadata
        if not _is_blank(record.get(field))
    )

    print(f"Total spectra: {n_spectra}")
    for field in metadata:
        n_filled = filled[field]
        if n_spectra:
            percent = n_filled / n_spectra * 100
        else:
            # Avoid dividing by zero when no spectra were passed in.
            percent = 0
        print(f"{field:16s}: {n_filled:6d} / {n_spectra}  ({percent:5.2f}%)")

In [21]:
# Metadata fields whose fill rate is reported for the `gnps` spectra.
metadata = [
    'description',
    'formula',
    'inchi',
    'smiles',
    'adduct',
    'inchikey',
    'collision_energy',
]
check_completeness(gnps, metadata)

Total spectra: 712659
description     :      0 / 712659  ( 0.00%)
formula         : 712659 / 712659  (100.00%)
inchi           : 712659 / 712659  (100.00%)
smiles          : 712659 / 712659  (100.00%)
adduct          : 712659 / 712659  (100.00%)
inchikey        : 712659 / 712659  (100.00%)
collision_energy: 101974 / 712659  (14.31%)


In [22]:
# Distribution of the 'ionmode' metadata field over the `gnps` spectra.
ionmode = list(map(lambda record: record.get('ionmode'), gnps))
print(Counter(ionmode))

Counter({'positive': 511271, 'negative': 201388})


In [23]:
# Restrict to the spectra whose ionmode is exactly 'negative'.
gnps_neg = list(filter(lambda record: record.get("ionmode") == 'negative', gnps))
print(len(gnps_neg))

201388


In [24]:
import random

# Fixed seed so that "Restart Kernel -> Run All" draws the same subset every
# time; without it the sampled spectra (and the file exported from them)
# differ on every run and the test set cannot be reproduced.
random_seed = 42
sample_size = 5000

# A dedicated Random instance keeps the draw independent of the global RNG
# state, so re-running just this cell also yields the same subset.
gnps_subset = random.Random(random_seed).sample(gnps_neg, sample_size)
print(len(gnps_subset))

5000


The step below is a quick deduplication: spectra are grouped by InChIKey and adduct, compared pairwise within each group, and spectra with a cosine similarity of at least 0.99 are clustered together. Only one representative is kept from each cluster.

In [35]:
import networkx as nx
from collections import defaultdict
from matchms.similarity import CosineGreedy
from matchms import calculate_scores

def filter_redundant_graph(spectra, similarity_threshold=0.99):
    """
    Remove near-duplicate spectra that share the same (InChIKey, adduct).

    Spectra are bucketed by their ``inchikey`` and ``adduct`` metadata. Inside
    every bucket with more than one member, all-vs-all CosineGreedy scores are
    computed and a graph is built whose edges link pairs scoring at or above
    the threshold. Each connected component of that graph is treated as one
    cluster of redundant spectra, so membership is transitive: two spectra may
    end up together via a chain of similar intermediates.

    Parameters
    ----------
    spectra : list of matchms.Spectrum
        Input spectra to filter.
    similarity_threshold : float, optional
        Minimum CosineGreedy score (inclusive) for two spectra to be linked
        as redundant (default = 0.99).

    Returns
    -------
    filtered_spectra : list of matchms.Spectrum
        One spectrum per cluster: the cluster member that appears first in
        its bucket. Buckets with a single spectrum pass through unchanged.
        The original and filtered counts are also printed to stdout.
    """

    # Bucket spectra by compound identity and adduct.
    buckets = defaultdict(list)
    for spectrum in spectra:
        bucket_key = (spectrum.get("inchikey"), spectrum.get("adduct"))
        buckets[bucket_key].append(spectrum)

    cosine = CosineGreedy()
    representatives = []

    # Handle each bucket independently.
    for members in buckets.values():
        n_members = len(members)

        # Nothing to compare against: keep the lone spectrum as-is.
        if n_members == 1:
            representatives.extend(members)
            continue

        # All-vs-all similarity matrix for this bucket.
        pairwise = calculate_scores(members, members, cosine).to_array(
            name="CosineGreedy_score"
        )

        # Nodes are positions within the bucket; an edge marks a redundant pair.
        # Only the upper triangle (a < b) is visited, so each pair is tested once.
        graph = nx.Graph()
        graph.add_nodes_from(range(n_members))
        graph.add_edges_from(
            (a, b)
            for a in range(n_members)
            for b in range(a + 1, n_members)
            if pairwise[a, b] >= similarity_threshold
        )

        # Keep the lowest-index member of every connected component.
        representatives.extend(
            members[min(component)]
            for component in nx.connected_components(graph)
        )

    print(f"Original spectra: {len(spectra)}")
    print(f"Filtered spectra: {len(representatives)}")
    return representatives

In [34]:
# Collapse near-identical spectra in the sampled subset (cosine >= 0.99 within
# the same InChIKey/adduct group), keeping one representative per cluster.
gnps_subset_filtered = filter_redundant_graph(
    gnps_subset,
    similarity_threshold=0.99,
)

Original spectra: 5000
Filtered spectra: 4917


In [27]:
from pathlib import Path

from matchms.exporting import save_as_mgf

# Destination for the deduplicated negative-mode test set.
output_path = Path("/home/ioannis/thesis_data/testing_negative_cleaned_filtered.mgf")

# NOTE(review): save_as_mgf appears to open the target in append mode, so
# re-running this cell would write the spectra a second time into the same
# file — confirm against the installed matchms version. Removing any stale
# output first makes the cell idempotent either way.
if output_path.exists():
    output_path.unlink()

save_as_mgf(gnps_subset_filtered, str(output_path))

dict_keys(['spectra'])
