The goal of this notebook is to test how the model created for prioritization performs at a larger scale.

In [2]:
from matchms.importing import load_from_mgf

# Read the whole MGF library into a list so the spectra can be indexed and filtered repeatedly.
library = [*load_from_mgf('/home/ioannis/thesis_data/GNPS-NIST14-MATCHES.mgf')]
print('Number of spectra in this library: ', len(library))

Number of spectra in this library:  5763


In [15]:
# Inspect the metadata of the first spectrum exactly as it was loaded from the MGF file.
library[0].metadata

{'charge': 1,
 'ionmode': 'positive',
 'smiles': 'CC(C)C[C@@H](C(=O)O)NC(=O)[C@H](CC1=CC=CC=C1)N',
 'inchi': 'InChI=1S/C15H22N2O3/c1-10(2)8-13(15(19)20)17-14(18)12(16)9-11-6-4-3-5-7-11/h3-7,10,12-13H,8-9,16H2,1-2H3,(H,17,18)(H,19,20)/t12-,13-/m0/s1',
 'scans': '864',
 'ms_level': '2',
 'instrument_type': 'ESI-qTof',
 'file_name': 'MSV000081152/ccms_peak/mzXML_Files/Plate_1/C18p_Plate1_BE11_01_10247.mzXML',
 'peptide_sequence': '*..*',
 'organism_name': 'GNPS-NIST14-MATCHES',
 'compound_name': 'Spectral Match to Phe-Leu from NIST14 M+H',
 'principal_investigator': 'Data from Suryasarathi Dasgupta',
 'data_collector': 'Data deposited by fevargas',
 'submit_user': 'mwang87',
 'confidence': '3',
 'spectrum_id': 'CCMSLIB00003134487',
 'precursor_mz': 279.171}

In [3]:
from collections import Counter
# Tally the 'ionmode' metadata value of every spectrum in the library.
ionmode = [entry.get('ionmode') for entry in library]
print(Counter(ionmode))

Counter({'positive': 5277, 'negative': 484, 'positive-20ev': 2})


In [4]:
# Keep only spectra whose ion mode is exactly 'positive'; any other label
# (for example a variant such as 'positive-20ev') is excluded by this comparison.
positive_library = [
    candidate
    for candidate in library
    if candidate.get('ionmode') == 'positive'
]
print('Spectra with positive ionization: ', len(positive_library))

Spectra with positive ionization:  5277


In [None]:
# Further filter the positive-mode spectra to only those that carry SMILES metadata.
# Only a missing value (None) is rejected here; an empty string would still pass.
positive_with_smiles = [s for s in positive_library if s.get("smiles") is not None]
# The label previously said "Negative", although this cell works on positive-mode spectra.
print("Positive spectra with SMILES:", len(positive_with_smiles))

Negative spectra with SMILES: 4667


In [6]:
from matchms.filtering.default_pipelines import DEFAULT_FILTERS
from matchms import SpectrumProcessor
# Build a spectrum processor configured with matchms' DEFAULT_FILTERS pipeline.
processor = SpectrumProcessor(DEFAULT_FILTERS)

In [7]:
# Run the filter pipeline; the second element of the returned pair is not needed here.
positive_cleaned, _ = processor.process_spectra(positive_with_smiles)
# Discard any None entries left in the processed list.
positive_cleaned = [
    spectrum
    for spectrum in positive_cleaned
    if spectrum is not None
]

Processing spectra:   2%|▏         | 73/4667 [00:00<00:36, 125.21it/s]



Processing spectra:   2%|▏         | 116/4667 [00:00<00:34, 132.76it/s]



Processing spectra:   3%|▎         | 145/4667 [00:01<00:33, 133.58it/s]



Processing spectra:   4%|▍         | 180/4667 [00:01<00:29, 151.93it/s]



Processing spectra:   6%|▋         | 301/4667 [00:02<00:27, 156.99it/s]



Processing spectra:   7%|▋         | 317/4667 [00:02<00:28, 153.82it/s]



Processing spectra:   7%|▋         | 350/4667 [00:02<00:27, 156.29it/s]



Processing spectra:   8%|▊         | 366/4667 [00:02<00:27, 154.25it/s]



Processing spectra:   9%|▉         | 429/4667 [00:02<00:28, 146.69it/s]



Processing spectra:  10%|▉         | 444/4667 [00:03<00:32, 129.77it/s]



Processing spectra:  10%|█         | 474/4667 [00:03<00:30, 137.33it/s]



Processing spectra:  10%|█         | 488/4667 [00:03<00:31, 131.40it/s]



Processing spectra:  11%|█         | 503/4667 [00:03<00:30, 135.08it/s]



Processing spectra:  11%|█         | 519/4667 [00:03<00:29, 141.13it/s]



Processing spectra:  12%|█▏        | 562/4667 [00:04<00:31, 128.48it/s]



Processing spectra:  13%|█▎        | 620/4667 [00:04<00:30, 131.10it/s]



Processing spectra:  14%|█▍        | 652/4667 [00:04<00:28, 141.73it/s]



Processing spectra:  14%|█▍        | 667/4667 [00:04<00:28, 140.57it/s]



Processing spectra:  15%|█▍        | 682/4667 [00:04<00:28, 138.99it/s]



Processing spectra:  16%|█▌        | 743/4667 [00:05<00:30, 129.60it/s]



Processing spectra:  19%|█▊        | 864/4667 [00:06<00:26, 144.63it/s]



Processing spectra:  20%|██        | 943/4667 [00:06<00:25, 145.02it/s]



Processing spectra:  21%|██▏       | 1001/4667 [00:07<00:28, 128.06it/s]



Processing spectra:  23%|██▎       | 1066/4667 [00:07<00:24, 149.37it/s]



Processing spectra:  24%|██▍       | 1115/4667 [00:07<00:23, 153.72it/s]



Processing spectra:  24%|██▍       | 1131/4667 [00:08<00:23, 150.77it/s]



Processing spectra:  26%|██▌       | 1216/4667 [00:08<00:22, 153.52it/s]



Processing spectra:  27%|██▋       | 1249/4667 [00:08<00:22, 150.72it/s]



Processing spectra:  27%|██▋       | 1265/4667 [00:08<00:24, 140.20it/s]



Processing spectra:  28%|██▊       | 1324/4667 [00:09<00:24, 134.88it/s]



Processing spectra:  29%|██▊       | 1341/4667 [00:09<00:23, 141.19it/s]



Processing spectra:  29%|██▉       | 1356/4667 [00:09<00:24, 136.45it/s]



Processing spectra:  30%|██▉       | 1388/4667 [00:09<00:22, 147.22it/s]



Processing spectra:  30%|███       | 1403/4667 [00:09<00:22, 144.77it/s]



Processing spectra:  31%|███       | 1433/4667 [00:10<00:23, 136.01it/s]



Processing spectra:  31%|███▏      | 1460/4667 [00:10<00:25, 127.24it/s]



Processing spectra:  32%|███▏      | 1491/4667 [00:10<00:22, 141.85it/s]



Processing spectra:  33%|███▎      | 1525/4667 [00:10<00:20, 154.15it/s]



Processing spectra:  36%|███▌      | 1657/4667 [00:11<00:21, 141.67it/s]



Processing spectra:  37%|███▋      | 1715/4667 [00:12<00:21, 134.60it/s]



Processing spectra:  37%|███▋      | 1730/4667 [00:12<00:21, 138.39it/s]



Processing spectra:  38%|███▊      | 1760/4667 [00:12<00:21, 135.39it/s]



Processing spectra:  38%|███▊      | 1775/4667 [00:12<00:20, 138.51it/s]



Processing spectra:  39%|███▉      | 1820/4667 [00:12<00:22, 126.20it/s]



Processing spectra:  41%|████      | 1909/4667 [00:13<00:20, 136.28it/s]



Processing spectra:  43%|████▎     | 2005/4667 [00:14<00:16, 158.26it/s]



Processing spectra:  44%|████▍     | 2070/4667 [00:14<00:19, 130.60it/s]



Processing spectra:  45%|████▍     | 2084/4667 [00:14<00:20, 124.05it/s]



Processing spectra:  45%|████▍     | 2098/4667 [00:15<00:20, 125.84it/s]



Processing spectra:  46%|████▋     | 2169/4667 [00:15<00:21, 113.72it/s]



Processing spectra:  48%|████▊     | 2222/4667 [00:16<00:21, 115.64it/s]



Processing spectra:  48%|████▊     | 2238/4667 [00:16<00:19, 124.11it/s]



Processing spectra:  49%|████▊     | 2266/4667 [00:16<00:18, 127.08it/s]



Processing spectra:  49%|████▉     | 2279/4667 [00:16<00:18, 127.30it/s]



Processing spectra:  50%|█████     | 2350/4667 [00:17<00:17, 130.59it/s]



Processing spectra:  51%|█████     | 2364/4667 [00:17<00:18, 124.07it/s]



Processing spectra:  51%|█████     | 2380/4667 [00:17<00:17, 132.75it/s]



Processing spectra:  52%|█████▏    | 2438/4667 [00:17<00:18, 123.12it/s]



Processing spectra:  53%|█████▎    | 2451/4667 [00:17<00:18, 116.82it/s]



Processing spectra:  53%|█████▎    | 2493/4667 [00:18<00:17, 124.80it/s]



Processing spectra:  54%|█████▍    | 2519/4667 [00:18<00:18, 117.70it/s]



Processing spectra:  55%|█████▍    | 2547/4667 [00:18<00:16, 126.73it/s]



Processing spectra:  55%|█████▍    | 2564/4667 [00:18<00:15, 137.49it/s]



Processing spectra:  55%|█████▌    | 2578/4667 [00:18<00:15, 137.37it/s]



Processing spectra:  59%|█████▉    | 2768/4667 [00:20<00:14, 131.72it/s]



Processing spectra:  62%|██████▏   | 2871/4667 [00:20<00:12, 141.97it/s]



Processing spectra:  62%|██████▏   | 2886/4667 [00:21<00:13, 133.54it/s]



Processing spectra:  62%|██████▏   | 2900/4667 [00:21<00:13, 129.29it/s]



Processing spectra:  64%|██████▎   | 2970/4667 [00:21<00:13, 127.64it/s]



Processing spectra:  64%|██████▍   | 3000/4667 [00:21<00:12, 136.58it/s]



Processing spectra:  65%|██████▍   | 3031/4667 [00:22<00:11, 141.52it/s]



Processing spectra:  67%|██████▋   | 3105/4667 [00:22<00:11, 137.58it/s]



Processing spectra:  67%|██████▋   | 3134/4667 [00:22<00:11, 134.56it/s]



Processing spectra:  68%|██████▊   | 3180/4667 [00:23<00:10, 145.91it/s]



Processing spectra:  69%|██████▉   | 3226/4667 [00:23<00:09, 145.15it/s]



Processing spectra:  71%|███████   | 3295/4667 [00:24<00:10, 126.58it/s]



Processing spectra:  71%|███████   | 3308/4667 [00:24<00:11, 118.75it/s]



Processing spectra:  72%|███████▏  | 3365/4667 [00:24<00:09, 133.14it/s]



Processing spectra:  72%|███████▏  | 3379/4667 [00:24<00:09, 134.26it/s]



Processing spectra:  73%|███████▎  | 3410/4667 [00:24<00:08, 142.01it/s]



Processing spectra:  73%|███████▎  | 3425/4667 [00:25<00:08, 139.88it/s]



Processing spectra:  74%|███████▍  | 3455/4667 [00:25<00:09, 133.11it/s]



Processing spectra:  75%|███████▌  | 3516/4667 [00:25<00:08, 131.48it/s]



Processing spectra:  77%|███████▋  | 3575/4667 [00:26<00:07, 141.38it/s]



Processing spectra:  77%|███████▋  | 3606/4667 [00:26<00:08, 130.38it/s]



Processing spectra:  79%|███████▉  | 3677/4667 [00:26<00:07, 134.40it/s]



Processing spectra:  79%|███████▉  | 3709/4667 [00:27<00:06, 144.59it/s]



Processing spectra:  80%|███████▉  | 3724/4667 [00:27<00:06, 144.01it/s]



Processing spectra:  83%|████████▎ | 3854/4667 [00:28<00:05, 154.27it/s]



Processing spectra:  85%|████████▌ | 3971/4667 [00:28<00:04, 147.57it/s]



Processing spectra:  86%|████████▌ | 4003/4667 [00:29<00:04, 143.10it/s]



Processing spectra:  86%|████████▋ | 4036/4667 [00:29<00:04, 152.31it/s]



Processing spectra:  89%|████████▊ | 4133/4667 [00:30<00:03, 142.77it/s]



Processing spectra:  89%|████████▉ | 4163/4667 [00:30<00:03, 137.12it/s]



Processing spectra:  90%|█████████ | 4209/4667 [00:30<00:03, 137.53it/s]



Processing spectra:  90%|█████████ | 4223/4667 [00:30<00:03, 134.66it/s]



Processing spectra:  91%|█████████▏| 4265/4667 [00:31<00:03, 133.16it/s]



Processing spectra:  92%|█████████▏| 4294/4667 [00:31<00:02, 137.63it/s]



Processing spectra:  92%|█████████▏| 4308/4667 [00:31<00:02, 134.22it/s]



Processing spectra:  93%|█████████▎| 4352/4667 [00:31<00:02, 139.61it/s]



Processing spectra:  94%|█████████▎| 4367/4667 [00:31<00:02, 141.01it/s]



Processing spectra:  95%|█████████▌| 4444/4667 [00:32<00:01, 145.63it/s]



Processing spectra:  97%|█████████▋| 4520/4667 [00:32<00:01, 128.95it/s]



Processing spectra:  98%|█████████▊| 4568/4667 [00:33<00:00, 147.70it/s]



Processing spectra: 100%|██████████| 4667/4667 [00:33<00:00, 137.54it/s]


In [8]:
# Number of spectra that remain after processing and None removal.
print(len(positive_cleaned))

4667


In [11]:
from matchms.exporting import save_as_mgf
# Export the cleaned positive-mode spectra to an MGF file.
# NOTE(review): confirm whether save_as_mgf overwrites or appends when the target file
# already exists; if it appends, re-running this cell would duplicate spectra in the file.
save_as_mgf(positive_cleaned, "/home/ioannis/thesis_data/testing_positive_cleaned.mgf")

dict_keys(['spectra'])


In [16]:
# Inspect the first spectrum's metadata after processing, for comparison with the raw metadata.
positive_cleaned[0].metadata

{'charge': 1,
 'ionmode': 'positive',
 'smiles': 'CC(C)C[C@@H](C(=O)O)NC(=O)[C@H](CC1=CC=CC=C1)N',
 'inchi': 'InChI=1S/C15H22N2O3/c1-10(2)8-13(15(19)20)17-14(18)12(16)9-11-6-4-3-5-7-11/h3-7,10,12-13H,8-9,16H2,1-2H3,(H,17,18)(H,19,20)/t12-,13-/m0/s1',
 'scans': '864',
 'ms_level': '2',
 'instrument_type': 'ESI-qTof',
 'file_name': 'MSV000081152/ccms_peak/mzXML_Files/Plate_1/C18p_Plate1_BE11_01_10247.mzXML',
 'peptide_sequence': '*..*',
 'organism_name': 'GNPS-NIST14-MATCHES',
 'compound_name': 'Phe-Leu',
 'principal_investigator': 'Data from Suryasarathi Dasgupta',
 'data_collector': 'Data deposited by fevargas',
 'submit_user': 'mwang87',
 'confidence': '3',
 'spectrum_id': 'CCMSLIB00003134487',
 'precursor_mz': 279.171,
 'adduct': '[M+H]+',
 'retention_index': None,
 'retention_time': None,
 'inchikey': 'RFCVXVPWSPOMFJ-STQMWFEESA-N',
 'parent_mass': 278.163724,
 'formula': 'C15H22N2O3'}

In [17]:
def check_completeness(spectra, metadata):
    """
    Print how many spectra carry a non-empty value for each metadata field.

    A value counts as present when ``spectrum.get(field)`` is not ``None`` and
    its string form is not blank after stripping whitespace.

    Parameters
    ----------
    spectra : sequence of matchms.Spectrum
        Spectra to inspect. Any sized collection of objects exposing
        ``.get(key)`` works.
    metadata : iterable of str
        Metadata field names to report, in the order they should be printed.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    total = len(spectra)
    # Materialise once so a one-shot iterable can be traversed repeatedly.
    fields = list(metadata)
    # Plain dict instead of Counter: the function no longer depends on an
    # import that lives in a different notebook cell.
    counts = dict.fromkeys(fields, 0)
    for spectrum in spectra:
        for field in fields:
            value = spectrum.get(field)
            if value is not None and str(value).strip() != "":
                counts[field] += 1

    # Size the name column to the longest field (never narrower than the
    # previous fixed width of 16) so long names keep the table aligned.
    name_width = max([16, *(len(field) for field in fields)])

    print(f"Total spectra: {total}")
    for field in fields:
        completeness = (counts[field] / total * 100) if total else 0
        # Width 6 fits "100.00", keeping the percentage column aligned too.
        print(f"{field:<{name_width}s}: {counts[field]:6d} / {total}  ({completeness:6.2f}%)")

In [21]:
# Metadata fields whose coverage is reported for the cleaned positive-mode spectra.
metadata = [
    'description',
    'formula',
    'inchi',
    'smiles',
    'adduct',
    'inchikey',
    'collision_energy',
    'fragmentation_method',
    'ms_mass_analyzer',
]
check_completeness(positive_cleaned, metadata)

Total spectra: 4667
description     :      0 / 4667  ( 0.00%)
formula         :   4666 / 4667  (99.98%)
inchi           :   4666 / 4667  (99.98%)
smiles          :   4666 / 4667  (99.98%)
adduct          :   4667 / 4667  (100.00%)
inchikey        :   4666 / 4667  (99.98%)
collision_energy:      0 / 4667  ( 0.00%)
fragmentation_method:      0 / 4667  ( 0.00%)
ms_mass_analyzer:      0 / 4667  ( 0.00%)


In [22]:
# Drop spectra whose 'inchikey' is missing or empty (any falsy value is rejected).
positive_cleaned = [
    spectrum
    for spectrum in positive_cleaned
    if spectrum.get("inchikey")
]
print(len(positive_cleaned))

4666


In [30]:
import networkx as nx
from collections import defaultdict
from matchms.similarity import CosineGreedy
from matchms import calculate_scores

def _spectrum_quality(spectrum):
    """Return ``quality_explained_intensity`` as a float, or 0.0 when missing or unusable."""
    try:
        return float(spectrum.get("quality_explained_intensity", 0))
    except (TypeError, ValueError):
        # Field present but None or non-numeric: rank it as lowest quality
        # instead of aborting the whole deduplication run.
        return 0.0


def filter_redundant_graph(spectra, similarity_threshold=0.99):
    """
    Deduplicate spectra by first grouping on (InChIKey, adduct),
    then clustering within each group using graph-based redundancy filtering.

    Within a group, two spectra are linked when their CosineGreedy score is at
    least ``similarity_threshold``. Each connected component of the resulting
    graph is one cluster, and one representative per cluster is kept.

    The representative is the spectrum with the highest
    ``quality_explained_intensity`` metadata value. Missing, ``None`` or
    non-numeric values count as 0. Ties, including the case where no spectrum
    carries the field, are resolved in favour of the spectrum that appears
    first in the input.

    Parameters
    ----------
    spectra : list of matchms.Spectrum
        Input spectra to filter. Spectra lacking an InChIKey and/or adduct are
        grouped together under the missing value.
    similarity_threshold : float, optional
        Cosine similarity cutoff for redundancy (default = 0.99).

    Returns
    -------
    filtered_spectra : list of matchms.Spectrum
        Deduplicated spectra.

    Notes
    -----
    Prints the number of spectra before and after filtering.
    """

    # group spectra by (inchikey, adduct)
    grouped = defaultdict(list)
    for s in spectra:
        grouped[(s.get("inchikey"), s.get("adduct"))].append(s)

    filtered = []
    similarity_function = CosineGreedy()

    # cluster within each group one by one
    for group_spectra in grouped.values():
        group_size = len(group_spectra)
        if group_size == 1:
            # Nothing to compare against: keep the only member.
            filtered.extend(group_spectra)
            continue

        # All-vs-all similarities inside the group. The same list is used on
        # both sides and only entries with i < j are read below, so the
        # symmetric shortcut avoids scoring every pair twice.
        scores = calculate_scores(
            group_spectra, group_spectra, similarity_function, is_symmetric=True
        )
        scores_array = scores.to_array(name="CosineGreedy_score")

        # Build graph: one node per spectrum, one edge per redundant pair
        G = nx.Graph()
        G.add_nodes_from(range(group_size))
        for i in range(group_size):
            for j in range(i + 1, group_size):
                if scores_array[i, j] >= similarity_threshold:
                    G.add_edge(i, j)

        # Pick one representative per connected component
        for cluster in nx.connected_components(G):
            # sorted() + max() returns the lowest index among equal-quality
            # members, so the choice does not depend on set iteration order.
            best = max(
                sorted(cluster),
                key=lambda idx: _spectrum_quality(group_spectra[idx]),
            )
            filtered.append(group_spectra[best])

    print(f"Original spectra: {len(spectra)}")
    print(f"Filtered spectra: {len(filtered)}")
    return filtered


In [31]:
# Remove redundant spectra using the function's default similarity threshold (0.99).
filtered_cleaned_positive = filter_redundant_graph(positive_cleaned)

Original spectra: 4666
Filtered spectra: 3512


In [26]:
from matchms.exporting import save_as_mgf
# Export the deduplicated positive-mode spectra to an MGF file.
# NOTE(review): confirm whether save_as_mgf overwrites or appends when the target file
# already exists; if it appends, re-running this cell would duplicate spectra in the file.
save_as_mgf(filtered_cleaned_positive, "/home/ioannis/thesis_data/testing_positive_cleaned_filtered.mgf")

dict_keys(['spectra'])
