The goal of this notebook is to find motifs that have passed both MS2LDA and proccess_motifs (the function) filtering and are also reproducible across runs.

In [1]:
from MS2LDA.motif_parser import load_m2m_folder
from MS2LDA.Add_On.MassQL.MassQL4MotifDB import load_motifDB, motifDB2motifs
from MS2LDA.utils import retrieve_spec4doc

from MS2LDA.Add_On.Fingerprints.FP_annotation import annotate_motifs as calc_fingerprints

import pickle
import tomotopy as tp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from adjustText import adjust_text

from rdkit.Chem import RDKFingerprint
from rdkit.DataStructs import TanimotoSimilarity
import numpy as np
from tqdm import tqdm
from rdkit import DataStructs
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
from rdkit.Chem.Draw import MolsToGridImage
from rdkit.Chem import MolFromSmiles
from rdkit.Chem import MolFromSmarts


from typing import Dict, List, Optional
from rdkit.Chem import MolFromSmiles, rdFMCS, RDKFingerprint
from rdkit.DataStructs import TanimotoSimilarity
import numpy as np
from tqdm import tqdm

from collections import defaultdict
from matchms import calculate_scores

import numpy as np
import pandas as pd
from matchms import Spectrum
from matchms.similarity import CosineGreedy


In order to do so, everything that is needed is loaded. Basically, the motif objects and also the results from the process_motifs for each run are loaded. In order for the snippet of code below to work, make sure that the path to the needed files are correct also in your case.

In [2]:
runs = [400, 600, 800, 1000]

motifs = {}
doc2spec_maps = {}
lda_models = {}
results_sos = {}
motif_to_docs = {}
processed_ids = {}

base = "/home/ioannis/thesis_data"

for r in runs:
    # Load motifDB and motifs
    motifDB_1, motifDB_2 = load_motifDB(f"{base}/positive_{r}/motifset_optimized.json")
    motifs[r] = motifDB2motifs(motifDB_2)

    # Load doc2spec_map
    with open(f"{base}/positive_{r}/doc2spec_map.pkl", "rb") as f:
        doc2spec_maps[r] = pickle.load(f)

    # Load LDA model
    lda_models[r] = tp.LDAModel.load(f"{base}/positive_{r}/ms2lda.bin")

    # Load processed results
    with open(f"results_sos_{r}.pkl", "rb") as f:
        cache = pickle.load(f)
        results_sos[r] = cache["results_sos"]
        motif_to_docs[r] = cache["motif_to_docs"]
        processed_ids[r] = cache["results_sos"]["motif_ids"]

    print(f"Run {r}: {len(processed_ids[r])} motifs kept")


Run 400: 173 motifs kept
Run 600: 244 motifs kept
Run 800: 295 motifs kept
Run 1000: 337 motifs kept


process_motifs function doesn't save the matchms spectrum objects, but saves the ids of these motifs. Therefore, since the initial matchms spectrum objects are already loaded, by using the ids we can recover the spectrum objects of the motifs that passed the filtering.

In [11]:
print(motifs[400][0].metadata['motif_id'])

motif_399


In [None]:
print(processed_ids)

{400: [399, 79, 238, 343, 154, 319, 207, 259, 32, 80, 91, 77, 34, 115, 280, 96, 204, 87, 127, 12, 322, 148, 388, 326, 265, 50, 189, 145, 1, 94, 38, 212, 232, 233, 341, 146, 285, 182, 394, 299, 348, 197, 36, 11, 109, 215, 260, 349, 159, 340, 275, 130, 290, 325, 296, 121, 29, 245, 291, 323, 382, 320, 17, 132, 309, 111, 191, 254, 255, 52, 324, 179, 129, 308, 156, 13, 317, 59, 166, 211, 246, 74, 7, 257, 180, 229, 380, 224, 23, 209, 124, 15, 219, 288, 266, 354, 112, 392, 228, 383, 282, 43, 294, 141, 128, 58, 82, 113, 151, 298, 217, 278, 85, 314, 289, 47, 198, 208, 126, 188, 57, 143, 27, 202, 251, 203, 305, 134, 26, 73, 312, 149, 391, 147, 318, 385, 6, 54, 164, 110, 2, 311, 40, 158, 226, 100, 333, 269, 398, 329, 367, 222, 67, 169, 316, 236, 242, 248, 397, 97, 22, 176, 137, 244, 142, 16, 162, 271, 153, 185, 240, 307, 366], 600: [281, 238, 359, 431, 262, 343, 150, 488, 154, 152, 144, 319, 575, 506, 259, 378, 432, 80, 351, 548, 77, 34, 418, 96, 66, 545, 504, 569, 4, 87, 12, 322, 515, 326, 265, 

In [9]:
print(processed_ids[400][0])

399


In [25]:
processed_motifs = {}

for r in runs:
    processed_motifs[r] = []

    for spec in motifs[r]:
        motif_id = int(spec.metadata['motif_id'].split('_')[1])
        if motif_id in processed_ids[r]:
            processed_motifs[r].append(spec)

    print(r, len(processed_motifs[r]))


400 173
600 244
800 295
1000 337


In [27]:
cosine = CosineGreedy(tolerance=0.01)

In [28]:
records = []

scores_400_600 = calculate_scores(processed_motifs[400], processed_motifs[600], cosine)


for spec_400 in processed_motifs[400]:
    motif_id_400 = spec_400.metadata["motif_id"]

    # best match in 600
    row_600 = scores_400_600.scores_by_reference(spec_400, "CosineGreedy_score")
    if len(row_600) == 0:
        best_600_id = None
        score_600 = 0.0
    else:
        best_600_spec, best_600_score = max(row_600, key=lambda x: x[1][0])
        best_600_id = best_600_spec.metadata["motif_id"]
        score_600 = best_600_score[0]

    records.append({
        "motif_400": motif_id_400,
        "best_600": best_600_id,
        "score_600": score_600,
    })

reproducibility_400 = pd.DataFrame(records) 

print(reproducibility_400)

     motif_400   best_600  score_600
0    motif_399  motif_186   1.000000
1     motif_79  motif_421   1.000000
2    motif_238       None   0.000000
3    motif_343  motif_359   1.000000
4    motif_154       None   0.000000
..         ...        ...        ...
168  motif_153   motif_49   0.007348
169  motif_185  motif_354   1.000000
170  motif_240  motif_341   1.000000
171  motif_307   motif_67   0.301092
172  motif_366       None   0.000000

[173 rows x 3 columns]


In [37]:
# reproducibility_400["present_in_all_runs"] = (
#     (reproducibility_400["score_600"] > 0) &
#     (reproducibility_400["score_800"] > 0) &
#     (reproducibility_400["score_1000"] > 0)
# )

# reproducibility_400["high_reproducibility"] = (
#     (reproducibility_400["mean_score"] >= 0.95) &
#     (reproducibility_400["present_in_all_runs"])
# )
reproducibility_400["high_reproducibility"] = (reproducibility_400["score_600"] >= 0.95) 

In [38]:
n_high_95 = reproducibility_400["high_reproducibility"].sum()
print("Motifs present in all runs with mean_score ≥ 0.95:", n_high_95)

Motifs present in all runs with mean_score ≥ 0.95: 76


In [39]:
anchors_400_600_95 = reproducibility_400[reproducibility_400["high_reproducibility"]].copy()
anchors_400_600_95.to_excel("processed_400_600_95.xlsx", index=False)