# Load in spectra

In [1]:
from matchms.importing.load_from_mzml import load_from_mzml

# Location of the mzML file to analyse (relative to this notebook).
mzml_file = "../../../downloads/Case_studies/Berries/x028_p19_0min_VC_UR_Claudia_r1_DDA_POS.mzML"

# Materialise the loader's result into a list: later cells index it and take its length.
spectra = list(load_from_mzml(mzml_file))


Add ionmode

In [2]:
# Set the "ionmode" metadata field to "positive" on every loaded spectrum.
for spectrum in spectra:
    spectrum.set("ionmode", "positive")

# Sanity check: metadata of the first spectrum and the total number of spectra.
print(spectra[0].metadata)
print(len(spectra))

{'charge': None, 'scan_number': None, 'title': 'x028_p19_0min_VC_UR_Claudia_r1_DDA_POS.2.2. File:"x028_p19_0min_VC_UR_Claudia_r1_DDA_POS.raw", NativeID:"controllerType=0 controllerNumber=1 scan=2"', 'precursor_mz': 198.185440063477, 'scan_start_time': [0.014253333333], 'retention_time': [], 'ionmode': 'positive'}
1014


# Filter spectra
- Normalize intensities
- Remove peaks below 10 m/z
- Remove peaks with intensity < 0.001
- Select only spectra with at least 5 peaks in the range 10-1000 m/z
- Select only spectra that have a precursor/parent mass

In [3]:
from matchms.filtering.default_filters import default_filters
from matchms.filtering.add_parent_mass import add_parent_mass
from matchms.filtering.derive_adduct_from_name import derive_adduct_from_name


# Number of spectra before the filters are applied.
print(len(spectra))


def apply_filters(s):
    """Run the matchms filter chain on a single spectrum and return the result.

    The filters are applied in this order:
      1. default_filters
      2. derive_adduct_from_name
      3. add_parent_mass, called with estimate_from_adduct=False and
         overwrite_existing_entry=True
    """
    cleaned = default_filters(s)
    with_adduct = derive_adduct_from_name(cleaned)
    return add_parent_mass(
        with_adduct,
        estimate_from_adduct=False,
        overwrite_existing_entry=True,
    )


# Replace the list of spectra with their filtered counterparts.
spectra = list(map(apply_filters, spectra))


1014


In [4]:
# Inspect the first spectrum's metadata again, now that apply_filters has been run on every spectrum.
print(spectra[0].metadata)

{'charge': 1, 'scan_number': None, 'title': 'x028_p19_0min_VC_UR_Claudia_r1_DDA_POS.2.2. File:"x028_p19_0min_VC_UR_Claudia_r1_DDA_POS.raw", NativeID:"controllerType=0 controllerNumber=1 scan=2"', 'precursor_mz': 198.185440063477, 'scan_start_time': [0.014253333333], 'retention_time': [], 'ionmode': 'positive', 'compound_name': 'x028_p19_0min_VC_UR_Claudia_r1_DDA_POS.2.2. File:"x028_p19_0min_VC_UR_Claudia_r1_DDA_POS.raw", NativeID:"controllerType=0 controllerNumber=1 scan=2"', 'parent_mass': 197.17816361148624}


In [5]:
from ms2query.spectrum_processing import minimal_processing_multiple_spectra
# Run ms2query's minimal_processing_multiple_spectra over the filtered spectra.
# NOTE(review): presumably this is where the peak-level filtering described under
# "Filter spectra" happens — confirm against the ms2query implementation.
preprocessed_spectra = minimal_processing_multiple_spectra(spectra)
# Number of spectra in the processed result.
print(len(preprocessed_spectra))

1014


In [6]:
# Workaround for printing problems in Jupyter: sys.stdout is saved before importing
# MS2Library and restored right after the import. Without these statements, printed
# progress updates are not shown in the notebook.
import sys
stdout = sys.stdout

from ms2query.ms2library import MS2Library
sys.stdout = stdout

# Input files for the library search, all relative to this notebook.
# NOTE(review): tanimoto_scores_df_file is not used by the cells visible here — confirm it is still needed.
tanimoto_scores_df_file = "../../../downloads/gnps_210409/ALL_GNPS_210409_positive_tanimoto_scores.pickle"
sqlite_file =  "../../../downloads/gnps_210409/spectra/ALL_GNPS_210409_train_split.sqlite"

# Models
s2v_model_file = "../../../downloads/gnps_210409/models/ALL_GNPS_210409_Spec2Vec_ms2query.model"
ms2ds_model_file = "../../../downloads/gnps_210409/models/ms2ds_20210420-141937_data210409_10k_500_500_200.hdf5"
ms2query_model = "../../../downloads/gnps_210409/train_ms2query_model/ms2query_model_all_scores_dropout_regularization.hdf5"

# Embeddings
# NOTE(review): these are pickle files; only use files from a trusted source.
s2v_embeddings_file =  "../../../downloads/gnps_210409/embeddings/s2v_embeddings_train_spectra_210426.pickle"
ms2ds_embeddings_file =  "../../../downloads/gnps_210409/embeddings/ms2ds_embeddings_train_spectra_210426.pickle"


In [7]:
# Build the library object from the sqlite file, the two models and the two embedding files.
ms2library = MS2Library(
    sqlite_file,
    s2v_model_file,
    ms2ds_model_file,
    s2v_embeddings_file,
    ms2ds_embeddings_file,
)

# Only the first 10 preprocessed spectra are used as queries.
query_spectra = preprocessed_spectra[:10]
result = ms2library.analog_search(
    query_spectra,
    ms2query_model,
    preselection_cut_off=2000,
)

Spectrum binning: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 624.56it/s]
Create BinnedSpectrum instances: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9934.40it/s]
Calculating vectors of reference spectrums: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 18.06it/s]


Calculating MS2Deepscore between query embeddings and library embeddings


collecting matches info: 0it [00:00, ?it/s]

Found 8 word(s) missing in the model. Weighted missing percentage not covered by the given model is 4.08%.


collecting matches info: 4it [00:04,  1.01it/s]

Found 8 word(s) missing in the model. Weighted missing percentage not covered by the given model is 4.10%.


collecting matches info: 7it [00:07,  1.09it/s]

Found 1 word(s) missing in the model. Weighted missing percentage not covered by the given model is 0.67%.


collecting matches info: 9it [00:08,  1.27it/s]

Found 2 word(s) missing in the model. Weighted missing percentage not covered by the given model is 1.16%.


collecting matches info: 10it [00:09,  1.03it/s]


In [8]:
# Dump the raw analog_search result. Judging by the printed output, this is a dict keyed by
# query index whose values are tables of per-library-spectrum scores — TODO confirm against ms2query.
print(result)

{0:                     parent_mass*0.001  mass_similarity  s2v_score  \
CCMSLIB00005727702           0.689398     1.990655e-48   0.334417   
CCMSLIB00005727493           0.705393     5.609378e-50   0.160801   
CCMSLIB00005904184           0.822401     2.568431e-61   0.220084   
CCMSLIB00005727425           0.705393     5.609451e-50   0.157412   
CCMSLIB00005903939           0.822401     2.568431e-61   0.214666   
...                               ...              ...        ...   
CCMSLIB00005891491           0.074041     1.166151e-12  -0.049658   
CCMSLIB00005892645           0.102071     6.069353e-10   0.006618   
CCMSLIB00006084715           0.088021     2.639713e-11   0.042721   
CCMSLIB00000219270           0.128993     2.466881e-07   0.114695   
CCMSLIB00000578049           0.108073     2.316278e-09   0.032108   

                    ms2ds_score  average_ms2ds_score_for_inchikey14  \
CCMSLIB00005727702     0.735806                            0.735806   
CCMSLIB00005727493     0.