# Wheat data illustrative example processing

# Package loading and file path specification

In [1]:
import specxplore.importing
import matchms
import matchms.filtering
import ms2query
import os
import pandas as pd

In [2]:
# Relative input and output paths
# Input paths
models_and_library_folder_path = os.path.join("models", "pos")
input_mgf_filepath = os.path.join("data", "data_wheat_output", "wheat_plus_phophe.mgf")

# ms2query required paths
input_data_folder  = os.path.join("data", "data_wheat_output")
mgf_filename       = os.path.join("wheat_plus_phophe.mgf")
output_ms2query_directory = os.path.join("data", "data_wheat_output") # folder to which ms2query puts the results csv
output_ms2query_filepath = os.path.join("data", "data_wheat_output", "wheat_plus_phophe.csv") # ms2query csv file name derived from input spectrum filename

# metadata for standards
metadata_csv_filepath =  os.path.join("data", "data_phophe_output", "metadata_phophe_standards_pos_processed.csv")

# Output paths
output_filepath = os.path.join("data", "data_wheat_output", "specxplore_wheat.pickle")

# Loading spectral data

In [3]:
spectra_matchms = list(matchms.importing.load_from_mgf(input_mgf_filepath))

# Basic spectral data processing

In [4]:
spectra_matchms = specxplore.importing.apply_basic_matchms_filters_to_spectra(spectra_matchms)

Number of spectra prior to filtering:  671
Number of spectra after to filtering:  646


In [5]:
# Check for sucess
for spec in spectra_matchms:
    assert all(spec.peaks.mz < 1000), "Peak with mz of 1000 or more found."
print('All assertions passed.')

All assertions passed.


In [6]:
# Check for uniqueness of feature_ids 
feature_ids = [spec.get("feature_id") for spec in spectra_matchms]
assert len(feature_ids) == len(set(feature_ids))
print("Uniqueness assertion passed")

Uniqueness assertion passed


# Loading metadata

In [7]:
standards_metadata = pd.read_csv(metadata_csv_filepath)
standards_metadata['feature_id'] = standards_metadata['feature_id'].astype('string')

# Run ms2query

To run or rerun ms2query, set the boolean to True. This step is deactivated as a default as it takes the longest.

In [8]:
if False:
        ms2library = ms2query.create_library_object_from_one_dir(models_and_library_folder_path)
        ms2query.run_ms2query_single_file(
                ms2library = ms2library, 
                folder_with_spectra = input_data_folder,
                spectrum_file_name = mgf_filename, 
                results_folder = output_ms2query_directory,
                settings = ms2query.utils.SettingsRunMS2Query())

# Align ms2query with feature_ids & extract analog classifications


In [9]:
raw_mgf_spectra = list(matchms.importing.load_from_mgf(input_mgf_filepath))
raw_data_spectrum_number = [iloc for iloc in range(1, len(raw_mgf_spectra)+1)]
raw_data_feature_ids = [spec.get('feature_id') for spec in raw_mgf_spectra]
raw_iloc_to_feature_id_mapping = pd.DataFrame({"feature_id": raw_data_feature_ids, "query_spectrum_nr" : raw_data_spectrum_number})
ms2query_annotation_table = pd.read_csv(output_ms2query_filepath)
ms2query_annotation_table = ms2query_annotation_table.merge(raw_iloc_to_feature_id_mapping, how = "left", on="query_spectrum_nr")
ms2query_annotation_table["feature_id"] = ms2query_annotation_table["feature_id"].astype("string") # recasting to string type if not already

In [10]:
# extracting ms2query analog classification table for heuristic highlighting
ms2query_analog_classification = ms2query_annotation_table.loc[:, ['cf_superclass', 'cf_class', 'cf_subclass',
       'cf_direct_parent', 'npc_class_results', 'npc_superclass_results',
       'npc_pathway_results', 'feature_id']]
ms2query_analog_classification

Unnamed: 0,cf_superclass,cf_class,cf_subclass,cf_direct_parent,npc_class_results,npc_superclass_results,npc_pathway_results,feature_id
0,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",Beta amino acids and derivatives,,,,wheat_1961
1,Organoheterocyclic compounds,Furans,Furoic acid and derivatives,Furoic acids,,,,wheat_76
2,Organoheterocyclic compounds,Indoles and derivatives,Indolines,Indolines,Carboline alkaloids,Tryptophan alkaloids,Alkaloids,wheat_198
3,Benzenoids,Benzene and substituted derivatives,Xylenes,m-Xylenes,,,,wheat_301
4,Lipids and lipid-like molecules,Fatty Acyls,Fatty acid esters,Fatty acid esters,Triacylglycerols,Glycerolipids,Fatty acids,wheat_733
...,...,...,...,...,...,...,...,...
641,Phenylpropanoids and polyketides,"Linear 1,3-diarylpropanoids",Chalcones and dihydrochalcones,3-prenylated chalcones,Chalcones,Flavonoids,Shikimates and Phenylpropanoids,17472
642,Phenylpropanoids and polyketides,Flavonoids,O-methylated flavonoids,4'-O-methylated flavonoids,Flavones,Flavonoids,Shikimates and Phenylpropanoids,19755
643,Phenylpropanoids and polyketides,Flavonoids,O-methylated flavonoids,8-O-methylated flavonoids,Flavones,Flavonoids,Shikimates and Phenylpropanoids,20615
644,Phenylpropanoids and polyketides,Flavonoids,Flavans,8-prenylated flavanones,Flavanones,Flavonoids,Shikimates and Phenylpropanoids,21270


# Create basic specXplore session object

In [13]:
# use the specXplore constructor to get a barebones session data object (not suitable for specXplore yet)
specxplore_wheat = specxplore.importing.SessionData(spectra_matchms, models_and_library_folder_path)

Spectrum binning: 100%|██████████| 646/646 [00:00<00:00, 24847.95it/s]
Create BinnedSpectrum instances: 100%|██████████| 646/646 [00:00<00:00, 213482.54it/s]
Calculating vectors of reference spectrums: 100%|██████████| 646/646 [00:19<00:00, 33.91it/s]


# Run k-medoid clustering and t-SNE

In [14]:
# run kmedoid and tsne grid computations for assessing 
specxplore_wheat.attach_kmedoid_grid(k_values=[3, 6, 8, 10, 20, 30, 40, 60, 80, 100, 120, 140, 160, 200])
specxplore_wheat.attach_run_tsne_grid(perplexity_values=[20, 30, 40, 60, 100, 200, 400, 600])

iloc Number-of-Clusters Silhouette-Score
0 3 0.277
1 6 0.214
2 8 0.23
3 10 0.251
4 20 0.245
5 30 0.239
6 40 0.244
7 60 0.24
8 80 0.255
9 100 0.247
10 120 0.261
11 140 0.273
12 160 0.238
13 200 0.247
iloc Perplexity Pearson-score Spearman-score
0 20 0.699 0.697
1 30 0.651 0.642
2 40 0.605 0.577
3 60 0.617 0.592
4 100 0.697 0.694
5 200 0.781 0.774
6 400 0.844 0.852
7 600 0.859 0.872


# Select k-medoid and t-SNE values

In [15]:
# select a particular iloc of the tsne grid with good distance preservation
specxplore_wheat.select_tsne_coordinates(7) 
# select particular iloc(s) for kmedoid cluster assignments to add to class table
specxplore_wheat.select_kmedoid_cluster_assignments([2, 10, 11]) 

# Attach ms2query classifications to class table

In [16]:
specxplore_wheat.attach_addon_data_to_class_table(ms2query_analog_classification)

# Attach standard feature ids to highlight table

In [17]:
specxplore_wheat.construct_highlight_table(standards_metadata["feature_id"].to_list()) 

# Attach metadata

In [18]:
specxplore_wheat.attach_addon_data_to_metadata(standards_metadata)
specxplore_wheat.attach_addon_data_to_metadata(ms2query_annotation_table)
specxplore_wheat.metadata_table

Unnamed: 0.1,feature_id,spectrum_iloc,Unnamed: 0,compound_db_identity,compound_name,compound_annotation_score,mol_formula,adduct,precursor_mz,mz_diff_ppm,...,retention_index,smiles,cf_kingdom,cf_superclass,cf_class,cf_subclass,cf_direct_parent,npc_class_results,npc_superclass_results,npc_pathway_results
0,wheat_1961,0,not available,not available,not available,not available,not available,not available,not available,not available,...,not available,C1=CC(=CC=C1C(=O)NCCC(=O)O)Cl,Organic compounds,Organic acids and derivatives,Carboxylic acids and derivatives,"Amino acids, peptides, and analogues",Beta amino acids and derivatives,not available,not available,not available
1,wheat_76,1,not available,not available,not available,not available,not available,not available,not available,not available,...,not available,OC(=O)C1=COC=C1,Organic compounds,Organoheterocyclic compounds,Furans,Furoic acid and derivatives,Furoic acids,not available,not available,not available
2,wheat_198,2,not available,not available,not available,not available,not available,not available,not available,not available,...,not available,C1CC2=CC=CC=C2N1,Organic compounds,Organoheterocyclic compounds,Indoles and derivatives,Indolines,Indolines,Carboline alkaloids,Tryptophan alkaloids,Alkaloids
3,wheat_301,3,not available,not available,not available,not available,not available,not available,not available,not available,...,not available,CC1=C(C(=CC=C1)C)N,Organic compounds,Benzenoids,Benzene and substituted derivatives,Xylenes,m-Xylenes,not available,not available,not available
4,wheat_733,4,not available,not available,not available,not available,not available,not available,not available,not available,...,not available,CCCCOCCOCCOC(=O)CCCCC(=O)OCCOCCOCCCC,Organic compounds,Lipids and lipid-like molecules,Fatty Acyls,Fatty acid esters,Fatty acid esters,Triacylglycerols,Glycerolipids,Fatty acids
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,17472,641,82.0,Isoxanthohumol: [M+H]+: 0.690,Isoxanthohumol,0.69,C21H22O5,[M+H]+,355.154,-1.42,...,not available,COC1=C(C(=O)\C=C\C2=CC=C(O)C=C2)C(O)=C(CC=C(C)...,Organic compounds,Phenylpropanoids and polyketides,"Linear 1,3-diarylpropanoids",Chalcones and dihydrochalcones,3-prenylated chalcones,Chalcones,Flavonoids,Shikimates and Phenylpropanoids
642,19755,642,83.0,Biochanin A: [M+H]+: 0.578,Biochanin A,0.58,C16H12O5,[M+H]+,285.07575,-1.07,...,not available,COc1ccc(cc1)c2cc(=O)c3c(O)cc(O)cc3o2,Organic compounds,Phenylpropanoids and polyketides,Flavonoids,O-methylated flavonoids,4'-O-methylated flavonoids,Flavones,Flavonoids,Shikimates and Phenylpropanoids
643,20615,643,84.0,not available,not available,not available,not available,not available,not available,not available,...,not available,COc1c(O)cc(O)c2c(=O)cc(oc12)c3ccccc3,Organic compounds,Phenylpropanoids and polyketides,Flavonoids,O-methylated flavonoids,8-O-methylated flavonoids,Flavones,Flavonoids,Shikimates and Phenylpropanoids
644,21270,644,85.0,8-Prenylnaringenin: [M+H]+: 0.630,8-Prenylnaringenin,0.63,C20H20O5,[M+H]+,341.13835,-1.25,...,not available,CC(=CCc1c(O)cc(O)c2C(=O)CC(Oc12)c3ccc(O)cc3)C,Organic compounds,Phenylpropanoids and polyketides,Flavonoids,Flavans,8-prenylated flavanones,Flavanones,Flavonoids,Shikimates and Phenylpropanoids


# Initialize and save specxplore running session

In [19]:
specxplore_wheat.initialize_specxplore_session()

In [20]:
specxplore_wheat.check_and_save_to_file(output_filepath)