# Summary of work

This notebook compares the t-SNE embeddings of the same planar Inchikeys between different instruments and instrument types. We find an unexpectedly high degree of similarity between the Velos CID and HCD platforms. The differences between other instrument types are markedly larger. The comparison between qToF and Orbitrap is inconclusive, possibly due to the high volume of data. <br>

Note on the HTML format: because the interactive plots created with plotly inflated the notebook beyond the data limit of Github, I've converted it to a HTML file which is for some reason much smaller but preserves the interactivity of the plots. 

# Load libraries and functions

In [4]:
import pickle
import numpy as np
import pandas as pd

from plotly.subplots import make_subplots
import plotly.graph_objects as go

from processing import get_ids_for_unique_inchikeys
colourdict = {"qToF": "#ff0000", "CID": "#0000ff", "HCD": "#00ff00", "Orbitrap": "#87cefa"}

In [None]:
def comparison_on_common_inchi(query_strings, metadata_df, var = "instrument"):
    ### Function
    # Creates a list of dataframes, with one dataframe for each query string. 
    # These dataframes will only contain the metadata referring to spectra with planar Inchikeys all dataframes have in common, 
    # making comparison based on the variable feasible and fair.
    #
    ### Example
    # comparison_on_common_inchi(query_strings = ["CID", "HCD"], var = "instrument", metadata_df = spectrum_metadata)
    # This code will output a list of two dataframes, with metadata of spectra recorded by instruments containing 'CID' (datasets_with_common_inchi[0]),
    # and one with the metadata of spectra recorded by instruments containing 'HCD' (datasets_with_common_inchi[1]). 
    # Only spectra with inchikeys that these datasets have in common will be included.
    #
    ### Input
    # query_strings: list of strings you want seperate datasets for
    # metadata_df: dataframe with spectrum metadata
    # var: string of the column variable the query strings should be in
    #
    ### Output
    # datasets_with_common_inchi: list of dataframes
    #
    ##############
    
    # Create subset based on query strings
    datasets = []
    inchi_data = []
    for string in query_strings:
        selection = metadata_df[metadata_df[var].str.contains(string, case = False)]
        datasets.append(selection)
        inchi_data.append(selection["planar_inchi"])
    
    # List of unique var values in common
    common_inchi = set.intersection(*map(set, inchi_data))S
    print(len(common_inchi), "unique planar Inchikeys in common between queries")
    
    # Subset datasets to only the var values they have in common
    datasets_with_common_inchi = []
    for i, dataset in enumerate(datasets):
        selection = dataset[dataset["planar_inchi"].isin(common_inchi)]
        datasets_with_common_inchi.append(selection)
        print(f"Found {len(selection)} spectra with common Inchikeys using query '{query_strings[i]}'")
        
    return datasets_with_common_inchi

# Load and prep data

In [15]:
data_dir = "C:/Users/Artur/Documents/Werk/Spec2Vec/Data/"
model_dir = "C:/Users/Artur/Documents/Werk/Spec2Vec/Model/"
embedding_dir = "C:/Users/Artur/Documents/Werk/Spec2Vec/Embeddings/"

# For now, we are only using the spectra that were obtained in positive ion-mode
spectra = pd.read_pickle(data_dir+"ALL_GNPS_210409_positive_cleaned_peaks_processed_s2v.pickle")

# Load the class predictions for each inchikey and shorten the inchikey to the first 14 characters. In case of duplicates, we keep the first occurence and drop the others.
# We only use the first 14 characters of the inchikey (the so-called planar inchikey) because MS spectra cannot be used to meaningfully distinguish compounds beyond these features.
inchikey_classifications = pd.read_csv(data_dir+"ALL_GNPS_210409_positive_processed_annotated_CF_NPC_classes.txt", sep = "\t")
inchikey_classifications.rename(columns = {"inchi_key": "inchikey"}, inplace = True) 

In [6]:
# We retrieve the inchikey and source instrument for all spectra, and look up the predicted classes for the inchikey.
spectrum_id = []
inchikeys = []
instruments = []

for spec in spectra:
    #short_inchikey = spec.get("inchikey")[:14]
    inchikeys.append(spec.get("inchikey"))
    instruments.append(spec.get("instrument"))
    spectrum_id.append(spec.get("spectrum_id"))
spectrum_metadata = pd.DataFrame({"ID": spectrum_id, "inchikey":inchikeys, "instrument": instruments})

# We drop all spectral records without inchikey and match the remaining records with npclassifier and classyfire compound class predictions
spectrum_metadata = spectrum_metadata[spectrum_metadata["inchikey"] != ""]
spectrum_metadata = spectrum_metadata.merge(inchikey_classifications, on = "inchikey", how = "left")
spectrum_metadata["planar_inchi"] = [key[:14] for key in spectrum_metadata["inchikey"]] # add shortened planar inchikey to metadata

inchi_spectra = [spectra[i] for i in spectrum_metadata.index]
print(f'{len(inchi_spectra)} out of {len(spectra)} spectra have an Inchikey')

# We retrieve the IDs of spectra with unique planar Inchikeys
unique_inchi = get_ids_for_unique_inchikeys(inchi_spectra)
print(f'{len(unique_inchi)} out of {len(inchi_spectra)} annotated spectra have a unique Inchikey')

187152 out of 199780 spectra have an Inchikey
16358 out of 187152 annotated spectra have a unique Inchikey


In [16]:
# Read UMAP and t-SNE dataframes
umap_df = pd.read_csv(embedding_dir+"Annotated_weighted0.5_GNPS_210409_positive_UMAP_a1b1_neighbours15.csv", sep="\t", index_col = 0)
tsne_df = pd.read_csv(embedding_dir+"Python395_sklearn0242_weighted_t-SNE_state42.csv", sep=",", index_col = 0)
tsne_df = pd.concat([tsne_df, spectrum_metadata], axis = 1)

pca_tsne = pd.read_csv(embedding_dir+"Annotated_weighted0.5_GNPS_210409_positive_PCA_t-SNE.csv", sep="\t")

tsne_louwen = pd.read_csv(embedding_dir+"ALL_GNPS_210409_positive_cleaned_peaks_processed_s2v_only_annotated_tsne2D.csv", sep=",", index_col = 0, names = ["x", "y"])
tsne_louwen = tsne_louwen.merge(spectrum_metadata, how = "inner", left_index = True, right_on = "ID")

# Comparison of HCD and CID Velos instruments

In [8]:
# We'll start by comparing the spectra of the most popular HCD and CID instruments, which are the Velos platforms. 
# These comprise the vast majority of CID and HCD spectra, and are also nice to start with because they have so many planar inchikeys in common.
HCD_CID_Velos = comparison_on_common_inchi(query_strings = ["HCD; Velos", "CID; Velos"], metadata_df=spectrum_metadata)

539 unique planar Inchikeys in common between queries
Found 10087 spectra with common Inchikeys using query 'HCD; Velos'
Found 9966 spectra with common Inchikeys using query 'CID; Velos'


In [None]:
fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.1, subplot_titles=('t-SNE embedding','PCA + t-SNE', '21/7 Joris t-SNE embedding'))
colourdict = {"HCD; Velos": "#00ff00", "CID; Velos": "#0000ff"}

for data in HCD_CID_Velos:
    instrument = data["instrument"].iloc[0]
    fig.add_trace(go.Scattergl(x=tsne_df.iloc[data.index,0], y=tsne_df.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2), 
                  row=1, col=1)
    fig.add_trace(go.Scattergl(x=pca_tsne.iloc[data.index,0], y=pca_tsne.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2, showlegend = False), 
                  row=1, col=2)
    fig.add_trace(go.Scattergl(x=tsne_louwen.iloc[data.index,0], y=tsne_louwen.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2, showlegend = False), 
                  row=1, col=3)

fig.update_layout(height=500, title_text="Spec2vec comparison of Velos HCD and Velos CID", title_x=0.5, titlefont=dict(size=24), legend={'itemsizing': 'constant'})
fig.update_yaxes(range=[-45, 45]); fig.update_xaxes(range=[-45, 45])
fig.show()

# Comparison of qToF and CID instrument types

In [9]:
qToF_CID = comparison_on_common_inchi(query_strings = ["tof", "CID"], metadata_df=spectrum_metadata)

68 unique planar Inchikeys in common between queries
Found 204 spectra with common Inchikeys using query 'tof'
Found 2598 spectra with common Inchikeys using query 'CID'


In [None]:
fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.1, subplot_titles=('t-SNE embedding','PCA + t-SNE', '21/7 Joris t-SNE embedding'))
colourdict = {"qToF": "#ff0000", "CID": "#0000ff", "HCD": "#00ff00", "Orbitrap": "#87cefa"}

for i, data in enumerate(qToF_CID):
    instrument = ["qToF", "CID"][i]
    fig.add_trace(go.Scattergl(x=tsne_df.iloc[data.index,0], y=tsne_df.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2), 
                  row=1, col=1)
    fig.add_trace(go.Scattergl(x=pca_tsne.iloc[data.index,0], y=pca_tsne.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2, showlegend = False), 
                  row=1, col=2)
    fig.add_trace(go.Scattergl(x=tsne_louwen.iloc[data.index,0], y=tsne_louwen.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2, showlegend = False), 
                  row=1, col=3)

fig.update_layout(height=500, title_text="Spec2vec comparison of qToF and CID instrument types", title_x=0.5, titlefont=dict(size=24), legend={'itemsizing': 'constant'})
fig.update_yaxes(range=[-45, 45]); fig.update_xaxes(range=[-45, 45])
fig.show()

# Comparison of qToF and HCD instrument types

In [10]:
qToF_HCD = comparison_on_common_inchi(query_strings = ["tof", "HCD"], metadata_df=spectrum_metadata)

78 unique planar Inchikeys in common between queries
Found 332 spectra with common Inchikeys using query 'tof'
Found 2672 spectra with common Inchikeys using query 'HCD'


In [None]:
fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.1, subplot_titles=('t-SNE embedding','PCA + t-SNE', '21/7 Joris t-SNE embedding'))
colourdict = {"qToF": "#ff0000", "CID": "#0000ff", "HCD": "#00ff00", "Orbitrap": "#87cefa"}

for i, data in enumerate(qToF_HCD):
    instrument = ["qToF", "HCD"][i]
    fig.add_trace(go.Scattergl(x=tsne_df.iloc[data.index,0], y=tsne_df.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2), 
                  row=1, col=1)
    fig.add_trace(go.Scattergl(x=pca_tsne.iloc[data.index,0], y=pca_tsne.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2, showlegend = False), 
                  row=1, col=2)
    fig.add_trace(go.Scattergl(x=tsne_louwen.iloc[data.index,0], y=tsne_louwen.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2, showlegend = False), 
                  row=1, col=3)

fig.update_layout(height=500, title_text="Spec2vec comparison of qToF and HCD instrument types", title_x=0.5, titlefont=dict(size=24), legend={'itemsizing': 'constant'})
fig.update_yaxes(range=[-45, 45]); fig.update_xaxes(range=[-45, 45])
fig.show()

# Comparison qToF and Orbitrap

In [11]:
qToF_orbitrap = comparison_on_common_inchi(query_strings = ["tof", "orbitrap"], metadata_df=spectrum_metadata)

1892 unique planar Inchikeys in common between queries
Found 10808 spectra with common Inchikeys using query 'tof'
Found 47637 spectra with common Inchikeys using query 'orbitrap'


In [None]:
fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.1, subplot_titles=('t-SNE embedding','PCA + t-SNE', '21/7 Joris t-SNE embedding'))
colourdict = {"qToF": "#ff0000", "CID": "#0000ff", "HCD": "#00ff00", "Orbitrap": "#87cefa"}

for i, data in enumerate(qToF_orbitrap):
    instrument = ["qToF", "Orbitrap"][i]
    fig.add_trace(go.Scattergl(x=tsne_df.iloc[data.index,0], y=tsne_df.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2), 
                  row=1, col=1)
    fig.add_trace(go.Scattergl(x=pca_tsne.iloc[data.index,0], y=pca_tsne.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2, showlegend = False), 
                  row=1, col=2)
    fig.add_trace(go.Scattergl(x=tsne_louwen.iloc[data.index,0], y=tsne_louwen.iloc[data.index,1], legendgroup = instrument, name = instrument, mode='markers', marker_symbol = "circle-open", marker_color = colourdict[instrument], marker_size = 2, showlegend = False), 
                  row=1, col=3)

fig.update_layout(height=500, title_text="Spec2vec comparison of qToF and HCD instrument types", title_x=0.5, titlefont=dict(size=24), legend={'itemsizing': 'constant'})
fig.update_yaxes(range=[-45, 45]); fig.update_xaxes(range=[-45, 45])
fig.show()