# Calculate tanimoto scores

In [1]:
import pickle
import os
import pandas as pd
import numpy as np

path_data = "C:\\HSD\\OneDrive - Hochschule Düsseldorf\\Data\\ms2query"
outfile = os.path.join(path_data, "ALL_GNPS_15_12_2021_negative_annotated.pickle")
with open(outfile, 'rb') as file:
    negative_fully_annotated = pickle.load(file)

### Use most frequent InChI for every unique inchikey

In [2]:
from collections import Counter 
  
def most_frequent(List): 
    occurence_count = Counter(List) 
    return occurence_count.most_common(1)[0][0] 

In [3]:
inchikeys_list = []
inchi_list = []
spectrum_ids = []

for s in negative_fully_annotated:
    inchikeys_list.append(s.get("inchikey"))
    inchi_list.append(s.get("inchi"))
    spectrum_ids.append(s.get("spectrumid"))

inchi_array = np.array(inchi_list)    
inchikeys14_array = np.array([x[:14] for x in inchikeys_list])
inchikeys14_unique = list({x[:14] for x in inchikeys_list})
len(inchikeys14_unique)

9941

In [5]:
inchi_mapping = []
ID_mapping = []
spectrum_ID_mapping = []

for inchikey14 in inchikeys14_unique:
    idx = np.where(inchikeys14_array == inchikey14)[0]
    
    inchi = most_frequent([negative_fully_annotated[i].get("inchi") for i in idx])
    inchi_mapping.append(inchi)
    ID = idx[np.where(inchi_array[idx] == inchi)[0][0]]
    ID_mapping.append(ID)
    spectrum_ID_mapping.append(negative_fully_annotated[ID].get("spectrumid"))

In [6]:
metadata = pd.DataFrame(list(zip(inchikeys14_unique,
                                 inchi_mapping,
                                 ID_mapping,
                                 spectrum_ID_mapping)),
                        columns=["inchikey14", "inchi", "ID", "spectrumid"])
metadata.head()

Unnamed: 0,inchikey14,inchi,ID,spectrumid
0,NGTSRFJHFKEKPL,InChI=1S/C25H29N3O3S/c1-3-18-7-8-23-22(13-18)1...,22349,CCMSLIB00006690959
1,JIQYBJXVKDJNDY,InChI=1S/C19H17FN2O2/c1-24-19(23)16-10-14-13-4...,21679,CCMSLIB00006689220
2,QIDMCIFFMHMTBT,InChI=1S/C24H34O6/c1-6-7-8-9-10-12-19(26)30-18...,63145,CCMSLIB00004708095
3,PBKZJIMGHNPKBJ,InChI=1S/C19H14O7/c1-8-6-10(19(24)25)14(12(7-8...,58029,CCMSLIB00004688323
4,LGIFMJHIMJHUBF,InChI=1S/C36H28O5/c1-40-22-24-6-19-33-32-20-15...,483,CCMSLIB00004722193


In [7]:
metadata_file = os.path.join(path_data, "metadata_negative_inchikey_inchi_mapping.csv")
metadata.to_csv(metadata_file)

In [8]:
import numpy as np
from matchms import calculate_scores
from matchms import Spectrum
from matchms.filtering import add_fingerprint
from matchms.similarity import FingerprintSimilarity
from tqdm.notebook import tqdm

# Add fingerprints
fingerprint_spectra = []
for i in tqdm(metadata.ID.values):
    fingerprint_spectra.append(add_fingerprint(negative_fully_annotated[i],
                                               fingerprint_type="daylight",
                                               nbits=2048))

# Specify type and calculate similarities
similarity_measure = FingerprintSimilarity("jaccard")
scores = calculate_scores(fingerprint_spectra, fingerprint_spectra,
                          similarity_measure, is_symmetric=True)


  0%|          | 0/9941 [00:00<?, ?it/s]

In [9]:
results = pd.DataFrame(scores.scores,
                       index = metadata.inchikey14,
                       columns = metadata.inchikey14)
results.head()

inchikey14,NGTSRFJHFKEKPL,JIQYBJXVKDJNDY,QIDMCIFFMHMTBT,PBKZJIMGHNPKBJ,LGIFMJHIMJHUBF,JLMGCBFIPZDHLZ,ACAAVKGSTVOIQB,IWIANZLCJVYEFX,RFNAJUJIDHSCFP,CQIUKKVOEOPUDV,...,HRGWILAIPXJBIB,KTZXEYGDCLLQMH,NADTWWWCPKHBGR,ICLRNDIRYAEKCT,JGGRFKDDIGORCC,FJKRCBVJIUJMSB,JPMYFOBNRRGFNO,HXVZGASCDAGAPS,FSTIKTPQGMHLFJ,VYFYYTLLBUKUHU
inchikey14,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NGTSRFJHFKEKPL,1.0,0.370169,0.274252,0.342056,0.286662,0.228835,0.283594,0.246563,0.176736,0.268525,...,0.310672,0.270625,0.303321,0.391277,0.375694,0.297342,0.311819,0.427686,0.200813,0.122004
JIQYBJXVKDJNDY,0.370169,1.0,0.340617,0.460457,0.363636,0.266667,0.34464,0.248347,0.2,0.329049,...,0.359428,0.316766,0.386693,0.463859,0.457175,0.299385,0.30411,0.256204,0.255878,0.122131
QIDMCIFFMHMTBT,0.274252,0.340617,1.0,0.34322,0.288968,0.289806,0.257732,0.192469,0.26151,0.409269,...,0.264123,0.262343,0.292167,0.382671,0.380866,0.256371,0.236722,0.201827,0.380483,0.075639
PBKZJIMGHNPKBJ,0.342056,0.460457,0.34322,1.0,0.396725,0.285999,0.435845,0.242877,0.200676,0.362005,...,0.361022,0.428571,0.378079,0.523086,0.492818,0.304682,0.3125,0.240964,0.271363,0.124251
LGIFMJHIMJHUBF,0.286662,0.363636,0.288968,0.396725,1.0,0.265509,0.31106,0.199663,0.181191,0.267138,...,0.276501,0.35336,0.322254,0.401464,0.390303,0.281324,0.304243,0.242268,0.2328,0.136033


In [10]:
pickle.dump(results,
           open(os.path.join(path_data, "GNPS_15_12_2021_neg_tanimoto_scores.pickle"), "wb"))