# split old validation data
The validation spectra used for training MS2DeepScore and Spec2Vec are reused to create the MS2Query training and validation data. 4/5th of these spectra are used for creating training data and 1/5th are used for creating validation data.

In [16]:
import os
import pickle
# Locate the data directory relative to the current working directory:
# path_root is the parent of the working directory, and the data folder
# lives next to path_root (one more level up).
path_root = os.path.dirname(os.getcwd())
path_data = os.path.join(
    os.path.dirname(path_root),
    "data/gnps_24_11_2021/positive_mode/",
)

In [17]:
def count_annotations(spectra):
    """Print how many spectra carry an InChI, SMILES and InChIKey annotation.

    For each annotation type the number of spectra with a non-empty value
    and the number of distinct non-empty values are printed. InChIKeys are
    compared on their first 14 characters only.

    Args:
        spectra: Sized collection of objects exposing ``get(key)`` for the
            keys "inchi", "smiles", "inchikey" and "inchikey_inchi".
    """
    inchi_lst = []
    smiles_lst = []
    inchikey_lst = []
    for spec in spectra:
        inchi_lst.append(spec.get("inchi"))
        smiles_lst.append(spec.get("smiles"))
        inchikey = spec.get("inchikey")
        if inchikey is None:
            # Fall back to the inchikey stored under "inchikey_inchi".
            inchikey = spec.get("inchikey_inchi")
        inchikey_lst.append(inchikey)

    # Keep only real annotations, so that missing values (None / "") are
    # neither counted as annotated nor as one extra "unique" value.
    inchis = [x for x in inchi_lst if x]
    smiles = [x for x in smiles_lst if x]
    inchikeys = [x for x in inchikey_lst if x]

    print("nr_of_spectra:", len(spectra))
    print("Inchis:", len(inchis), "--", len(set(inchis)), "unique")
    print("Smiles:", len(smiles), "--", len(set(smiles)), "unique")
    print("Inchikeys:", len(inchikeys), "--",
          len({x[:14] for x in inchikeys}), "unique (first 14 characters)")

In [18]:
# Load the two validation sets. The files are opened in `with` blocks so the
# handles are closed as soon as loading finishes.
# NOTE: pickle can execute arbitrary code while loading; only unpickle files
# from a trusted source.
with open(os.path.join(path_data, "GNPS_24_11_2021_pos_val_250_inchikeys.pickle"), "rb") as pickle_file:
    val_spectra_250_inchikeys = pickle.load(pickle_file)
with open(os.path.join(path_data, "GNPS_24_11_2021_pos_val_3000_spectra.pickle"), "rb") as pickle_file:
    val_spectra_3000_spectra = pickle.load(pickle_file)

In [19]:
# Print the annotation summary for both loaded validation sets, in order.
for _loaded_spectra in (val_spectra_250_inchikeys, val_spectra_3000_spectra):
    count_annotations(_loaded_spectra)

nr_of_spectra: 3334
Inchis: 3334 -- 276 unique
Smiles: 3334 -- 356 unique
Inchikeys: 3334 -- 250 unique (first 14 characters)
nr_of_spectra: 3000
Inchis: 3000 -- 3000 unique
Smiles: 3000 -- 3000 unique
Inchikeys: 3000 -- 3000 unique (first 14 characters)


### split 250 inchikeys
The inchikeys are split so that the training data uses 200 unique inchikeys and the validation data uses 50 unique inchikeys

In [20]:
import numpy as np

np.random.seed(123)

# Collect the 14-character InChIKey prefix of every spectrum.
inchikey_list = [spectrum.get("inchikey")[:14]
                 for spectrum in val_spectra_250_inchikeys]
inchikey_set = set(inchikey_list)
# Sort before shuffling: the iteration order of a set of strings changes
# between interpreter sessions (string hash randomisation), so without a
# fixed starting order the seeded shuffle would not give a reproducible split.
unique_inchikeys = sorted(inchikey_set)
np.random.shuffle(unique_inchikeys)

# The first 50 shuffled inchikeys go to validation, the remainder to the
# other split.
ms2q_validation_inchikeys = unique_inchikeys[:50]
ms2q_test_inchikeys = unique_inchikeys[50:]

In [21]:
# Show the size of both inchikey splits, one number per line.
print(len(ms2q_validation_inchikeys), len(ms2q_test_inchikeys), sep="\n")

50
200


In [22]:
def select_spectra_with_inchikey(inchikeys, spectra):
    """Return the spectra whose InChIKey prefix is one of ``inchikeys``.

    Args:
        inchikeys: Iterable of 14-character InChIKey prefixes to select.
        spectra: Iterable of objects exposing ``get(key)`` for the keys
            "inchikey" and "inchikey_inchi".

    Returns:
        A list with the matching spectra, in their original order. Spectra
        without any inchikey are skipped, since they cannot match.
    """
    # Build the lookup set once so each membership test is O(1).
    wanted_inchikeys = set(inchikeys)
    selected_spectra = []
    for spectrum in spectra:
        inchikey = spectrum.get("inchikey")
        if inchikey is None:
            # Same fallback as count_annotations uses.
            inchikey = spectrum.get("inchikey_inchi")
        if inchikey and inchikey[:14] in wanted_inchikeys:
            selected_spectra.append(spectrum)
    return selected_spectra

In [23]:
# Spectra belonging to the 50 validation inchikeys.
ms2q_val_spectra_250_inchi = select_spectra_with_inchikey(
    inchikeys=ms2q_validation_inchikeys,
    spectra=val_spectra_250_inchikeys,
)
# Spectra belonging to the remaining inchikeys (held in ms2q_test_inchikeys);
# these become the training spectra.
ms2q_train_spectra_250_inchi = select_spectra_with_inchikey(
    inchikeys=ms2q_test_inchikeys,
    spectra=val_spectra_250_inchikeys,
)

In [24]:
# Print the annotation summary for the validation and training selections.
for _selected_spectra in (ms2q_val_spectra_250_inchi, ms2q_train_spectra_250_inchi):
    count_annotations(_selected_spectra)

nr_of_spectra: 620
Inchis: 620 -- 59 unique
Smiles: 620 -- 79 unique
Inchikeys: 620 -- 50 unique (first 14 characters)
nr_of_spectra: 2714
Inchis: 2714 -- 217 unique
Smiles: 2714 -- 277 unique
Inchikeys: 2714 -- 200 unique (first 14 characters)


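### split 3000 spectra
The 3000 spectra are shuffled and split so that the training data uses 2400 spectra and the validation data uses 600 spectra.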


In [25]:
np.random.seed(123)
# Shuffle a copy instead of the loaded list itself: an in-place shuffle would
# leave the loaded list permuted, so re-running this cell would shuffle an
# already shuffled list and produce a different split despite the fixed seed.
shuffled_spectra_3000 = list(val_spectra_3000_spectra)
np.random.shuffle(shuffled_spectra_3000)

# The first 600 shuffled spectra go to validation, the remainder to training.
ms2q_validation_spectra_3000 = shuffled_spectra_3000[:600]
ms2q_train_spectra_3000 = shuffled_spectra_3000[600:]

In [26]:
# Show the size of both spectra splits, one number per line.
print(len(ms2q_validation_spectra_3000), len(ms2q_train_spectra_3000), sep="\n")

600
2400


In [27]:
# Combine both sources per split: the spectra from the 3000-spectra set come
# first, followed by the spectra selected by inchikey.
ms2q_val_spectra = [*ms2q_validation_spectra_3000, *ms2q_val_spectra_250_inchi]
ms2q_train_spectra = [*ms2q_train_spectra_3000, *ms2q_train_spectra_250_inchi]


In [31]:
# Print the annotation summary of the final validation and training sets.
count_annotations(ms2q_val_spectra)
count_annotations(ms2q_train_spectra)

# Store both sets. The `with` blocks make sure each file is flushed and closed
# once the dump finishes.
with open(os.path.join(path_data, "ms2query_library_files/ms2q_val_spectra.pickle"), "wb") as val_file:
    pickle.dump(ms2q_val_spectra, val_file)
with open(os.path.join(path_data, "ms2query_library_files/ms2q_train_spectra.pickle"), "wb") as train_file:
    pickle.dump(ms2q_train_spectra, train_file)

nr_of_spectra: 1220
Inchis: 1220 -- 659 unique
Smiles: 1220 -- 679 unique
Inchikeys: 1220 -- 650 unique (first 14 characters)
nr_of_spectra: 5114
Inchis: 5114 -- 2617 unique
Smiles: 5114 -- 2677 unique
Inchikeys: 5114 -- 2600 unique (first 14 characters)
