# split old validation data
The validation spectra used for training ms2deepscore and spec2vec are reused to create the ms2query training and validation data. 4/5th of these spectra are used for creating training data and 1/5th is used for creating validation data.

In [1]:
import os
import pickle

# Directory that holds the GNPS pickle files read and written below.
# The commented-out lines build the location relative to the current working
# directory; the active assignment is a machine-specific absolute path and has
# to be adjusted before running the notebook elsewhere.
# path_root = os.path.dirname(os.getcwd())
# path_data = os.path.join(os.path.dirname(path_root), "data/gnps_24_11_2021/positive_mode/")
path_data = "C:\\HSD\\OneDrive - Hochschule Düsseldorf\\Data\\ms2query"

In [2]:
def count_annotations(spectra):
    """Print how many spectra carry InChI, SMILES and InChIKey annotations.

    For every annotation type two numbers are printed: how many spectra have a
    non-empty value, and how many distinct non-empty values exist. Missing
    values (``None`` or empty strings) are excluded from both numbers, so a
    missing annotation is never counted as a "unique" value. InChIKeys are
    compared on their first 14 characters only.

    Args:
        spectra: Sized iterable of spectrum-like objects exposing ``get(key)``
            (a plain ``dict`` works as well).

    Returns:
        None. The summary is written to standard output.
    """
    inchis = [spec.get("inchi") for spec in spectra]
    smiles = [spec.get("smiles") for spec in spectra]

    inchikeys = []
    for spec in spectra:
        inchikey = spec.get("inchikey")
        if inchikey is None:
            # Fall back to the alternative key when "inchikey" is not set.
            inchikey = spec.get("inchikey_inchi")
        inchikeys.append(inchikey)

    # Keep only real annotations; the same filtered lists feed both the
    # "annotated" and the "unique" numbers so the two stay consistent.
    annotated_inchis = [value for value in inchis if value]
    annotated_smiles = [value for value in smiles if value]
    annotated_inchikeys = [value for value in inchikeys if value]

    print("nr_of_spectra:", len(spectra))
    print("Inchis:", len(annotated_inchis), "--",
          len(set(annotated_inchis)), "unique")
    print("Smiles:", len(annotated_smiles), "--",
          len(set(annotated_smiles)), "unique")
    print("Inchikeys:", len(annotated_inchikeys), "--",
          len({value[:14] for value in annotated_inchikeys}),
          "unique (first 14 characters)")

In [3]:
# Load the two validation sets. The context managers close each file as soon
# as it has been read instead of leaving the handle open.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open(os.path.join(path_data,
                       "GNPS_15_12_2021_pos_val_250_inchikeys.pickle"), "rb") as pickle_file:
    val_spectra_250_inchikeys = pickle.load(pickle_file)
with open(os.path.join(path_data,
                       "GNPS_15_12_2021_pos_val_3000_spectra.pickle"), "rb") as pickle_file:
    val_spectra_3000_spectra = pickle.load(pickle_file)

In [4]:
# Print the annotation summary of both loaded validation sets.
count_annotations(val_spectra_250_inchikeys)
count_annotations(val_spectra_3000_spectra)

nr_of_spectra: 2817
Inchis: 2817 -- 306 unique
Smiles: 2817 -- 364 unique
Inchikeys: 2817 -- 250 unique (first 14 characters)
nr_of_spectra: 3000
Inchis: 3000 -- 3000 unique
Smiles: 3000 -- 3000 unique
Inchikeys: 3000 -- 3000 unique (first 14 characters)


### split 250 inchikeys
The inchikeys are split so that the training data uses 200 unique inchikeys and the validation data uses 50 unique inchikeys

In [5]:
import numpy as np

# Collect the 14-character InChIKey prefix of every spectrum in the set.
inchikey_list = [spectrum.get("inchikey")[:14]
                 for spectrum in val_spectra_250_inchikeys]
inchikey_set = set(inchikey_list)

# Sort before shuffling: the iteration order of a set of strings differs
# between interpreter sessions (string hash randomisation), so shuffling the
# unsorted list would give a different split on every run despite the seed.
unique_inchikeys = sorted(inchikey_set)

np.random.seed(123)
np.random.shuffle(unique_inchikeys)

# The first 50 inchikeys are used for validation. The remaining ones keep the
# historical name "test", but they select the ms2query *training* spectra.
ms2q_validation_inchikeys = unique_inchikeys[:50]
ms2q_test_inchikeys = unique_inchikeys[50:]

In [6]:
# Sanity check: number of inchikeys in the validation group, then in the
# remaining group, one value per line.
print(len(ms2q_validation_inchikeys), len(ms2q_test_inchikeys), sep="\n")

50
200


In [7]:
def select_spectra_with_inchikey(inchikeys, spectra):
    """Return the spectra whose InChIKey prefix is among ``inchikeys``.

    Args:
        inchikeys: Iterable of InChIKey prefixes (first 14 characters) to
            keep. Any iterable of hashable values is accepted, including
            one-shot iterators.
        spectra: Iterable of spectrum-like objects; ``get("inchikey")`` must
            return a string for each of them.

    Returns:
        List with the matching spectra, in their original order.
    """
    # Build the lookup set once: membership tests become constant-time instead
    # of scanning a list per spectrum, and a one-shot iterable of inchikeys is
    # not exhausted by the first few ``in`` checks.
    wanted_inchikeys = set(inchikeys)
    return [spectrum for spectrum in spectra
            if spectrum.get("inchikey")[:14] in wanted_inchikeys]

In [8]:
# Pick the spectra for both inchikey groups in one pass: the 50 validation
# inchikeys give the validation spectra, the remaining ("test") inchikeys give
# the training spectra.
ms2q_val_spectra_250_inchi, ms2q_train_spectra_250_inchi = (
    select_spectra_with_inchikey(inchikey_group, val_spectra_250_inchikeys)
    for inchikey_group in (ms2q_validation_inchikeys, ms2q_test_inchikeys)
)

In [9]:
# Print the annotation summary of the validation part, then the training part.
count_annotations(ms2q_val_spectra_250_inchi)
count_annotations(ms2q_train_spectra_250_inchi)

nr_of_spectra: 541
Inchis: 541 -- 64 unique
Smiles: 541 -- 76 unique
Inchikeys: 541 -- 50 unique (first 14 characters)
nr_of_spectra: 2276
Inchis: 2276 -- 242 unique
Smiles: 2276 -- 288 unique
Inchikeys: 2276 -- 200 unique (first 14 characters)


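### split 3000 spectra
The 3000 spectra are shuffled and split so that the training data uses 2400 spectra and the validation data uses 600 spectra.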


In [10]:
# Shuffle a copy rather than the loaded list itself: an in-place shuffle would
# make re-running this cell reshuffle already shuffled data and silently
# produce a different split, even with the fixed seed.
np.random.seed(123)
shuffled_spectra_3000 = list(val_spectra_3000_spectra)
np.random.shuffle(shuffled_spectra_3000)

# The first 600 shuffled spectra are used for validation, the rest for training.
ms2q_validation_spectra_3000 = shuffled_spectra_3000[:600]
ms2q_train_spectra_3000 = shuffled_spectra_3000[600:]

In [11]:
# Sanity check: size of the validation part, then of the training part, one
# value per line.
print(len(ms2q_validation_spectra_3000), len(ms2q_train_spectra_3000), sep="\n")

600
2400


In [12]:
# Merge the parts from both splits into the final validation and training
# lists; the spectra from the 3000-spectra split come first in each list.
ms2q_val_spectra = [*ms2q_validation_spectra_3000, *ms2q_val_spectra_250_inchi]
ms2q_train_spectra = [*ms2q_train_spectra_3000, *ms2q_train_spectra_250_inchi]


In [13]:
# Print the annotation summary of the final validation and training sets.
count_annotations(ms2q_val_spectra)
count_annotations(ms2q_train_spectra)

# Store both sets in the data directory. The context managers make sure each
# file is flushed and closed as soon as its dump has finished, instead of
# leaving the write handle open.
with open(os.path.join(path_data, "ms2q_val_spectra.pickle"), "wb") as pickle_file:
    pickle.dump(ms2q_val_spectra, pickle_file)
with open(os.path.join(path_data, "ms2q_train_spectra.pickle"), "wb") as pickle_file:
    pickle.dump(ms2q_train_spectra, pickle_file)

nr_of_spectra: 1141
Inchis: 1141 -- 664 unique
Smiles: 1141 -- 676 unique
Inchikeys: 1141 -- 650 unique (first 14 characters)
nr_of_spectra: 4676
Inchis: 4676 -- 2642 unique
Smiles: 4676 -- 2688 unique
Inchikeys: 4676 -- 2600 unique (first 14 characters)
