# Split data
In this notebook the data that was cleaned in 1_Clean_GNPS_spectra is split into different subsets. 


In [1]:
import pickle
import os
root_folder = "C:/Users/jonge094/PycharmProjects/PhD_MS2Query/ms2query/data/gnps_24_11_2021"

In [2]:
outfile = os.path.join(root_folder, "in_between_files_clean_gnps_spectra/ALL_GNPS_24_11_2021_inchikeys_from_pubchem.pickle")
with open(outfile, 'rb') as file:
    gnps_spectra = pickle.load(file)

In [3]:
def count_annotations(spectra):
    inchi_lst = []
    smiles_lst = []
    inchikey_lst = []
    nr_of_spectra_with_less_than_3_peaks = 0
    nr_of_spectra_with_more_than_500_peaks = 0

    for i, spec in enumerate(spectra):
        inchi_lst.append(spec.get("inchi"))
        smiles_lst.append(spec.get("smiles"))
        inchikey = spec.get("inchikey")
        if inchikey is None:
            inchikey = spec.get("inchikey_inchi")
        inchikey_lst.append(inchikey)
        if len(spec.peaks.mz) < 3:
            nr_of_spectra_with_less_than_3_peaks += 1
        if len(spec.peaks.mz) > 500:
            nr_of_spectra_with_more_than_500_peaks += 1

    inchi_count = sum([1 for x in inchi_lst if x])
    smiles_count = sum([1 for x in smiles_lst if x])
    inchikey_count = sum([1 for x in inchikey_lst if x])
    print("nr_of_spectra:", len(spectra))
    print("Inchis:", inchi_count, "--", len(set(inchi_lst)), "unique")
    print("Smiles:", smiles_count, "--", len(set(smiles_lst)), "unique")
    print("Inchikeys:", inchikey_count, "--", 
          len(set([x[:14] for x in inchikey_lst if x])), "unique (first 14 characters)")
    print("Spectra with less than 3 peaks:", nr_of_spectra_with_less_than_3_peaks)
    print("Spectra with more than 500 peaks:", nr_of_spectra_with_more_than_500_peaks)

In [4]:
count_annotations(gnps_spectra)

nr_of_spectra: 403840
Inchis: 352717 -- 27034 unique
Smiles: 352694 -- 37299 unique
Inchikeys: 352673 -- 22615 unique (first 14 characters)
Spectra with less than 3 peaks: 0
Spectra with more than 500 peaks: 19264


## split into negative and positive mode

In [3]:
spectrums_positive = []
spectrums_negative = []
for i, spec in enumerate(gnps_spectra):
    if spec.get("ionmode") == "positive":
        spectrums_positive.append(spec)
    elif spec.get("ionmode") == "negative":
        spectrums_negative.append(spec)
    else:
        print(f"No ionmode found for spectrum {i} ({spec.get('ionmode')})")

No ionmode found for spectrum 342240 (n/a)
No ionmode found for spectrum 342245 (n/a)
No ionmode found for spectrum 342246 (n/a)
No ionmode found for spectrum 342248 (n/a)
No ionmode found for spectrum 342249 (n/a)
No ionmode found for spectrum 342252 (n/a)
No ionmode found for spectrum 342256 (n/a)
No ionmode found for spectrum 342262 (n/a)
No ionmode found for spectrum 342264 (n/a)
No ionmode found for spectrum 342268 (n/a)
No ionmode found for spectrum 342476 (n/a)
No ionmode found for spectrum 342479 (n/a)
No ionmode found for spectrum 342483 (n/a)
No ionmode found for spectrum 342485 (n/a)
No ionmode found for spectrum 342497 (n/a)
No ionmode found for spectrum 342498 (n/a)
No ionmode found for spectrum 342499 (n/a)
No ionmode found for spectrum 342503 (n/a)
No ionmode found for spectrum 342505 (n/a)
No ionmode found for spectrum 342506 (n/a)
No ionmode found for spectrum 342512 (n/a)
No ionmode found for spectrum 342514 (n/a)
No ionmode found for spectrum 342515 (n/a)
No ionmode 

In [4]:
print("Total nr of spectra:", len(gnps_spectra))
print("positive_mode_spectra:", len(spectrums_positive))
print("negative_mode_spectra:", len(spectrums_negative))
print("spectra_without_ionmode:", len(gnps_spectra)-len(spectrums_positive)-len(spectrums_negative))

Total nr of spectra: 403840
positive_mode_spectra: 328550
negative_mode_spectra: 74696
spectra_without_ionmode: 594


In [5]:
pickle.dump(spectrums_positive,
           open(os.path.join(root_folder, "ALL_GNPS_24_11_2021_positive_all.pickle"), "wb"))
pickle.dump(spectrums_negative,
           open(os.path.join(root_folder, "ALL_GNPS_24_11_2021_negative_all.pickle"), "wb"))

## Remove not fully annoated spectra
The not annotated spectra are not used as validation or test spectra, but are used to train Spec2Vec

In [6]:
outfile = os.path.join(root_folder, "ALL_GNPS_24_11_2021_positive_all.pickle")
with open(outfile, 'rb') as file:
    all_positive_spectra = pickle.load(file)
    
outfile = os.path.join(root_folder, "ALL_GNPS_24_11_2021_negative_all.pickle")
with open(outfile, 'rb') as file:
    all_negative_spectra = pickle.load(file)

In [7]:
def select_fully_annotated_spectra(spectra):
    fully_annotated_spectra =[]
    not_fully_annotated_spectra =[]
    for spectrum in spectra:
        inchikey = spectrum.get("inchikey")
        if inchikey is not None and len(inchikey) > 13:
            smiles = spectrum.get("smiles")
            inchi = spectrum.get("inchi")
            if smiles is not None and len(smiles)>0:
                if inchi is not None and len(inchi) >0:
                    fully_annotated_spectra.append(spectrum)
                else: 
                    not_fully_annotated_spectra.append(spectrum)
#                     print(smiles)
#                     print(inchi)
#                     print(inchikey)
            else: 
                not_fully_annotated_spectra.append(spectrum)
#                 print(smiles)
#                 print(inchi)
#                 print(inchikey)
        else: 
            not_fully_annotated_spectra.append(spectrum)
    return fully_annotated_spectra, not_fully_annotated_spectra

In [8]:
complete_spectra_pos, incomplete_spectra_pos = select_fully_annotated_spectra(all_positive_spectra)
print(len(complete_spectra_pos))
print(len(incomplete_spectra_pos))
pickle.dump(complete_spectra_pos,
           open(os.path.join(root_folder, "ALL_GNPS_24_11_2021_positive_annotated.pickle"), "wb"))
pickle.dump(incomplete_spectra_pos,
           open(os.path.join(root_folder, "ALL_GNPS_24_11_2021_positive_not_annotated.pickle"), "wb"))

293801
34749


In [9]:
complete_spectra_neg, incomplete_spectra_neg = select_fully_annotated_spectra(all_negative_spectra)
print(len(complete_spectra_neg))
print(len(incomplete_spectra_neg))
pickle.dump(complete_spectra_neg,
           open(os.path.join(root_folder, "ALL_GNPS_24_11_2021_negative_annotated.pickle"), "wb"))
pickle.dump(incomplete_spectra_neg,
           open(os.path.join(root_folder, "ALL_GNPS_24_11_2021_negative_not_annotated.pickle"), "wb"))

58272
16424
