# Split data
In this notebook, the spectra cleaned in `1_Clean_GNPS_spectra` are split into subsets: first by ion mode (positive and negative), then into fully annotated and not fully annotated spectra.


In [1]:
import pickle
import os
# Folder holding the GNPS pickle files. Set the MS2QUERY_DATA_DIR environment
# variable to run this notebook on another machine; when it is not set, the
# original local path is used.
path_data = os.environ.get(
    "MS2QUERY_DATA_DIR",
    "C:\\HSD\\OneDrive - Hochschule Düsseldorf\\Data\\ms2query")

In [3]:
# Load the spectra from the pickle file in the data folder.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
file_data = os.path.join(path_data, "ALL_GNPS_15_12_2021_inchikeys_from_pubchem.pickle")
with open(file_data, mode="rb") as file:
    gnps_spectra = pickle.load(file)

In [4]:
def count_annotations(spectra):
    """Print a summary of the annotations and peak counts of the given spectra.

    For inchi, smiles and inchikey the number of spectra that carry a
    non-empty value is printed together with the number of distinct values.
    Missing values (None or empty) are excluded from the "unique" counts.
    Inchikeys are compared on their first 14 characters only.

    Args:
        spectra: Sized iterable of spectrum objects that offer ``get(key)``
            for metadata and ``peaks.mz`` for the peak m/z values.

    Returns:
        None. The summary is written to stdout.
    """
    inchi_lst = []
    smiles_lst = []
    inchikey_lst = []
    nr_of_spectra_with_less_than_3_peaks = 0
    nr_of_spectra_with_more_than_500_peaks = 0

    for spec in spectra:
        inchi_lst.append(spec.get("inchi"))
        smiles_lst.append(spec.get("smiles"))
        # Use the "inchikey_inchi" field when no "inchikey" is stored.
        inchikey = spec.get("inchikey")
        if inchikey is None:
            inchikey = spec.get("inchikey_inchi")
        inchikey_lst.append(inchikey)
        nr_of_peaks = len(spec.peaks.mz)
        if nr_of_peaks < 3:
            nr_of_spectra_with_less_than_3_peaks += 1
        if nr_of_peaks > 500:
            nr_of_spectra_with_more_than_500_peaks += 1

    # Drop missing entries first, so that None / "" is not counted as one of
    # the unique annotations.
    inchis = [x for x in inchi_lst if x]
    smiles = [x for x in smiles_lst if x]
    inchikeys = [x for x in inchikey_lst if x]

    print("nr_of_spectra:", len(spectra))
    print("Inchis:", len(inchis), "--", len(set(inchis)), "unique")
    print("Smiles:", len(smiles), "--", len(set(smiles)), "unique")
    print("Inchikeys:", len(inchikeys), "--",
          len({x[:14] for x in inchikeys}), "unique (first 14 characters)")
    print("Spectra with less than 3 peaks:", nr_of_spectra_with_less_than_3_peaks)
    print("Spectra with more than 500 peaks:", nr_of_spectra_with_more_than_500_peaks)

In [5]:
# Print the annotation summary for all loaded spectra, before any splitting.
count_annotations(gnps_spectra)

nr_of_spectra: 403427
Inchis: 384849 -- 27780 unique
Smiles: 384826 -- 37965 unique
Inchikeys: 384805 -- 23122 unique (first 14 characters)
Spectra with less than 3 peaks: 0
Spectra with more than 500 peaks: 0


## Split into negative and positive mode

In [6]:
# Sort every spectrum into one of three lists based on its "ionmode" metadata.
spectrums_positive, spectrums_negative, spectrums_unknown = [], [], []

# One (index, ionmode, compound_name, inchikey) entry per spectrum whose
# ionmode is neither "positive" nor "negative".
logs = []

for i, spec in enumerate(gnps_spectra):
    ionmode = spec.get("ionmode")
    if ionmode == "positive":
        spectrums_positive.append(spec)
        continue
    if ionmode == "negative":
        spectrums_negative.append(spec)
        continue
    logs.append((i, ionmode, spec.get("compound_name"), spec.get("inchikey")))
    spectrums_unknown.append(spec)

In [7]:
# Report how many spectra ended up in each ion mode; whatever is neither
# positive nor negative is reported as lacking an ionmode.
nr_total = len(gnps_spectra)
nr_positive = len(spectrums_positive)
nr_negative = len(spectrums_negative)
nr_without_ionmode = nr_total - nr_positive - nr_negative

print("Total nr of spectra:", nr_total)
print("positive_mode_spectra:", nr_positive)
print("negative_mode_spectra:", nr_negative)
print("spectra_without_ionmode:", nr_without_ionmode)

Total nr of spectra: 403427
positive_mode_spectra: 328226
negative_mode_spectra: 75175
spectra_without_ionmode: 26


In [8]:
# Store the positive and negative mode spectra as separate pickle files.
# The files are opened in `with` blocks so they are flushed and closed before
# any later cell reads them back.
with open(os.path.join(path_data,
                       "ALL_GNPS_15_12_2021_positive_all.pickle"), "wb") as file:
    pickle.dump(spectrums_positive, file)
with open(os.path.join(path_data,
                       "ALL_GNPS_15_12_2021_negative_all.pickle"), "wb") as file:
    pickle.dump(spectrums_negative, file)

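## Remove not fully annotated spectra
A spectrum counts as fully annotated when it has an inchikey of at least 14 characters, a non-empty smiles and a non-empty inchi. Spectra that are not fully annotated are not used as validation or test spectra, but they are used to train Spec2Vec.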

In [6]:
# Read the positive and negative mode spectra back from their pickle files,
# so that this section can be run without redoing the ion mode split.
loaded_sets = []
for pickle_name in ("ALL_GNPS_15_12_2021_positive_all.pickle",
                    "ALL_GNPS_15_12_2021_negative_all.pickle"):
    outfile = os.path.join(path_data, pickle_name)
    with open(outfile, 'rb') as file:
        loaded_sets.append(pickle.load(file))

spectrums_positive, spectrums_negative = loaded_sets

In [9]:
def select_fully_annotated_spectra(spectra):
    """Split spectra into fully annotated and not fully annotated ones.

    A spectrum is fully annotated when its "inchikey" has more than 13
    characters and both its "smiles" and "inchi" are present and non-empty.

    Args:
        spectra: Iterable of spectrum objects that offer ``get(key)``.

    Returns:
        Tuple ``(fully_annotated, not_fully_annotated)`` of two lists, each
        keeping the input order.
    """
    complete = []
    incomplete = []
    for spectrum in spectra:
        inchikey = spectrum.get("inchikey")
        is_complete = inchikey is not None and len(inchikey) > 13
        if is_complete:
            # smiles is checked before inchi; stop at the first missing field.
            for field in ("smiles", "inchi"):
                value = spectrum.get(field)
                if value is None or len(value) == 0:
                    is_complete = False
                    break
        if is_complete:
            complete.append(spectrum)
        else:
            incomplete.append(spectrum)
    return complete, incomplete

In [11]:
# Split the positive mode spectra by annotation completeness, print both
# counts and store each subset in its own pickle file.
complete_spectra_pos, incomplete_spectra_pos = select_fully_annotated_spectra(spectrums_positive)
print(len(complete_spectra_pos))
print(len(incomplete_spectra_pos))
# `with` closes each file handle as soon as its dump has finished.
with open(os.path.join(path_data,
                       "ALL_GNPS_15_12_2021_positive_annotated.pickle"), "wb") as file:
    pickle.dump(complete_spectra_pos, file)
with open(os.path.join(path_data,
                       "ALL_GNPS_15_12_2021_positive_not_annotated.pickle"), "wb") as file:
    pickle.dump(incomplete_spectra_pos, file)

314318
13908


In [12]:
# Split the negative mode spectra by annotation completeness, print both
# counts and store each subset in its own pickle file.
complete_spectra_neg, incomplete_spectra_neg = select_fully_annotated_spectra(spectrums_negative)
print(len(complete_spectra_neg))
print(len(incomplete_spectra_neg))
# `with` closes each file handle as soon as its dump has finished.
with open(os.path.join(path_data,
                       "ALL_GNPS_15_12_2021_negative_annotated.pickle"), "wb") as file:
    pickle.dump(complete_spectra_neg, file)
with open(os.path.join(path_data,
                       "ALL_GNPS_15_12_2021_negative_not_annotated.pickle"), "wb") as file:
    pickle.dump(incomplete_spectra_neg, file)

70455
4720
