# Silicon data exploration

In [16]:
import pickle
import os, sys, inspect
# Make the parent directory importable so that the `dataset` import below resolves.
# The working directory is used rather than inspect.getfile(inspect.currentframe()):
# a notebook cell has no real source file, so the frame's "file" is a synthetic
# name (possibly located in a temp directory), which makes the directory derived
# from it unreliable.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Keep parentdir at the front of sys.path, but do not stack a new duplicate
# entry every time this cell is re-run.
if not sys.path or sys.path[0] != parentdir:
    sys.path.insert(0, parentdir)

from dataset import SpectroDataset, SpectroDataCollator


In [7]:
# Dataset variant and tokenizer suffix; both end up in the pickle file names.
data_type = "8M"
tokenizer_type = "_bbpe_1M" # alternative: ""

# File-name stem shared by the prepared test and train pickles.
prepared_stem = f'{data_type}{tokenizer_type}_bart_prepared_data'
test_data_path = f'trial_set/{prepared_stem}_test.pkl'
train_data_path = f'./trial_set/{prepared_stem}_train.pkl'

# Only the train split is loaded in this cell.
train_data = SpectroDataset(
    train_data_path,
    original=False,
    eval_mode=False,
)

In [9]:
# Count the label sequences that contain the Si token (token number 1068).
si_token_id = 1068
num_of_si = sum(1 for labels in train_data.data.labels if si_token_id in labels)

In [10]:
# Show the Si count; values recorded per dataset are listed in the trailing comment.
num_of_si # 8M: 1607, 2M_derivatized: 149149

1607

In [6]:
# Number of entries in the data held by the loaded train dataset.
len(train_data.data)

4641078

# Derivatization data exploration
### => a more clever approach: substructure matching with RDKit

In [6]:
from rdkit import Chem
import numpy as np

In [7]:
def is_derivatized(mol=None,smiles=None):
    """Tell whether a molecule matches one of the derivatization patterns.

    The molecule is tested against the module-level SMARTS queries
    ``tms_match``, ``meox_match_co`` and ``meox_match_cho``; a hit on any
    of them counts as derivatized.

    Args:
        mol: RDKit molecule to test. Takes precedence over ``smiles``.
        smiles: SMILES string; parsed only when ``mol`` is None.

    Returns:
        bool: True if at least one of the patterns matches.

    Raises:
        ValueError: if neither ``mol`` nor ``smiles`` is given, or if
            ``smiles`` cannot be parsed into a molecule.
    """
    if mol is None:
        if smiles is None:
            raise ValueError("is_derivatized() needs either `mol` or `smiles`")
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # A failed parse yields None; report it here instead of letting
            # AddHs fail on None with an opaque argument error.
            raise ValueError(f"could not parse SMILES: {smiles!r}")

    # Hydrogens are made explicit before matching; AddHs returns a new
    # molecule, so the caller's object is not rebound or replaced.
    mol_with_hs = Chem.AddHs(mol)

    patterns = (tms_match, meox_match_co, meox_match_cho)
    return any(mol_with_hs.HasSubstructMatch(pattern) for pattern in patterns)

In [8]:
# deriv mol pieces
# SMARTS fragment: a silicon atom bearing three methyl groups (trimethylsilyl, TMS).
tms = '[Si]([CH3])([CH3])[CH3]'
# TMS bound to an O, N or S atom that itself has one more neighbour ('*~' = any atom, any bond).
tms_match = Chem.MolFromSmarts('*~[O,N,S]' + tms)
# Same three-methyl shape with a dummy atom ([#0], atomic number 0) in place of the silicon.
tms_match0 = Chem.MolFromSmarts('[#0]([CH3])([CH3])[CH3]')
# Methoxime (C=N-O-CH3) on an aliphatic carbon with two carbon substituents.
meox_match_co = Chem.MolFromSmarts('C([C,c])([C,c])=NO[CH3]')
# Methoxime on a carbon with one hydrogen and one carbon substituent.
meox_match_cho = Chem.MolFromSmarts('[CH]([C,c])=NO[CH3]')
# Methoxime tail (=N-O-CH3) hanging on a dummy atom ([#0]).
meox_match0 = Chem.MolFromSmarts('[#0]=NO[CH3]')
# Carbonyl molecule built from SMILES (not SMARTS).
# NOTE(review): tms_match0, meox_match0 and co are not used by is_derivatized;
# presumably they serve code outside the cells shown here - confirm before removing.
co = Chem.MolFromSmiles('C=O')

In [9]:
# load SMILES from .txt to ndarray (one SMILES per line, surrounding whitespace stripped)
smiles_path = "../../NEIMS/training_splits/test_set_smiles.txt"
# Collect the lines in a list and convert once: np.append returns a fresh copy
# of the whole array on every call, which makes a line-by-line append quadratic.
with open(smiles_path, "r") as f:
    neims_test_smiles = np.array([line.strip() for line in f])

In [6]:
# distinguish derivatized mols
# One flag per entry of neims_test_smiles, in the same order. The flags are
# computed in a single pass (np.append in a loop copies the array each time)
# and stored as booleans, so the array works directly as a mask and its sum
# is an integer count.
isder_array = np.array(
    [is_derivatized(smiles=smiles) for smiles in neims_test_smiles],
    dtype=np.bool_,
)
    

[17:15:00] Conflicting single bond directions around double bond at index 1.
[17:15:00]   BondStereo set to STEREONONE and single bond directions set to NONE.
[17:15:02] Conflicting single bond directions around double bond at index 4.
[17:15:02]   BondStereo set to STEREONONE and single bond directions set to NONE.


In [7]:
# Split the SMILES into derivatized / non-derivatized using isder_array as a boolean mask.
deriv_mask = isder_array.astype(np.bool_)
only_deriv_smiles = neims_test_smiles[deriv_mask]
only_nonderiv_smiles = neims_test_smiles[np.logical_not(deriv_mask)]

In [11]:
# Report the size of both splits. Both counts are taken from the filtered
# arrays so they are integers and computed the same way; summing isder_array
# yields a float (e.g. "531.0") whenever the flags are stored as floats.
total_count = isder_array.shape[0]
print(f"In total there is {only_deriv_smiles.shape[0]}/{total_count} derivatized molecules")
print(f"In total there is {only_nonderiv_smiles.shape[0]}/{total_count} nonderivatized molecules")

In total there is 531.0/11600 derivatized molecules
In total there is 11069/11600 nonderivatized molecules


In [13]:
# write deriv / nonderiv smiles into new files
def _write_smiles(path, smiles_list, header=None, suffix=""):
    """Write one SMILES per line to ``path``, replacing any existing file.

    Args:
        path: destination file.
        smiles_list: iterable of SMILES strings.
        header: optional first line (a newline is appended).
        suffix: text appended to every SMILES before the newline.
    """
    # Plain "w" is enough: the file is only written, never read back.
    with open(path, "w") as out:
        if header is not None:
            out.write(header + "\n")
        out.writelines(f"{s}{suffix}\n" for s in smiles_list)


# FOR BART PREPRO
# Two-column layout "smiles zinc_id"; every row gets the constant id 0.
deriv_smiles_path = "../../NEIMS/training_splits/test_set_smiles_deriv_forBARTprepro.smi"
nonderiv_smiles_path = "../../NEIMS/training_splits/test_set_smiles_nonderiv_forBARTprepro.smi"
_write_smiles(deriv_smiles_path, only_deriv_smiles, header="smiles zinc_id", suffix=" 0")
_write_smiles(nonderiv_smiles_path, only_nonderiv_smiles, header="smiles zinc_id", suffix=" 0")

# FOR NEIMS EXPLORATION
# Bare SMILES, one per line, no header.
deriv_smiles_path = "../../NEIMS/training_splits/test_set_smiles_deriv.smi"
nonderiv_smiles_path = "../../NEIMS/training_splits/test_set_smiles_nonderiv.smi"
_write_smiles(deriv_smiles_path, only_deriv_smiles)
_write_smiles(nonderiv_smiles_path, only_nonderiv_smiles)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.float


float

## NIST20 derivatization data exploration

In [31]:
from matchms.importing import load_from_msp
from tqdm import tqdm
from matchms import Spectrum
from matchms.exporting import save_as_msp


In [1]:
from rdkit import RDLogger 
# Switch off the RDKit loggers matching 'rdApp.*' so RDKit messages do not flood the cell outputs.
RDLogger.DisableLog('rdApp.*') 

In [5]:
%%time
# Load NIST20: read every spectrum from the cleaned .msp file and keep them
# all in a list (the iterable returned by load_from_msp is fully consumed here).
nist_path = "./NIST/NIST_20/20210925_NIST_EI_MS_cleaned.msp"
nist = list(load_from_msp(nist_path, metadata_harmonization=False))

CPU times: user 2min 27s, sys: 2.01 s, total: 2min 29s
Wall time: 2min 29s


In [37]:
# Keep only the NIST spectra whose molecule is derivatized.
# Datapoints with no smiles or inchikey are filtered out (60163 out)
# AND THE SMILES IS CANONIZED along the way.
clean_deriv_nist = []
for spectrum in tqdm(nist):
    meta = spectrum.metadata
    if "smiles" not in meta or "inchikey" not in meta:
        continue

    try:
        # Second positional argument of MolToSmiles is kept as in the original call.
        can_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(meta["smiles"]),True)
        isder = is_derivatized(smiles=can_smiles)
    except Exception:
        # Best effort: a SMILES that cannot be processed is treated as not
        # derivatized and skipped. Catching Exception (not a bare except)
        # lets KeyboardInterrupt still stop this long-running loop.
        continue

    if isder:
        # Store the canonical SMILES and rebuild the Spectrum with the updated metadata.
        meta["canon_smiles"] = can_smiles
        clean_deriv_nist.append(
            Spectrum(
                mz=spectrum.mz,
                intensities=spectrum.intensities,
                metadata=meta,
                metadata_harmonization=False,
            )
        )


100%|██████████| 350618/350618 [02:14<00:00, 2611.94it/s]


In [34]:
# Careful: this does not create a new file, it adds to an existing one -
# re-running the cell therefore writes the spectra again into the same file.
deriv_msp_path = "./NIST/NIST_20deriv/NIST20deriv.msp"
save_as_msp(spectrums=clean_deriv_nist, filename=deriv_msp_path)

In [35]:
# Save the inchikeys of the derivatized spectra in a separate file, one per line.
inchikeys_path = "./NIST/NIST_20deriv/NIST_20deriv_inchikeys.txt"
with open(inchikeys_path, "w+") as f:
    f.writelines(spec.metadata["inchikey"] + "\n" for spec in clean_deriv_nist)

In [36]:
# stats: number of spectra kept in clean_deriv_nist vs. number of spectra loaded into nist
print(f"in NIST database we have {len(clean_deriv_nist)} / {len(nist)} derivatized molecules")



in NIST database we have 8592 / 350618 derivatized molecules
