# Wiley brief analysis

We got a new dataset, let's take a look at it.

In [26]:
from matchms.importing import load_from_msp
from tqdm import tqdm
import pandas as pd

import sys
sys.path.append('..')

In [3]:
original_path = "datasets/WILEY/WILEY12_1.msp"
annotated_path = "datasets/WILEY/20240207_wiley_highres_annotations_all.msp"

In [4]:
# takes 13mins, 24s
original_spectrums = list(load_from_msp(original_path, metadata_harmonization=False))
annotated_spectrums = list(load_from_msp(annotated_path, metadata_harmonization=False))

In [5]:
id = 1
original_spectrums[id].metadata, annotated_spectrums[id].metadata

({'formula': 'DH',
  'casno': '13983205',
  'id': '2',
  'comment': 'SpectrumID: 1113421; Source: HE-1982-0-0; QI: 1000; Class: Other non-metal hydrides',
  'num_peaks': '2',
  'compound_name': 'Deuterohydrogen',
  'nominal_mass': '3',
  'parent_mass': '3.0219268'},
 {'scannumber': '-1',
  'ionmode': 'Positive',
  'spectrumtype': 'Centroid',
  'formula': 'C20H23NO3',
  'inchikey': '',
  'inchi': 'InChI=1S/C20H23NO3/c1-23-18-9-7-16(8-10-18)19(21-11-13-24-14-12-21)15-20(22)17-5-3-2-4-6-17/h2-10,19H,11-15H2,1H3',
  'smiles': 'COC1=CC=C(C=C1)C(CC(=O)C2=CC=CC=C2)N3CCOCC3',
  'authors': '',
  'instrument': '',
  'ionization': '',
  'license': '',
  'comment': 'SpectrumID: 802540; Source: F-53-7863-1; QI: 44; Class: Retro-dihydrochalcones; CASRN not real! |RI:2627|',
  'peak_comments': {105.033489: 'Theoretical m/z 105.033489, Mass diff 0.033 (318.94 ppm), SMILES O=CC1=CC=CC=C1, Annotation [C7H6O-H]+, Rule of HR True',
   161.059701: 'Theoretical m/z 161.059701, Mass diff 0.06 (370.82 ppm), S

In [6]:
len(original_spectrums), len(annotated_spectrums)

(725560, 389652)

### Compare WILEY and NIST datasets

In [19]:
NIST_TRAIN_PATH = '../clean_paper/data/nist/train.jsonl'
NIST_TEST_PATH = '../clean_paper/data/nist/test.jsonl'
NIST_VAL_PATH = '../clean_paper/data/nist/valid.jsonl'

In [21]:
nist_train = pd.read_json(NIST_TRAIN_PATH, lines=True, orient='records')
nist_test = pd.read_json(NIST_TEST_PATH, lines=True, orient='records')
nist_valid = pd.read_json(NIST_VAL_PATH, lines=True, orient='records')

In [22]:
wiley_smiles = [s.metadata["smiles"] for s in annotated_spectrums]

In [27]:
# are wiley smiles canonical?

from utils.spectra_process_utils import remove_stereochemistry_and_canonicalize

canonical_wiley_smiles = [remove_stereochemistry_and_canonicalize(s) for s in tqdm(wiley_smiles)]


100%|██████████| 389652/389652 [01:07<00:00, 5756.31it/s]


In [28]:
nist_train_set = set(nist_train.smiles)
nist_test_set = set(nist_test.smiles)
nist_valid_set = set(nist_valid.smiles)
wiley_smiles_set = set(canonical_wiley_smiles)

print(f"overlap of wiley and nist train: {len(wiley_smiles_set.intersection(nist_train_set))}")
print(f"overlap of wiley and nist test: {len(wiley_smiles_set.intersection(nist_test_set))}")
print(f"overlap of wiley and nist valid: {len(wiley_smiles_set.intersection(nist_valid_set))}")
print(f"unique wiley: {len(wiley_smiles_set)}")
print(f"unique wiley over nist: {len(wiley_smiles_set.difference(nist_train_set.union(nist_test_set).union(nist_valid_set)))}")

overlap of wiley and nist train: 68911
overlap of wiley and nist test: 8464
overlap of wiley and nist valid: 8577
unique wiley: 334992
unique wiley over nist: 249040


False

### Reconstruct SMILES from name

In [2]:
from concurrent.futures import ProcessPoolExecutor, TimeoutError, as_completed
from tqdm import tqdm
import cirpy
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

# Function to resolve compound name to SMILES
def name_to_smiles(name):
    try:
        smiles = cirpy.resolve(name, 'smiles')
        print("cirpy resolved", name, "to", smiles)
        return smiles
    except Exception as e:
        print(f"Failed to resolve SMILES for {name}: {e}")
        return None

# Function to convert SMILES to molecular formula
def smiles_to_formula(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print("mol is None")
        return None
    else:
        return rdMolDescriptors.CalcMolFormula(mol)

# Function to process a single spectrum
def process_spectrum(s):
    s_new = s.clone()

    smiles = name_to_smiles(s.metadata['compound_name'])
    if smiles is None:
        print("Failed to resolve SMILES for", s.metadata['compound_name'])
        return None

    new_formula = smiles_to_formula(smiles)
    old_formula = s.metadata.get("formula", None)
    if old_formula is None or new_formula is None or old_formula != new_formula:
        return None

    s_new.metadata['smiles'] = smiles
    return s_new

# Function to run the multiprocessing with timeout using ProcessPoolExecutor
def parallel_process_spectrums(spectrums, num_workers=4, timeout=10):
    cleaned_spectrums = []

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(process_spectrum, s): s for s in spectrums}

        for future in tqdm(as_completed(futures), total=len(spectrums)):
            try:
                result = future.result(timeout=timeout)
                if result is not None:
                    cleaned_spectrums.append(result)
            except TimeoutError:
                continue 
            except Exception as e:
                continue  
    print(cleaned_spectrums)
    return cleaned_spectrums

### Split original sdf

In [44]:
chunk_size = 100
chunks = [original_spectrums[i:i + chunk_size] for i in range(0, len(original_spectrums), chunk_size)]
print(len(chunks))
cleaned_chunks = []

36278


In [45]:
# save chunks
from pathlib import Path

chunk_dir = Path("datasets/WILEY/preprocessed_chunks")
chunk_dir.mkdir(exist_ok=True)

for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}")
    cleaned_chunk = parallel_process_spectrums(chunk, num_workers=1, timeout=10)
    cleaned_chunks.append(cleaned_chunk)
    break


Processing chunk 1/36278


  0%|          | 0/20 [00:00<?, ?it/s]

Failed to resolve SMILES for Hydrogen


  5%|▌         | 1/20 [00:00<00:04,  4.28it/s]

Failed to resolve SMILES for Deuterohydrogen


 10%|█         | 2/20 [00:00<00:04,  4.42it/s]

Failed to resolve SMILES for Deuterium


 15%|█▌        | 3/20 [00:00<00:03,  4.48it/s]

Failed to resolve SMILES for Methane


 20%|██        | 4/20 [00:00<00:03,  4.51it/s]

Failed to resolve SMILES for Methane


 25%|██▌       | 5/20 [00:01<00:03,  4.52it/s]

Failed to resolve SMILES for Methane


 30%|███       | 6/20 [00:01<00:03,  4.53it/s]

Failed to resolve SMILES for Methane


 35%|███▌      | 7/20 [00:01<00:02,  4.53it/s]

Failed to resolve SMILES for Monodeuteromethane


 40%|████      | 8/20 [00:01<00:02,  4.53it/s]

Failed to resolve SMILES for Dideuteromethane


 45%|████▌     | 9/20 [00:01<00:02,  4.54it/s]

Failed to resolve SMILES for Trideuteromethane


 50%|█████     | 10/20 [00:02<00:02,  4.53it/s]

Failed to resolve SMILES for Methane-D4


 55%|█████▌    | 11/20 [00:02<00:01,  4.53it/s]

Failed to resolve SMILES for Ammonia


 60%|██████    | 12/20 [00:02<00:01,  4.53it/s]

Failed to resolve SMILES for Ammonia


 65%|██████▌   | 13/20 [00:02<00:01,  4.53it/s]

Failed to resolve SMILES for Water


 70%|███████   | 14/20 [00:03<00:01,  4.54it/s]

Failed to resolve SMILES for Neon


 75%|███████▌  | 15/20 [00:03<00:01,  4.54it/s]

Failed to resolve SMILES for Acetylene


 80%|████████  | 16/20 [00:03<00:00,  4.54it/s]

Failed to resolve SMILES for Acetylene


 85%|████████▌ | 17/20 [00:03<00:00,  4.55it/s]

Failed to resolve SMILES for Acetylene


 90%|█████████ | 18/20 [00:03<00:00,  4.54it/s]

Failed to resolve SMILES for Acetylene


 95%|█████████▌| 19/20 [00:04<00:00,  4.55it/s]

Failed to resolve SMILES for Monodeuteroacetylene


100%|██████████| 20/20 [00:04<00:00,  4.53it/s]


[]


In [31]:
cleaned_chunks

[[], [], []]

In [29]:
len(chunks)

146

In [3]:
cirpy.resolve("hydrogen", 'formula')

'H'

In [48]:
import cirpy