In [None]:
%load_ext autoreload
%autoreload 2

import copy
import os
import sys
from pathlib import Path
import subprocess
from platformdirs import user_cache_dir

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw, AllChem

import ms_pred.common as common
from ms_pred.dag_pred.iceberg_elucidation import candidates_from_pubchem, iceberg_prediction, load_real_spec, load_pred_spec, elucidation_over_candidates, plot_top_mols, explain_peaks, modi_finder

In [None]:
# Switch the kernel's working directory (presumably to the local ms-pred checkout;
# this is an absolute, machine-specific path -- adjust it for your own setup).
%cd /home/roger/ms-pred

# Global configs
Please modify the following paths/configs accordingly. For `python_path` on molgpu, you can either:
1) Set up your own ms-gen environment from source.
2) Use Mrunali's pre-built environment, after telling conda to look in her folder with `conda config --add envs_dirs /home/mrunali/miniconda3/envs`.
3) Clone Mrunali's pre-built environment into your own environment with `conda create --name <your-env> --clone ms-gen`, and then point `python_path` at that environment instead.

In [None]:
# Baseline settings shared by the workflows below; each workflow deep-copies
# this dict and layers its own overrides (adduct, nce, ...) on top.
config = {
    'python_path': '/home/mrunali/miniconda3/envs/ms-gen/bin/python',
    'gen_ckpt': '/home/runzhong/ms-models/iceberg_results_20240630/dag_nist20/split_1_rnd1/version_0/best.ckpt',
    'inten_ckpt': '/home/runzhong/ms-models/iceberg_results_20240630/dag_inten_nist20/split_1_rnd1/version_0/best.ckpt',
    'cuda_devices': 1,
    'batch_size': 8,
    'num_workers': 6,
    'sparse_k': 100,
    'max_nodes': 100,
    'threshold': 0.0,
    'binned_out': False,
    'ppm': 20,
    'num_bins': 15000,
    'dist_func': 'entropy',
}

The following cell enables the SA score. See the instructions in [the RDKit blog](https://greglandrum.github.io/rdkit-blog/posts/2023-12-01-using_sascore_and_npscore.html).
You can safely skip it if you are not passing ``sa_score=True`` to the function ``plot_top_mols``.

In [None]:
# sys.path.append(os.path.join(os.environ['CONDA_PREFIX'],'share','RDKit','Contrib'))
# Put the RDKit Contrib directory on sys.path so that the SA_Score package can be imported.
# The environment location below is hard-coded to one machine; the commented-out line above
# is the CONDA_PREFIX-based alternative.
sys.path.append(os.path.join('/home/roger/miniforge3/envs/ms-main','share','RDKit','Contrib'))
from SA_Score import sascorer

# Elucidation on Pesticide Degradation

### Run SIRIUS to assign formula and adduct

In [None]:
inp_mgf = '/home/roger/ms_collaborators/Joules-Pesticide/sirius.mgf'
candidates_csv = '/home/roger/ms_collaborators/Joules-Pesticide/chemical_transformation_predictions.csv'

# Parse the MGF, then index the MS2 entries by their FEATURE_ID.
all_specs = common.parse_spectra_mgf(inp_mgf)
feat_id_to_specs = {}
for entry in all_specs:
    spec_meta = entry[0]
    if spec_meta['MSLEVEL'] == '2':
        # entry[1][0][1] is the peak data of the first spectrum stored for this entry
        feat_id_to_specs[spec_meta['FEATURE_ID']] = entry[1][0][1]

# Candidate structures; later cells read its 'formula' and 'smiles' columns.
all_candidates = pd.read_csv(candidates_csv)

In [None]:
# Run SIRIUS to explain formula
# Module-level settings read by form_from_mgf below.
profile = 'orbitrap'  # passed to SIRIUS as `-p`
ppm_max = 5  # passed to SIRIUS as `--ppm-max`
# NOTE(review): sirius_path is not referenced anywhere in the visible notebook;
# form_from_mgf launches `sirius` through `conda run -n ms-main` instead.
sirius_path = '/home/roger/miniforge3/envs/ms-main/bin/sirius'
top_k_sirius_preds = 5  # number of leading rows of formula_candidates.tsv kept per feature

def form_from_mgf(inp_mgf):
    """Run SIRIUS formula annotation on an MGF file and collect the top candidates.

    Results are cached in a per-user cache directory keyed by the MGF content hash
    and the SIRIUS options, so SIRIUS is only launched when no successful run is
    recorded for that combination. Reads the module-level settings ``profile``,
    ``ppm_max`` and ``top_k_sirius_preds``.

    Args:
        inp_mgf: path to the input MGF file.

    Returns:
        dict mapping feature id (the last ``_``-separated token of each SIRIUS
        output directory name) to a list of dicts with keys ``rnk`` (1-based row
        position), ``adduct`` (spaces removed) and ``form`` (molecular formula).
    """
    import shlex  # local import: only needed to quote paths for the shell command

    sirius_config_cmd = f'--ignore-formula --noCite formula ' \
                        f'-p {profile} --ppm-max={ppm_max} write-summaries'
    exp_hash = common.md5(inp_mgf) + '||' + sirius_config_cmd
    out_dir = Path(user_cache_dir(f"ms-pred/sirius-out/{common.str_to_hash(exp_hash)}"))
    out_dir.mkdir(parents=True, exist_ok=True)

    success_marker = out_dir / 'sirius_run_successful'
    if not success_marker.exists():
        # Quote both paths: the command goes through a shell, so a space or other
        # shell metacharacter in either path would otherwise split the argument.
        sirius_command = (f'/home/roger/miniforge3/condabin/conda run -n ms-main '
                          f'sirius -o {shlex.quote(str(out_dir))} '
                          f'-i {shlex.quote(str(inp_mgf))} ' + sirius_config_cmd)
        print("Running SIRIUS, command:\n" + sirius_command + "\n")
        run_result = subprocess.run(sirius_command, shell=True)

        if run_result.returncode == 0:  # successful
            success_marker.touch()
        else:
            # Keep the best-effort behaviour (parse whatever was written) but make
            # the failure visible; the missing marker forces a re-run next time.
            print(f"WARNING: SIRIUS exited with return code {run_result.returncode}; "
                  f"results in {out_dir} may be missing or incomplete.")

    feature_id_to_form = {}
    for per_cmpd_out_dir in out_dir.glob('*'):
        feature_id = per_cmpd_out_dir.stem.split('_')[-1]
        sirius_cands_path = per_cmpd_out_dir / 'formula_candidates.tsv'
        if not sirius_cands_path.exists():
            continue
        # Only the first top_k_sirius_preds rows are kept, so stop reading there
        # instead of iterating over (and skipping) the remaining rows.
        top_rows = pd.read_csv(sirius_cands_path, sep='\t').head(top_k_sirius_preds)
        feature_id_to_form[feature_id] = [
            dict(rnk=idx + 1,
                 adduct=sirius_row['adduct'].replace(" ", ""),
                 form=sirius_row['molecularFormula'])
            for idx, sirius_row in top_rows.iterrows()
        ]
    return feature_id_to_form

In [None]:
# Run SIRIUS
feature_id_to_form = form_from_mgf(inp_mgf)

# prepare entries: for every feature, group the candidate SMILES whose formula
# equals one of the SIRIUS formulae, keyed by the adduct SIRIUS assigned.
feat_id_to_dict = {}
for feature_id, all_info in feature_id_to_form.items():
    adduct_to_smiles = {}
    for info in all_info:
        for cand_form, cand_smi in zip(all_candidates['formula'], all_candidates['smiles']):
            # an empty formula difference means both formulae are the same
            if len(common.formula_difference(info['form'], cand_form)) == 0:
                adduct_to_smiles.setdefault(info['adduct'], []).append(cand_smi)

    if not adduct_to_smiles:
        continue

    # remove duplicate inchikey (np.unique keeps the first occurrence of each key)
    for adduct, smiles in list(adduct_to_smiles.items()):
        inchikeys = [common.inchikey_from_smiles(smi) for smi in smiles]
        _, uniq_idx = np.unique(inchikeys, return_index=True)
        adduct_to_smiles[adduct] = np.array(smiles)[uniq_idx].tolist()

    feat_id_to_dict[feature_id] = dict(
        feature_id=feature_id,
        spec={'nan': feat_id_to_specs[feature_id]},
        adduct_to_smiles=adduct_to_smiles,
    )

In [None]:
# write feature matchings for analysis
# Collect, for every candidate row, the "feat_id=..., adduct" labels of all
# features whose candidate list contains that row's SMILES.
row_to_labels = {}
for feat in feat_id_to_dict.values():
    for adduct, smis in feat['adduct_to_smiles'].items():
        label = f"feat_id={feat['feature_id']}, {adduct}"
        for smi in smis:
            for row_idx in all_candidates.index[all_candidates['smiles'] == smi]:
                row_to_labels.setdefault(row_idx, []).append(label)
row_to_match = {row_idx: '; '.join(labels) for row_idx, labels in row_to_labels.items()}

# Build an explicit Series aligned to the frame's index instead of assigning the
# raw dict: unmatched rows become NaN and the result does not depend on how
# DataFrame.__setitem__ happens to interpret a dict value.
all_candidates['matched features'] = pd.Series(row_to_match, dtype=object).reindex(all_candidates.index)
# NOTE(review): the output path is the input path with '2' appended (i.e. '...csv2').
all_candidates.to_csv(candidates_csv + '2', index=False)
all_candidates

### Run ICEBERG prediction

In [None]:
def pesticide_elucidation(info_dict, vis_peaks=False, energy=None):
    """Run ICEBERG prediction and candidate ranking for every adduct of one feature.

    The original version returned from inside the adduct loop, so only the first
    adduct was ever processed; all adducts are now handled.

    Args:
        info_dict: dict with keys ``'feature_id'``, ``'spec'`` (the experimental
            spectrum passed to the ranking/explanation helpers) and
            ``'adduct_to_smiles'`` (adduct string -> list of candidate SMILES).
        vis_peaks: if True, call ``explain_peaks`` for every ranked candidate.
        energy: collision energies passed to ``iceberg_prediction``;
            defaults to ``[30, 40, 60]`` when omitted.

    Returns:
        None if there are no adducts; the ``plot_top_mols`` image if there is
        exactly one adduct (same as before); otherwise a list with one image per
        adduct, in the iteration order of ``info_dict['adduct_to_smiles']``.
    """
    if energy is None:  # avoid sharing one mutable default list between calls
        energy = [30, 40, 60]

    base_config = copy.deepcopy(config)
    base_config['nce'] = True # use nce for collision energy
    base_config['step_collision_energy'] = True  # step collision energy i.e. multiple spectrum are obtained then merged
    base_config['real_spec_type'] = 'raw'

    imgs = []
    for adduct, smiles in info_dict['adduct_to_smiles'].items():
        # Fresh per-adduct config so one adduct's settings cannot leak into the next.
        pesticide_config = dict(base_config, adduct=adduct)

        # Run ICEBERG to predict spectra
        result_path, pmz = iceberg_prediction(smiles, energy, **pesticide_config)

        # Compare spectrum similarity for elucidation
        topk_results = elucidation_over_candidates(result_path, info_dict["spec"], precursor_mass=pmz, mol_name=info_dict["feature_id"], topk=15, **pesticide_config)

        # Plot top results
        imgs.append(plot_top_mols(topk_results))

        # Visualize and explain peaks
        if vis_peaks:
            for smi, _, __ in topk_results:
                explain_peaks(result_path, info_dict["spec"], pmz, smi, num_peaks=10, **pesticide_config)

    if not imgs:
        return None
    return imgs[0] if len(imgs) == 1 else imgs

In [None]:
# Elucidate feature '1505' and also explain the peaks of the ranked candidates.
pesticide_elucidation(info_dict=feat_id_to_dict['1505'], vis_peaks=True)

# Reaction discovery

In [None]:
# NOTE: this rebinds inp_mgf, all_specs and feat_id_to_specs from the pesticide section.
inp_mgf = '/home/roger/ms_collaborators/Bo-reaction-discovery/20240817_orbitrap_msms/sirius.mgf'
all_specs = common.parse_spectra_mgf(inp_mgf)

# Index the MS2 entries of the MGF by their FEATURE_ID.
feat_id_to_specs = {}
for entry in all_specs:
    spec_meta = entry[0]
    if spec_meta['MSLEVEL'] == '2':
        feat_id_to_specs[spec_meta['FEATURE_ID']] = entry[1][0][1]

In [None]:
# Run sirius
# Formula annotation for the reaction-discovery MGF (inp_mgf was rebound in the
# previous cell); the last line displays the candidates found for feature '2171'.
feature_id_to_form = form_from_mgf(inp_mgf)
feature_id_to_form['2171']

In [None]:
# visualize candidates
import itertools

# One candidate per ordering of the three substituents over the three open positions.
substituents = ['(Cl)', '(Br)', '(O)']
cand_smiles = [
    common.rm_stereo(f'C1{a}=CC=CC=C1/N=C(C=C)/CCC{b}C{c}')
    for a, b, c in itertools.permutations(substituents, 3)
]
# cand_smiles += candidates_from_pubchem('C13H15BrClNO').tolist()

# Draw at most the first 15 candidates.
mols = [Chem.MolFromSmiles(smi) for smi in cand_smiles][:15]
Draw.MolsToGridImage(mols, molsPerRow=5, subImgSize=(250,250), maxMols=len(mols))

In [None]:
feature_id = '2171'
# Hand-built entry with the same layout as the values of feat_id_to_dict.
reaction_info = {
    'feature_id': feature_id,
    'spec': {'nan': feat_id_to_specs[feature_id]},
    'adduct_to_smiles': {'[M+H]+': cand_smiles},
}
pesticide_elucidation(info_dict=reaction_info, vis_peaks=True, energy=[20, 35])

# Elucidation on Withanolide data

In [None]:
def spec_from_csv(csv_paths, precursor, colli_eng='collision 30', ppm=20):
    """Build a consensus reference spectrum from replicate CSV files.

    Each CSV is read with ``header=2`` and passed through
    ``common.process_spec_file``. Starting from the first replicate, a peak is
    kept only if every later replicate has at least one peak within ``ppm`` of
    its m/z; the intensities of the matching peaks are added. Intensities are
    finally scaled so that the largest one is 1.

    Args:
        csv_paths: iterable of CSV paths, one per replicate.
        precursor: precursor m/z, forwarded as ``parentmass``.
        colli_eng: key under which the spectrum is stored in the returned dict.
        ppm: relative m/z tolerance (parts per million) for matching peaks.

    Returns:
        dict ``{colli_eng: array}`` whose array has columns (m/z, intensity).

    Raises:
        ValueError: if ``csv_paths`` is empty or no peak survives the filtering.
    """
    clean_ref_spec = None
    for csv_path in csv_paths:
        spec_df = pd.read_csv(csv_path, header=2)
        ref_spec = common.process_spec_file({'parentmass': precursor}, [(colli_eng, np.array(spec_df))], merge_specs=False)
        if clean_ref_spec is None:
            clean_ref_spec = ref_spec
            continue

        replicate_peaks = ref_spec[colli_eng]
        new_clean_peaks = []
        for mz, inten in clean_ref_spec[colli_eng]:
            match = np.abs(mz - replicate_peaks[:, 0]) / mz < 1e-6 * ppm  # only keep peaks that exist in all replicates
            if np.any(match):
                # Several replicate peaks can fall inside the tolerance window;
                # sum them instead of calling .item(), which only accepts one.
                new_clean_peaks.append((mz, inten + replicate_peaks[match, 1].sum()))
        # reshape keeps a well-formed (0, 2) array when nothing matched
        clean_ref_spec = {colli_eng: np.array(new_clean_peaks, dtype=float).reshape(-1, 2)}

    if clean_ref_spec is None:
        raise ValueError("spec_from_csv needs at least one CSV path")
    if len(clean_ref_spec[colli_eng]) == 0:
        raise ValueError(f"No peaks shared by all replicates within {ppm} ppm: {list(csv_paths)}")

    clean_ref_spec[colli_eng][:, 1] /= np.max(clean_ref_spec[colli_eng][:, 1])
    return clean_ref_spec

def mol_ok(mol):
    """Return True if ``Chem.SanitizeMol(mol)`` succeeds, False if it raises ValueError."""
    try:
        Chem.SanitizeMol(mol)
    except ValueError:
        return False
    else:
        return True

def ring_OK(mol):
    """Check the ring systems of ``mol`` against three exclusion rules.

    A molecule without ring atoms always passes. Otherwise it is rejected if it
    has two cumulated double bonds between ring atoms, any ring with more than
    six atoms, or a double bond between atoms in 3- or 4-membered rings.

    Returns:
        True if none of the rules is violated, False otherwise.
    """
    def _matches(smarts):
        return mol.HasSubstructMatch(Chem.MolFromSmarts(smarts))

    if not _matches("[R]"):
        return True

    has_ring_allene = _matches("[R]=[R]=[R]")

    largest_ring_size = max(len(ring) for ring in mol.GetRingInfo().AtomRings())
    has_macro_cycle = largest_ring_size > 6

    has_small_ring_double_bond = _matches("[r3,r4]=[r3,r4]")

    return not (has_ring_allene or has_macro_cycle or has_small_ring_double_bond)

def generate_candidates(base_mols, all_rxn_smarts):
    """Apply every reaction SMARTS to every base molecule and collect valid products.

    Args:
        base_mols: one molecule or an iterable of molecules; a non-iterable
            argument is wrapped in a list.
        all_rxn_smarts: iterable of reaction SMARTS strings.

    Returns:
        (all_new_mols, candidate_pathways): ``all_new_mols`` is a numpy array of
        the products that pass ``mol_ok`` and ``ring_OK``, de-duplicated by
        InChIKey over all reactions and base molecules (in the order np.unique
        returns the keys); ``candidate_pathways`` is a list of
        (base SMILES, product SMILES) pairs, de-duplicated only within each
        (reaction, base molecule) combination.
    """
    try:
        iter(base_mols)
    except TypeError:
        # a single molecule was passed
        base_mols = [base_mols]

    candidate_pathways = []
    all_new_mols = []
    for rxn_smarts in all_rxn_smarts:
        rxn = AllChem.ReactionFromSmarts(rxn_smarts)
        for base_mol in base_mols:
            # keep the first product of each product set if it is chemically acceptable
            accepted = []
            for product_set in rxn.RunReactants((base_mol,)):
                if mol_ok(product_set[0]) and ring_OK(product_set[0]):
                    accepted.append(product_set[0])
            accepted = np.array(accepted)

            product_keys = np.array([Chem.MolToInchiKey(product) for product in accepted])
            _, first_seen = np.unique(product_keys, return_index=True)
            accepted = accepted[first_seen]

            candidate_pathways.extend(
                (Chem.MolToSmiles(base_mol), Chem.MolToSmiles(product)) for product in accepted
            )
            all_new_mols.extend(accepted.tolist())

    # global de-duplication across reactions and base molecules
    all_keys = np.array([Chem.MolToInchiKey(product) for product in all_new_mols])
    _, first_seen = np.unique(all_keys, return_index=True)
    return np.array(all_new_mols)[first_seen], candidate_pathways

def add_atom(base_mols, new_atom='O'):  # Hydrogen is ignored, new_atom='C' means 'CH2'
    """Generate candidates that carry one extra ``new_atom``.

    The atom is appended to any atom of the base molecule(s); for oxygen, a
    second rule additionally turns a double bond into a three-membered ring
    containing the new atom. Returns whatever ``generate_candidates`` returns.
    """
    append_template = "[*:1]>>[*:1]X"  # append
    # "[*:1]-[*:2]>>[*:1]X[*:2]".replace("X", new_atom),  # insert
    ring_template = "[*:1]=[*:2]>>[*:1]1X[*:2]1"  # double bond -> C-O-C ring

    templates = [append_template]
    if new_atom == 'O':
        templates.append(ring_template)

    all_rxn_smarts = [template.replace("X", new_atom) for template in templates]
    return generate_candidates(base_mols, all_rxn_smarts)

def add_unsaturation(base_mols):  # add double bond or add one ring
    """Generate candidates with one more degree of unsaturation.

    Either a bond order is raised (single -> double, double -> triple) or a new
    3- to 6-membered ring is closed between two non-ring atoms that carry at
    least one hydrogen. Returns whatever ``generate_candidates`` returns.
    """
    bond_order_smarts = [
        "[*:1]-[*:2]>>[*:1]=[*:2]",  # single bond -> double bond
        "[*:1]=[*:2]>>[*:1]#[*:2]",  # double bond -> triple bond
    ]
    # rings, ordered by size of the ring being closed (3, 4, 5, 6)
    ring_closure_smarts = [
        "[*;!r;!H0:1]~[*;!r:2]~[*;!r;!H0:3]>>[*:1]1~[*:2]~[*:3]1",
        "[*;!r;!H0:1]~[*!r:2]~[*!r:3]~[*;!r;!H0:4]>>[*:1]1~[*:2]~[*:3]~[*:4]1",
        "[*;!r;!H0:1]~[*!r:2]~[*:3]~[*:4]~[*;!r;!H0:5]>>[*:1]1~[*:2]~[*:3]~[*:4]~[*:5]1",
        "[*;!r;!H0:1]~[*!r:2]~[*:3]~[*:4]~[*!r:5]~[*;!r;!H0:6]>>[*:1]1~[*:2]~[*:3]~[*:4]~[*:5]~[*:6]1",
    ]
    return generate_candidates(base_mols, bond_order_smarts + ring_closure_smarts)

def withanolide_elucidation(meta, visual_peaks=False):
    """Predict spectra for the candidates of one compound and rank them.

    Args:
        meta: dict with keys ``'adduct'``, ``'candidate_smiles'``, ``'ref_spec'``
            and ``'cmpd_name'``; ``'real_smi'`` is optional.
        visual_peaks: if True, call ``explain_peaks`` for ``meta['real_smi']``
            when that key is present, otherwise for every candidate SMILES.

    Returns:
        The image returned by ``plot_top_mols`` for the top-30 ranking.
    """
    withanolide_config = copy.deepcopy(config)
    withanolide_config.update(real_spec_type='raw', adduct=meta["adduct"])

    # Run ICEBERG to predict spectra (single collision energy of 30)
    result_path, pmz = iceberg_prediction(meta["candidate_smiles"], [30], **withanolide_config)

    # Compare spectrum similarity for elucidation
    topk_results = elucidation_over_candidates(
        result_path,
        meta["ref_spec"],
        precursor_mass=pmz,
        mol_name=meta["cmpd_name"],
        topk=30,
        **withanolide_config,
    )

    # Plot top results
    img = plot_top_mols(topk_results)

    # Visualize and explain peaks
    if visual_peaks:
        all_smiles = [meta['real_smi']] if 'real_smi' in meta else meta["candidate_smiles"]
        for smi in all_smiles:
            explain_peaks(result_path, meta["ref_spec"], pmz, smi, num_peaks=10, **withanolide_config)

    return img

In [None]:
# Reference structures. Every SMILES is passed through common.rm_stereo
# (presumably strips the stereochemistry annotations -- confirm in ms_pred.common).
precursor = common.rm_stereo(r'C[C@]12C(C[C@@H](O)CC2)=CC[C@]3([H])[C@]1([H])CC[C@@]4(C)[C@@]3([H])CC[C@@H]4[C@H](C)CC/C(C)=C(C)/C')
cmpd1_smi = common.rm_stereo(r'C[C@]12C(C[C@@H](O)CC2)=CC[C@]3([H])[C@]1([H])CC[C@@]4(C)[C@@]3([H])CC[C@@H]4[C@H](C)[C@H](O)C/C(C)=C(C)/C')
cmpd5_smi = common.rm_stereo(r'CC(C)=C(C)CC(O)C(C)C1CCC2C3CC=C4CC(O)CC(O)C4(C)C3CCC12C')
cmpd35_smi = common.rm_stereo(r'C[C@]12C(C[C@@H](O)C[C@@H]2O)=CCC3C1CC[C@@]4(C)C3CC[C@@H]4[C@H](C)[C@H](O)C/C(C)=C(CO)/C')
cmpd36_smi = common.rm_stereo(r'C[C@]12C(C[C@@H](O)CC2O)=CCC3C1CC[C@@]4(C)C3CC[C@@H]4[C@H](C)C5OC(C(C)=C(C)C5)=O')

# Per-compound metadata, keyed by an entry name.
# - Entries with "start_smi" + "form_change" get their "candidate_smiles" generated
#   by the loop at the end of this cell; the other entries list candidates explicitly.
# - "spec_name" and "replicate_suffix" are combined by that loop into the replicate
#   CSV file names, and the resulting spectrum is stored under "ref_spec".
# - "real_smi" is optional (see cmpd6, cmpd41, cmpd42).
withanolide_meta = {}

withanolide_meta['cmpd1'] = {
    "cmpd_name": "cmpd1",
    "precursor": 397.3465,
    "adduct": '[M+H-H2O]+',
    "spec_name": 'cmpd1-ms2',
    "replicate_suffix": ('1', '2', '3'),
    "start_smi": precursor,
    "form_change": '+O',
    "real_smi": cmpd1_smi,
}

withanolide_meta['cmpd5'] = {
    "cmpd_name": "cmpd5",
    "precursor": 413.3415,
    "adduct": '[M+H-H2O]+',
    "spec_name": 'cmpd5-ms2',
    "replicate_suffix": ('1', '2'),
    "start_smi": cmpd1_smi,
    "form_change": '+O',
    "real_smi": cmpd5_smi,
}

withanolide_meta['cmpd35'] = {
    "cmpd_name": "cmpd35",
    "precursor": 429.336,
    "adduct": '[M+H-H2O]+',
    "spec_name": 'cmpd35-ms2',
    "replicate_suffix": ('1', '2'),
    "start_smi": cmpd5_smi,
    "form_change": '+O',
    "real_smi": cmpd35_smi,
}

# update 2024/07/24
# cmpd6 has no "real_smi"; it starts from the same structure as cmpd5 (cmpd1_smi + O).
withanolide_meta['cmpd6'] = {
    "cmpd_name": "cmpd6",
    "precursor": 413.3415,
    "adduct": '[M+H-H2O]+',
    "spec_name": 'cmpd6-ms2',
    "replicate_suffix": ('1', '2'),
    "start_smi": cmpd1_smi,
    "form_change": '+O',
}

withanolide_meta['cmpd36'] = {
    "cmpd_name": "cmpd36",
    "precursor": 443.315,
    "adduct": '[M+H]+',
    "spec_name": 'cmpd36-ms2',
    "replicate_suffix": ('1', '2'),
    "candidate_smiles": [cmpd36_smi],
    "real_smi": cmpd36_smi,
}

withanolide_meta['cmpd41'] = {
    "cmpd_name": "cmpd41",
    "precursor": 441.29959,
    "adduct": '[M+H]+',
    "spec_name": 'cmpd41-ms2',
    "replicate_suffix": ('1', '2'),
    "candidate_smiles": [
        common.rm_stereo(r'C[C@]12C(CC(CC2O)=O)=CCC3C1CC[C@@]4(C)C3CC[C@@H]4[C@H](C)C5OC(C(C)=C(C)C5)=O'),
        common.rm_stereo(r'C[C@]12C(C[C@@H](O)CC2=O)=CCC3C1CC[C@@]4(C)C3CC[C@@H]4[C@H](C)C5OC(C(C)=C(C)C5)=O'),
]
}

withanolide_meta['cmpd42'] = {
    "cmpd_name": "cmpd42",
    "precursor": 521.25675,
    "adduct": '[M+H]+',
    "spec_name": 'cmpd42-ms2',
    "replicate_suffix": ('1', '2'),
    "candidate_smiles": [
        common.rm_stereo(r'C[C@]12C(CC(CC2OS(=O)(O)=O)=O)=CCC3C1CC[C@@]4(C)C3CC[C@@H]4[C@H](C)C5OC(C(C)=C(C)C5)=O'),
        common.rm_stereo(r'C[C@]12C(C[C@@H](OS(=O)(O)=O)CC2=O)=CCC3C1CC[C@@]4(C)C3CC[C@@H]4[C@H](C)C5OC(C(C)=C(C)C5)=O'),
    ]
}

# Same two candidate structures as 'cmpd42' (listed in the opposite order), with the [M-H]- adduct.
# NOTE(review): "cmpd_name" is 'cmpd42', identical to the positive-mode entry -- confirm this is intended.
withanolide_meta['cmpd42-neg'] = {
    "cmpd_name": "cmpd42",
    "precursor": 519.2422,
    "adduct": '[M-H]-',
    "spec_name": 'cmpd42-ms2-neg',
    "replicate_suffix": ('1', '2'),
    "candidate_smiles": [
        common.rm_stereo(r'C[C@]12C(C[C@@H](OS(=O)(O)=O)CC2=O)=CCC3C1CC[C@@]4(C)C3CC[C@@H]4[C@H](C)C5OC(C(C)=C(C)C5)=O'),
        common.rm_stereo(r'C[C@]12C(CC(CC2OS(=O)(O)=O)=O)=CCC3C1CC[C@@]4(C)C3CC[C@@H]4[C@H](C)C5OC(C(C)=C(C)C5)=O')
    ]
}


# Complete every metadata entry in place: derive candidates where only a starting
# structure is given, then attach the consensus reference spectrum.
withanolide_spec_dir = '/home/roger/ms_collaborators/Erin-withanolide/20240326_withanolide_intermediates_MS2_centroided'
for meta in withanolide_meta.values():
    # generate candidate smiles
    if "start_smi" in meta:
        start_mol = Chem.MolFromSmiles(common.rm_stereo(meta["start_smi"]))
        # only additions ("+<atom>") are supported
        assert "+" == meta["form_change"][0]
        mol_candidates, _ = add_atom(start_mol, meta["form_change"][1:])
        meta["candidate_smiles"] = list(map(Chem.MolToSmiles, mol_candidates))

    # read spec from csv (one file per replicate)
    real_spec_paths = []
    for suffix in meta["replicate_suffix"]:
        real_spec_paths.append(f'{withanolide_spec_dir}/{meta["spec_name"]}-{suffix}.csv')
    meta["ref_spec"] = spec_from_csv(real_spec_paths, meta["precursor"])

In [None]:
# Rank the cmpd5 candidates and explain the peaks of its known structure.
withanolide_elucidation(meta=withanolide_meta['cmpd5'], visual_peaks=True)

# Modifinder

In [None]:
# meta1: compound with a known structure; meta2: compound that differs from it by form_diff.
meta1 = withanolide_meta['cmpd1']
meta2 = withanolide_meta['cmpd6']
form_diff = '+O'

# both spectra must share one adduct, since a single config is used below
assert meta1['adduct'] == meta2['adduct']
withanolide_config = copy.deepcopy(config)
withanolide_config.update(
    real_spec_type='raw',
    adduct=meta1["adduct"],
    max_nodes=500,
    sparse_k=500,
)

# Run ICEBERG to predict spectra for mol1
result_path, pmz = iceberg_prediction(meta1["candidate_smiles"], [30], **withanolide_config)

imgs = modi_finder(
    form_diff, result_path, meta1["real_smi"],
    meta1["cmpd_name"], meta1["precursor"], meta1["ref_spec"], "raw",
    meta2["cmpd_name"], meta2["precursor"], meta2["ref_spec"], "raw",
    topk_peaks=15, return_thresh=0.1
)

# one figure per returned image
for img in imgs:
    fig, ax = plt.subplots()
    ax.axis('off')
    ax.set_title('Possible modification sites')
    ax.imshow(img)

# Elucidation on Broad data

In [None]:
def broad_elucidation(spec_file, formula, real_smi=None, name="", vis_peaks=False, **kwargs):
    """Rank the PubChem candidates of ``formula`` against one experimental spectrum.

    Args:
        spec_file: base name (without ``.ms``) of the spectrum file in the
            hard-coded spec_files directory.
        formula: molecular formula used to fetch candidates from PubChem.
        real_smi: optional SMILES forwarded as ``real_smiles`` to the ranking
            step and used for peak explanation.
        name: label forwarded as ``mol_name``.
        vis_peaks: if True, call ``explain_peaks`` for ``real_smi``.
        **kwargs: overrides applied on top of the shared ``config`` (after
            ``nce=True`` has been set).

    Returns:
        The image returned by ``plot_top_mols``.
    """
    # shared config + nce collision energies + caller overrides, in that order
    broad_config = copy.deepcopy(config)
    broad_config['nce'] = True
    broad_config.update(kwargs)

    exp_spec_path = f'/home/roger/ms_collaborators/broad-Julian/spec_files/{spec_file}.ms'

    # Get candidates
    smiles = candidates_from_pubchem(formula)

    # Run ICEBERG to predict spectra
    collision_energies = [10, 20, 30, 40, 50]
    result_path, pmz = iceberg_prediction(smiles, collision_energies, **broad_config)

    # Compare spectrum similarity for elucidation
    topk_results = elucidation_over_candidates(
        result_path,
        exp_spec_path,
        precursor_mass=pmz,
        mol_name=name,
        real_smiles=real_smi,
        **broad_config,
    )

    # Plot top results
    img = plot_top_mols(topk_results)

    # Visualize and explain peaks
    if vis_peaks:
        explain_peaks(result_path, exp_spec_path, pmz, real_smi, num_peaks=10, **broad_config)

    return img


## GABA-Arg isomers elucidation

In [None]:
broad_elucidation(
    spec_file='mxp4308',
    formula='C10H21N5O3',
    real_smi=r'C(C[C@@H](C(=O)O)NC(=O)CCCN)CN=C(N)N',
    name='GABA-Arg',
    vis_peaks=False,
)

In [None]:
broad_elucidation(
    spec_file='mxp4309',
    formula='C10H21N5O3',
    real_smi=r'O=C(NCCCC(O)=O)C(N)CCC/N=C(N)\N',
    name='Arg-GABA',
)

In [None]:
broad_elucidation(
    spec_file='mxp4155',
    formula='C10H21N5O3',
    real_smi=r'C[C@@H](C(=O)N[C@@H](CCCCN=C(N)N)C(=O)O)N',
    name='Alanylhomoaruginine',
)

In [None]:
broad_elucidation(
    spec_file='mxp4156',
    formula='C10H21N5O3',
    real_smi=r'N/C(N)=N\CCCC(C(O)=O)NC(C(N)CC)=O',
    name='2-(2-aminobutanamido)-5- [(diaminomethylidene)amino]pentanoic acid',
)

In [None]:
broad_elucidation(
    spec_file='mxp4157',
    formula='C10H21N5O3',
    real_smi=r'O=C(O)C(N)CCCNC(CCC/N=C(N)\N)=O',
    name='2-amino-5-{4 [(diaminomethylidene)amino]butanamido}pentanoic acid',
)

## OH-Tryptophan elucidation

In [None]:
broad_elucidation(
    spec_file='mxp4310',
    formula='C11H12N2O3',
    real_smi='C1=CC2=C(C(=C1)O)C(=CN2)CC(C(=O)O)N',
    name='4-Hydroxy Tryptophan',
)

In [None]:
broad_elucidation(
    spec_file='mxp4310_all',
    formula='C11H12N2O3',
    real_smi='C1=CC2=C(C(=C1)O)C(=CN2)CC(C(=O)O)N',
    name='4-Hydroxy Tryptophan (all peaks)',
)

In [None]:
broad_elucidation(
    spec_file='mxp0141',
    formula='C11H12N2O3',
    real_smi='C1=CC2=C(C=C1O)C(=CN2)C[C@@H](C(=O)O)N',
    name='5-Hydroxy Tryptophan',
)

In [None]:
broad_elucidation(
    spec_file='mxp0141_all',
    formula='C11H12N2O3',
    real_smi='C1=CC2=C(C=C1O)C(=CN2)C[C@@H](C(=O)O)N',
    name='5-Hydroxy Tryptophan (all peaks)',
)

In [None]:
broad_elucidation(
    spec_file='mxp4412',
    formula='C11H12N2O3',
    real_smi='C1=CC2=C(C(=C1)O)NC=C2CC(C(=O)O)N',
    name='7-Hydroxy Tryptophan',
)

In [None]:
broad_elucidation(
    spec_file='mxp4412_all',
    formula='C11H12N2O3',
    real_smi='C1=CC2=C(C(=C1)O)NC=C2CC(C(=O)O)N',
    name='7-Hydroxy Tryptophan (all peaks)',
)

In [None]:
broad_elucidation(
    spec_file='mxp4411',
    formula='C11H12N2O3',
    real_smi='C1=CC2=C(C=C1O)NC=C2C[C@@H](C(=O)O)N',
    name='6-Hydroxy Tryptophan',
)

In [None]:
broad_elucidation(
    spec_file='mxp4411_all',
    formula='C11H12N2O3',
    real_smi='C1=CC2=C(C=C1O)NC=C2C[C@@H](C(=O)O)N',
    name='6-Hydroxy Tryptophan (all peaks)',
)

In [None]:
# No display name is given for this spectrum, so `name` keeps its default ("").
broad_elucidation(
    spec_file='QI9873',
    formula='C11H12N2O3',
    real_smi='C1=CC2=C(C=C1)NC=C2C(O)[C@@H](C(=O)O)N',
)

# Tripeptide elucidation

In [None]:
broad_elucidation(
    spec_file='mxp4039',
    formula='C17H32N4O4',
    real_smi=r'CC(C)C[C@@H](C(=O)O)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCCN)N',
    name='Lys-Pro-Leu',
)

In [None]:
broad_elucidation(
    spec_file='mxp4040',
    formula='C17H32N4O4',
    real_smi=r'CC[C@H](C)[C@@H](C(=O)O)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCCN)N',
    name='Lys-Pro-Ile',
)

In [None]:
broad_elucidation(
    spec_file='mxp4041',
    formula='C17H32N4O4',
    real_smi=r'CC(C)C[C@@H](C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCCN)C(=O)O)N',
    name='Leu-Pro-Lys',
)

In [None]:
broad_elucidation(
    spec_file='mxp4042',
    formula='C17H32N4O4',
    real_smi=r'CC[C@H](C)[C@@H](C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCCN)C(=O)O)N',
    name='Ile-Pro-Lys',
    vis_peaks=True,
)

In [None]:
broad_elucidation(
    spec_file='QI8422',
    formula='C17H32N4O4',
    real_smi='CC(C)CC(C(=O)N1CCCC1C(=O)O)NC(=O)C(CCCCN)N',
    name='Unknown-Lys-Leu-Pro',
    vis_peaks=True,
)

Visualize the real spectrum difference (small-scale molecular networking)

In [None]:
import itertools
from sklearn.manifold import MDS
from ms_pred.retrieval.retrieval_benchmark import entropy_dist_bin

broad_config = copy.deepcopy(config)
broad_config['nce'] = True # use nce for collision energy

num_bins = 15000
precursor_mz = 357.249631956
isomer_info = [
    # (name, filename, smiles)
    ('KPL', 'mxp4039', None),
    # ('KPI', 'mxp4040', None),
    ('LPK', 'mxp4041', None),
    # ('IPK', 'mxp4042', None),
    ('Unknown', 'QI8422', None),
    ('KPL*', None, r'CC(C)C[C@@H](C(=O)O)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCCN)N'),
    # ('KPI*', None, r'CC[C@H](C)[C@@H](C(=O)O)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCCN)N'),
    ('LPK*', None, r'CC(C)C[C@@H](C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCCN)C(=O)O)N'),
    # ('IPK*', None, r'CC[C@H](C)[C@@H](C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCCN)C(=O)O)N'),
    ('PLK*', None, r'O=C(NC(C(O)=O)CCCCN)C(NC(C1CCCN1)=O)CC(C)C'),
    # ('PIK*', None, r'CCC(C)C(C(=O)NC(CCCCN)C(=O)O)NC(=O)C1CCCN1'),
    ('KLP*', None, r'CC(C)CC(C(=O)N1CCCC1C(=O)O)NC(=O)C(CCCCN)N'),
    # ('KIP*', None, r'CCC(C)C(C(=O)N1CCCC1C(=O)O)NC(=O)C(CCCCN)N'),
    ('PKL*', None, r'CC(C)CC(C(=O)O)NC(=O)C(CCCCN)NC(=O)C1CCCN1'),
    # ('PKI*', None, r'CCC(C)C(C(=O)O)NC(=O)C(CCCCN)NC(=O)C1CCCN1'),
]

def get_spec(name, fname, smi):
    """Return the binned spectra of one isomer as a dict keyed like the loaded spectrum.

    A real spectrum is loaded from ``fname`` when it is given; otherwise an
    in-silico spectrum is predicted for ``smi``. ``name`` is not used.

    Raises:
        ValueError: if both ``fname`` and ``smi`` are None.
    """
    if fname is not None:  # real spec
        spec = load_real_spec(f'/home/roger/ms_collaborators/broad-Julian/spec_files/{fname}.ms', real_spec_type='ms', nce=True, precursor_mass=precursor_mz)
        spec_binned = {k: common.bin_spectra([v], num_bins)[0] for k, v in spec.items()}
    elif smi is not None:  # in-silico spec
        result_path, pmz = iceberg_prediction([smi], [10, 20, 30, 40, 50], **broad_config)
        _, pred_specs, __ = load_pred_spec(result_path, False)
        spec_binned = {k: common.bin_spectra([v], num_bins)[0] for k, v in pred_specs[0].items()}
    else:
        raise ValueError
    return spec_binned

# Load / predict every spectrum exactly once. Previously get_spec ran twice for
# each of the n*(n-1)/2 pairs, i.e. every spectrum was rebuilt n-1 times.
num_isomers = len(isomer_info)
binned_specs = [get_spec(name, fname, smi) for name, fname, smi in isomer_info]

dissim_mat = np.zeros((num_isomers, num_isomers))
for id1, id2 in itertools.combinations(range(num_isomers), 2):
    # Pass copies so that the cached spectra stay pristine even if the distance
    # function modifies its inputs (not verifiable from this notebook).
    spec1_binned = [copy.deepcopy(binned_specs[id1])]
    spec2_binned = copy.deepcopy(binned_specs[id2])
    # NOTE(review): ignore_peak = (precursor_mz - 1) * 10 looks like a bin index at
    # 10 bins per m/z unit -- confirm against entropy_dist_bin.
    dist = entropy_dist_bin(spec1_binned, spec2_binned, ignore_peak=(precursor_mz - 1) * 10, sparse=False)
    dissim_mat[id1, id2] = dist[0]
    dissim_mat[id2, id1] = dist[0]

# dissim_mat already holds pairwise dissimilarities, so MDS must be told not to
# treat its rows as feature vectors (the default, dissimilarity='euclidean', would
# embed Euclidean distances between rows instead). random_state fixes the layout.
# NOTE(review): newer scikit-learn releases may rename `dissimilarity`; check the
# MDS documentation of the installed version.
embedding = MDS(n_components=2, dissimilarity='precomputed', normalized_stress='auto', random_state=0)
plot_pos = embedding.fit_transform(dissim_mat)
for info, (x, y) in zip(isomer_info, plot_pos):
    if info[1] is None:  # in-silico spectrum
        plt.scatter(x, y, marker='^', c='c')
    else:  # real spectrum
        plt.scatter(x, y, marker='o', c='r')
    plt.gca().annotate(info[0], (x + 0.01, y + 0.01))

# Draw one edge per pair, coloured and labelled by the pairwise dissimilarity.
colors = plt.cm.inferno(dissim_mat)
for (id1, (x1, y1)), (id2, (x2, y2)) in itertools.combinations(zip(range(num_isomers), plot_pos), 2):
    plt.plot((x1, x2), (y1, y2), '--', c=colors[id1, id2], linewidth=0.5)
    plt.text((x1 + x2) / 2, (y1 + y2) / 2, f"{dissim_mat[id1, id2]:.3f}", fontsize=5)
plt.axis('off')

## Food biomarkers

In [None]:
broad_config = copy.deepcopy(config)
broad_config['nce'] = True # use nce for collision energy

# Get candidates
df = pd.read_csv('/home/roger/ms_collaborators/broad-Julian/24_0723_Compounds_to_generate_in_silico_MSMS.csv')
out_dir = Path('/home/roger/ms_collaborators/broad-Julian/foodbiomarker')
out_dir.mkdir(parents=True, exist_ok=True)

# fix formula mismatch: recompute every formula from its SMILES
df['Chemical Formula'] = [common.form_from_smi(smi) for smi in df['SMILES']]

smiles = [common.rm_stereo(smi) for smi in df['SMILES']]

result_path, pmz = iceberg_prediction(smiles, [10, 20, 30, 40, 50], **broad_config)

# Write one CSV per predicted compound with columns (m/z, inten, ev), stacking the
# spectra of all collision energies. The per-compound frame has its own name so
# that the compound table `df` loaded above is no longer overwritten by the loop.
smiles, pred_specs, pred_frags = load_pred_spec(result_path, False)
for smi, pred_spec in zip(smiles, pred_specs):
    inchikey = common.inchikey_from_smiles(smi)
    per_energy = [
        # intensities are scaled by 100 on a copy; the loaded spectra are left untouched
        np.column_stack((spec[:, 0], spec[:, 1] * 100, np.full(spec.shape[0], float(ev))))
        for ev, spec in pred_spec.items()
    ]
    out_arr = np.concatenate(per_energy, axis=0) if per_energy else np.zeros((0, 3))
    spec_df = pd.DataFrame(data=out_arr, columns=['m/z', 'inten', 'ev'])
    spec_df.to_csv(out_dir / f'{inchikey}.csv', index=False)
