# Extract MoBiAS PDF reports

The PDF reports are a great source of additional information as they contain data for peaks that may not have been assigned to any known product.
Mining these reports is much faster than reprocessing the raw LCMS data.

Here, we are particularly interested in identifying:
1. leftover starting materials
2. systematic side products that we have not been previously looking for


In [None]:
import pathlib
import re
import sys
from datetime import datetime

sys.path.append(str(pathlib.Path().resolve().parents[1]))

from pypdf import PdfReader
import pandas as pd
import numpy as np

from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR, PLATE_LIST_PATH
from src.util.rdkit_util import smiles_to_lcms_mass

In [None]:
con = SynFermDatabaseConnection()

In [None]:
def count_assigned_peaks():
    """Count, per assignment label, the number of distinct experiments it occurs in.

    Reads every row of ``lcms_peaks_assignment`` joined to ``lcms_peaks``
    through the module-level connection ``con``.

    Returns:
        A DataFrame indexed by ``assignment`` with a single column
        ``experiment_id`` holding the number of distinct experiments.
    """
    query = "SELECT peak_id, assignment, lp.experiment_id FROM lcms_peaks_assignment JOIN lcms_peaks lp on lcms_peaks_assignment.peak_id = lp.id"
    rows = con.con.execute(query).fetchall()
    peaks = pd.DataFrame(rows, columns=["peak_id", "assignment", "experiment_id"])
    # keep one row per (assignment, experiment) pair so that several peaks of
    # the same species within one reaction are counted only once
    unique_pairs = peaks[["assignment", "experiment_id"]].drop_duplicates()
    return unique_pairs.groupby(["assignment"]).count()

## Extract PDF data

In [None]:
def import_lcms_full_report(path):
    """Extract the peak summary table from an LCMS full report PDF.

    Reading starts on the second page of the PDF. The second page is always
    read; every following page is read as long as its first text line is the
    peak table header. Only text lines that start with a digit are treated as
    table rows.

    Args:
        path: Location of the PDF file, passed on to ``PdfReader``.

    Returns:
        A float DataFrame with the columns ``#``, ``RT[min]``, ``Area``, ``I``,
        ``S/N``, ``max_m/z``, ``FWHM[min]``, ``Area%`` and ``Int%``. Rows with
        fewer fields than columns are padded with NaN.

    Raises:
        RuntimeError: If no text lines could be read from the report.
    """
    table_header = '# RT [min] Area I S/N Max. m/z FWHM [min] Area % Int. %'

    # set up pdf reader
    reader = PdfReader(path)
    number_of_pages = len(reader.pages)
    lines = []
    found_data = False
    # iterate from second page until entire peak summary table is read completely
    for i in range(1, number_of_pages):
        # a page without extractable text must not crash the import
        text = reader.pages[i].extract_text() or ""
        line_list = text.splitlines()
        first_line = line_list[0] if line_list else ""
        # stop when not encountering another data header
        if found_data and first_line != table_header:
            break
        found_data = True
        lines += line_list

    if not lines:
        raise RuntimeError("No data extracted")

    # column names: the table header with the spaces inside each name removed
    column_names = '# RT[min] Area I S/N max_m/z FWHM[min] Area% Int%'.split()

    # keep only table rows; this drops footers, repeated headers and blank lines
    # (the emptiness check avoids an IndexError on line[0])
    rows = [line.split() for line in lines if line and line[0].isnumeric()]

    # assemble DataFrame from data
    df = pd.DataFrame(rows, columns=column_names).astype("float")

    return df

In [None]:
%%capture output
# import the plate list to obtain LCMS identifier - plate_nr relation
plate_list = pd.read_csv(PLATE_LIST_PATH)
# we will record any files that cause exceptions for manual inspection
files_with_exceptions = []

# iterate over all experiments/plates
for exp_nr in range(1, 30):
    for plate_nr in range(1, 7):
        # print progress indicator
        now = datetime.now()
        print(f"exp {exp_nr}-{plate_nr}, started {now.strftime('%H:%M:%S')}")
 
        
        lcms_id = plate_list.loc[(plate_list["exp_nr"] == exp_nr) & (plate_list["plate_nr"] == plate_nr), "results_file_name"].item().split("_")[0]
        exp_path = DATA_DIR / "pdf_reports" / lcms_id
        full_report_paths = list(exp_path.glob("*_LCMS_Fullreport.pdf"))
        
        for path in full_report_paths:
            try:
                # get well from filename
                regex = r'_P\d{1}-[A-Z]-\d{1,2}_'
                match = re.search(regex, path.name)
                well = "".join(match.group().strip("_").split("-")[1:])
                reaction_id = con.get_reaction_id((exp_nr, plate_nr, well))
                df = import_lcms_full_report(path)
                # we persist this to the database for re-use
                # first reformat df to fit DB
                df.insert(0, "reaction_id", reaction_id)
                df.insert(2, "retention_time_s", (df["RT[min]"] * 60).astype("int"))
                df = df.drop(columns="RT[min]")
                df = df.rename(columns={"#": "peak_number", 
                           "Area": "area", 
                           "I": "intensity",
                           "S/N": "signal_to_noise",
                           "max_m/z": "mz_max",
                           "FWHM[min]": "fwhm_min", 
                           "Area%": "%area",
                           "Int%": "%intensity"
                          }).astype({"peak_number": "int",
                                     "area": "int",
                                     "intensity": "int"})
                # write all extracted peaks to DB
                with con.con:
                    con.con.executemany(
                        'INSERT INTO lcms_peaks (experiment_id, peak_nr, retention_time_s, area, intensity, signal_to_noise, mz_max, fwhm_min, "%area", "%intensity") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);', 
                        [tuple(row) for row in df.to_numpy()]
                    )
            except Exception as e:
                print(f"Something went wrong for {str(path)}")
                files_with_exceptions.append(path)
                print(e)

Note:
We have later found one failure mode for the extraction:
Sometimes values (intensity) are missing in the PDF table, in which case, that specific row is parsed incorrectly.
We can remove these rows by checking for NaN values in the last column (%intensity).


In [None]:
len(con.con.execute('SELECT * FROM lcms_peaks WHERE "%intensity" IS NULL').fetchall())

In [None]:
with con.con:
    con.con.execute('DELETE FROM lcms_peaks WHERE "%intensity" IS NULL')

## Peak assignments - common contaminants

In [None]:
# define the m/z values of some expected contaminants
mz_dmso = [79.0212, 101.0032, 157.0351]  # DMSO adducts: M+H+, M+Na+, 2M+H+
mz_lock_tmp = 142.1590  # tetramethylpiperidine
mz_lock_hmp = 322.0481  # hexamethoxyphosphazene
mz_is = 361.1201  # fenofibrate

In [None]:
# write DMSO assignments to DB
# DMSO gives a broad peak (median retention time 48 seconds), so every peak
# with a matching m/z that elutes before 120 s is accepted
dmso_insert = "INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'DMSO' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ? AND retention_time_s < 120"
for mz in mz_dmso:
    mz_window = (mz - 0.02, mz + 0.02)
    with con.con:
        con.con.execute(dmso_insert, mz_window)


In [None]:
# write lock mass assignments to DB
# lock molecules are part of the solvent, so there is no retention time constraint
lock_mass_inserts = [
    ("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'lock mass tetramethylpiperidine' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ?", mz_lock_tmp),
    ("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'lock mass hexamethoxyphosphazene' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ?", mz_lock_hmp),
]
for insert_sql, lock_mz in lock_mass_inserts:
    # one transaction per lock mass
    with con.con:
        con.con.execute(insert_sql, (lock_mz - 0.02, lock_mz + 0.02))

In [None]:
# write IS assignments to DB
# the median retention time for fenofibrate was 368 s and we allow 20 s around it;
# only the lower bound (348 s) is enforced because the latest picked peak is at 388 s
is_insert = "INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'IS fenofibrate' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ? AND retention_time_s > 348"
is_mz_window = (mz_is - 0.02, mz_is + 0.02)
with con.con:
    con.con.execute(is_insert, is_mz_window)

## Peak assignments - Known reactants/products

We read extracted data from the DB, compare it with the expected masses and save the results back to the db

In [None]:
def calculate_lcms_peak_differences(identifier):
    """Assign known species to the peaks of one reaction and store m/z differences.

    First, every peak of the reaction whose ``mz_max`` lies within ±0.02 of an
    expected m/z (starting materials I/M/T, the two degradation products
    ``I_acid`` and ``M_bAA``, and the known products A-H) receives that label
    in ``lcms_peaks_assignment``. Then, for all peaks that are still
    unassigned and have ``%area`` > 5, the difference between the observed
    m/z and each expected m/z is written to ``lcms_peaks_differences``.

    Args:
        identifier: Id of the reaction; used as ``experiment_id`` in the
            queries and passed to the ``con`` helper methods.

    Returns:
        DataFrame with the columns ``experiment_id``, ``id`` and one
        difference column per species (I, M, T, I_acid, M_bAA, A-H). Species
        without an expected m/z (e.g. a missing product) are NaN.
    """
    mz_tolerance = 0.02
    # fixed species order; it has to match the column order of the INSERT below
    species_order = ["I", "M", "T", "I_acid", "M_bAA", "A", "B", "C", "D", "E", "F", "G", "H"]

    # import relevant reactants and products
    # import starting material mzs
    sms = con.get_starting_materials_for_reaction(identifier)
    mz_i, mz_m, mz_t = [smiles_to_lcms_mass(smi) for smi in sms]
    mz_iacid = mz_i - 52.0096  # mass difference of degradation KAT-H to carboxylic acid
    mz_baa = mz_m - 124.05243  # mass difference of degradation to beta-amino acid
    mzs = {"I": mz_i, "M": mz_m, "T": mz_t, "I_acid": mz_iacid, "M_bAA": mz_baa}

    # import product mzs
    known_product_smiles = con.get_product_smiles(identifier)
    # "if smi" bc we sometimes have None for product H
    known_product_mzs = {s: smiles_to_lcms_mass(smi) for s, smi in zip("ABCDEFGH", known_product_smiles) if smi}
    mzs.update(known_product_mzs)

    # assign known peaks
    for s, mz in mzs.items():
        with con.con:
            con.con.execute("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, ? AS assignment FROM lcms_peaks WHERE experiment_id=? AND mz_max BETWEEN ? AND ?", (s, identifier, mz - mz_tolerance, mz + mz_tolerance))

    # some peaks are still unexplained
    # for these, we calculate the difference to known mzs to see if a pattern emerges
    lcms_peaks = con.get_lcms_peaks(identifier, with_assignment=True)
    # ignore peaks that are already explained or that are insignificant
    unexplained_peaks = lcms_peaks.loc[lcms_peaks.assignment.isna() & (lcms_peaks["%area"] > 5)]
    unexplained_mzs = unexplained_peaks.mz_max.to_numpy()
    mzs_arr = np.array(list(mzs.values()))
    # calculate the differences with all the considered masses (rows: peaks, columns: species)
    delta = pd.concat([unexplained_peaks[["experiment_id", "id"]].reset_index(drop=True), pd.DataFrame(unexplained_mzs[:, None] - mzs_arr, columns=list(mzs.keys()))], axis=1)

    # Enforce the fixed column layout. Any species without an expected m/z
    # (not only product H) becomes a NaN column in the right position, so the
    # values always line up with the 15 columns of the INSERT statement.
    delta = delta.reindex(columns=["experiment_id", "id"] + species_order)

    with con.con:
        # persist differences to DB
        con.con.executemany("INSERT INTO lcms_peaks_differences (experiment_id, peak_id, delta_I, delta_M, delta_T, delta_Iacid, delta_bAA, delta_A, delta_B, delta_C, delta_D, delta_E, delta_F, delta_G, delta_H) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);",
                           delta.to_numpy())
    return delta

In [None]:
# we will record any reactions that cause exceptions for manual inspection
records_with_exceptions = []
# iterate over all experiments/plates
for exp_nr in range(1, 30):
    for plate_nr in range(1, 7):
        # print progress indicator
        now = datetime.now()
        print(f"exp {exp_nr}-{plate_nr}, started {now.strftime('%H:%M:%S')}")

        identifiers = con.get_reaction_ids_for_plate((exp_nr, plate_nr))
        for i in identifiers:
            try:
                calculate_lcms_peak_differences(i)
            except Exception as e:
                # best effort: keep going, but report the failure (as the PDF import loop does)
                # instead of swallowing it silently
                print(f"Something went wrong for reaction {i}")
                print(e)
                records_with_exceptions.append(i)

In [None]:
# number of distinct experiments in `df`
# NOTE(review): in file order, the last `df` assigned before this cell is the per-report frame
# from the PDF import loop, whose id column is "reaction_id", not "experiment_id" — confirm
# which DataFrame this cell is meant to inspect
len(df["experiment_id"].unique())

## Peak assignments - Find more contaminants

In [None]:
## load every extracted peak together with its assignment (if any) to identify common contaminants
# column names for the result set, in the order of the SELECT list
peak_columns = [
    "peak_id",
    "reaction_id",
    "peak_nr",
    "retention_time_s",
    "area",
    "intensity",
    "signal_to_noise",
    "mz_max",
    "fwhm_min",
    "%area",
    "%intensity",
    "assignment",
]
# LEFT JOIN: peaks without an assignment are kept
res = con.con.execute(
    'SELECT lcms_peaks.id, lcms_peaks.experiment_id, lcms_peaks.peak_nr, retention_time_s, area, intensity, signal_to_noise, mz_max, fwhm_min, "%area", "%intensity", a.assignment FROM lcms_peaks LEFT JOIN lcms_peaks_assignment a on lcms_peaks.id = a.peak_id;'
).fetchall()
df = pd.DataFrame(res, columns=peak_columns)

In [None]:
df.loc[df.assignment.isna(), "mz_max"].round(2).value_counts()

#### m/z 128.11

In [None]:
# what mass exactly do we observe?
df.loc[df["mz_max"].between(128.105, 128.109), "mz_max"].describe()

In [None]:
# when does the 128.1064 peak occur?
df.loc[df["mz_max"].between(128.105, 128.109), "retention_time_s"].describe()

An m/z of 128.1064 corresponds to C7H13NO (calc m/z 128.1070).
This could e.g. be 2-azacyclooctanone.

The peak usually occurs after 184 seconds. We will assign anything with ±15s and ±0.002 m/z to this formula


In [None]:
# assign the label to all peaks inside the m/z window and the 169-199 s retention time window
peak_ids = df.loc[df["mz_max"].between(128.105, 128.109) & df["retention_time_s"].between(169, 199), "peak_id"].to_numpy()
with con.con:
    # the label is bound as a parameter: a double-quoted token inside the SQL is an identifier
    # in SQLite and is only read as a string when the legacy fallback is enabled
    con.con.executemany(
        "INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);",
        [(peak_id, "common contaminant C7H13NO") for peak_id in peak_ids.tolist()],
    )

In [None]:
# inspect (nothing is written to the DB here) the peaks in the same m/z window that fall outside the 169-199 s retention time window
df.loc[df["mz_max"].between(128.105, 128.109) & ~df["retention_time_s"].between(169, 199)]

#### m/z 185.11

In [None]:
# what mass exactly do we observe?
df.loc[df["mz_max"].between(185.10, 185.12), "mz_max"].describe()

In [None]:
# when does the 185.1141 peak occur?
df.loc[df["mz_max"].between(185.1121, 185.1161), "retention_time_s"].describe()

An m/z of 185.1141 could correspond to C8H18O3Na+ (calc m/z 185.1148).
This could e.g. be diethyl-diethyleneglycol (or another short PEG chain); these are common LCMS contaminants.

The peak usually occurs after 198 seconds. We will assign anything with ±15s and ±0.002 m/z to this formula


In [None]:
# assign the label to all peaks inside the m/z window and the 183-213 s retention time window
peak_ids = df.loc[df["mz_max"].between(185.1128, 185.1168) & df["retention_time_s"].between(183, 213), "peak_id"].to_numpy()
with con.con:
    # the label is bound as a parameter: a double-quoted token inside the SQL is an identifier
    # in SQLite and is only read as a string when the legacy fallback is enabled
    con.con.executemany(
        "INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);",
        [(peak_id, "common PEG contaminant") for peak_id in peak_ids.tolist()],
    )

#### m/z 239.09

In [None]:
# what mass exactly do we observe?
df.loc[df["mz_max"].between(239.08, 239.10), "mz_max"].describe()

In [None]:
# when does the 239.0885 peak occur?
df.loc[df["mz_max"].between(239.0865, 239.0905), "retention_time_s"].describe()

An m/z of 239.0885 could correspond to many different compounds.
The peak usually occurs after 196 seconds. We will assign anything with ±15s and ±0.002 m/z to "common contaminant"


In [None]:
# assign the label to all peaks inside the m/z window and the 181-211 s retention time window
peak_ids = df.loc[df["mz_max"].between(239.0865, 239.0905) & df["retention_time_s"].between(181, 211), "peak_id"].to_numpy()
with con.con:
    # the label is bound as a parameter: a double-quoted token inside the SQL is an identifier
    # in SQLite and is only read as a string when the legacy fallback is enabled
    con.con.executemany(
        "INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);",
        [(peak_id, "common contaminant") for peak_id in peak_ids.tolist()],
    )

## Identify systematic mass differences
Here's an idea how to go about identifying systematic mass differences:
Identify those records, where a certain mass difference (e.g. delta_T) is within a narrow span, but which use different (relevant) building blocks, in the example different terminators.
Rationale: If the observed m/z is dependent on terminator mass, the responsible species contains the terminator in some form.

In [None]:
# collect the peaks (with differences, assignments and building blocks) of all selected reactions
reaction_ids = con.get_reaction_ids_for_building_block(filter_exp_nr=(4, 29))
peak_frames = [
    con.get_lcms_peaks(i, with_delta=True, with_assignment=True, with_building_blocks=True)
    for i in reaction_ids
]
df = pd.concat(peak_frames).reset_index(drop=True)
# filter a bit: keep only peaks with a retention time > 3 min (180 s) and an m/z over 200
elutes_late = df["retention_time_s"] > 180
is_heavy = df["mz_max"] > 200
df = df.loc[elutes_late & is_heavy]
df

In [None]:
# get unique differences
t_diff = df.round(2).groupby("delta_T")["terminator"].unique()
t_diff

In [None]:
t_diff.loc[t_diff.apply(lambda x: len(x) > 20)]

#### <sup>81</sup>Br isotope

In [None]:
df.loc[df["delta_T"].between(166.915, 166.925)]

These are actually all caused by bromine isotope 81Br. The mass difference to 79Br is 1.9979535, which coincides with delta_T.

In [None]:
delta_br_isotopes = (1.9959, 1.9999)

In [None]:
# collect the short names of all building blocks whose SMILES contains "Br"
short_rows = con.con.execute("SELECT short FROM building_block_shorts;").fetchall()
building_blocks = [row[0] for row in short_rows]
# one flag per building block, in the same order as building_blocks
has_bromine = ["Br" in con.get_smiles(bb) for bb in building_blocks]
building_blocks_with_bromine = [bb for bb, flagged in zip(building_blocks, has_bromine) if flagged]
building_blocks_with_bromine

In [None]:
# product D: pick peaks whose difference to D matches the 81Br/79Br isotope shift,
# but only for reactions whose initiator or terminator actually contains bromine
d_shift_matches = df["delta_D"].between(*delta_br_isotopes)
br_in_initiator_or_terminator = df["initiator"].isin(building_blocks_with_bromine) | df["terminator"].isin(building_blocks_with_bromine)
peaks_br_d = df.loc[d_shift_matches & br_in_initiator_or_terminator, "id"].to_numpy()
with con.con:
    # each parameter set is a one-element list holding the peak id
    con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, 'D 81Br');", [[peak_id] for peak_id in peaks_br_d.tolist()])

Now obviously, the same problem may occur for other products, so we check the other differences as well

In [None]:
# select based on the isotope mass difference (here a window of 1.9976-1.9982),
# but only if the building blocks relevant for the product actually contain bromine
br_in_i = df["initiator"].isin(building_blocks_with_bromine)
br_in_m = df["monomer"].isin(building_blocks_with_bromine)
br_in_t = df["terminator"].isin(building_blocks_with_bromine)
shift_matches = {s: df[f"delta_{s}"].between(1.9976, 1.9982) for s in "ABCEFGH"}

# A, B, C: any of the three building blocks may carry the bromine
peaks_br_a = df.loc[shift_matches["A"] & (br_in_i | br_in_m | br_in_t), "id"].to_numpy()
peaks_br_b = df.loc[shift_matches["B"] & (br_in_i | br_in_m | br_in_t), "id"].to_numpy()
peaks_br_c = df.loc[shift_matches["C"] & (br_in_i | br_in_m | br_in_t), "id"].to_numpy()
# E: terminator only
peaks_br_e = df.loc[shift_matches["E"] & br_in_t, "id"].to_numpy()
# F, G: initiator or monomer
peaks_br_f = df.loc[shift_matches["F"] & (br_in_i | br_in_m), "id"].to_numpy()
peaks_br_g = df.loc[shift_matches["G"] & (br_in_i | br_in_m), "id"].to_numpy()
# H: monomer or terminator
peaks_br_h = df.loc[shift_matches["H"] & (br_in_m | br_in_t), "id"].to_numpy()

# check how many we found (including D from the earlier selection)
for s, i in zip("ABCDEFGH", [peaks_br_a, peaks_br_b, peaks_br_c, peaks_br_d, peaks_br_e, peaks_br_f, peaks_br_g, peaks_br_h]):
    print(f"{s}: {len(i)}")

In [None]:
# commit to DB (D is left out here: it is inserted together with its selection)
br_peaks_by_product = {
    "A": peaks_br_a,
    "B": peaks_br_b,
    "C": peaks_br_c,
    "E": peaks_br_e,
    "F": peaks_br_f,
    "G": peaks_br_g,
    "H": peaks_br_h,
}
with con.con:
    for s, i in br_peaks_by_product.items():
        # same label for every peak of this product
        assignment = [f"{s} 81Br"] * len(i)
        con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", tuple(zip(i.tolist(), assignment)))

In [None]:
count_assigned_peaks()

#### 4-hydroxy version of 8-Quin-4-alkoxy-KATS

In [None]:
df.loc[df["delta_T"].between(153.015, 153.025)]

In [None]:
# these all use either I21 or I22. What's unique about these?
con.get_smiles("I21")

In [None]:
con.get_smiles("I22")

Both contain a 4-hydroxyquinone moiety. The other 8-Quin KATs we have are substituted on the 5-position.

The mass difference corresponds to the absence of the side chain on the oxygen atom (i.e. the methyl group for 8-Quin003/I21 and the benzyl group for 8-Quin004/I22).
The mass differences for OR-->OH are:
- 8-Quin003/I21: -14.01565
- 8-Quin004/I22: -90.04695

In [None]:
delta_quin003 = (-14.01565 - 0.002, -14.01565 + 0.002)
delta_quin004 = (-90.04695 - 0.002, -90.04695 + 0.002)

In [None]:
# add assignment to DB
# for every product, select the peaks whose difference to the product matches the loss of the
# oxygen substituent, but only if the reaction used the corresponding initiator (I21 or I22)
uses_i21 = df["initiator"] == "I21"
uses_i22 = df["initiator"] == "I22"
for s in "ABCDEFGH":
    product_delta = df[f"delta_{s}"]
    lost_substituent = (product_delta.between(*delta_quin003) & uses_i21) | (product_delta.between(*delta_quin004) & uses_i22)
    peaks_hydroxy_quin = df.loc[lost_substituent, "id"].to_numpy()
    print(f"Found {len(peaks_hydroxy_quin)} peaks for product {s}")
    assignment = [f"{s} hydroxyquinone lost oxygen substituent"] * len(peaks_hydroxy_quin)
    with con.con:
        con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", tuple(zip(peaks_hydroxy_quin.tolist(), assignment)))

#### F->OH exchange on aromatic F-containing KATs (−1.9957)
(mostly I6, containing 3-fluoropyridine)

In [None]:
df.loc[df["delta_T"].between(110.015, 110.025)]

In [None]:
con.get_smiles("I43")

This mass difference (occurring exclusively with I43) corresponds to TMS->H exchange

In [None]:
df.loc[df["delta_T"].between(103.005, 103.015)]

In [None]:
con.get_smiles("I6")

This mass difference (occurring exclusively with I6) corresponds to F->OH exchange (expected difference -1.99566)

In [None]:
delta_f_to_oh = (-1.99566 - 0.002, -1.99566 + 0.002)

In [None]:
# collect the short names of all building blocks that carry a fluorine on an aromatic carbon
# heuristic: the SMILES contains the substring "c(F)" — not 100% safe, but good enough
short_rows = con.con.execute("SELECT short FROM building_block_shorts;").fetchall()
building_blocks = [row[0] for row in short_rows]
# one flag per building block, in the same order as building_blocks
has_arom_fluorine = ["c(F)" in con.get_smiles(bb) for bb in building_blocks]
building_blocks_with_arom_fluorine = [bb for bb, flagged in zip(building_blocks, has_arom_fluorine) if flagged]
list(set(building_blocks_with_arom_fluorine))

In [None]:
# add assignment to DB
# select based on the F->OH mass difference, but only if at least one of the building blocks
# actually contains aromatic fluorine
uses_arom_fluorine = (
    df["initiator"].isin(building_blocks_with_arom_fluorine)
    | df["monomer"].isin(building_blocks_with_arom_fluorine)
    | df["terminator"].isin(building_blocks_with_arom_fluorine)
)
for s in "ABCDEFGH":
    peaks_f_to_oh = df.loc[df[f"delta_{s}"].between(*delta_f_to_oh) & uses_arom_fluorine, "id"].to_numpy()
    print(f"Found {len(peaks_f_to_oh)} peaks for product {s}")
    assignment = [f"{s} fluorine to OH exchange" for _ in range(len(peaks_f_to_oh))]
    with con.con:
        # write the peaks selected above (previously the unrelated 81Br peaks of product F were written)
        con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", tuple(zip(peaks_f_to_oh.tolist(), assignment)))

#### Terminator + cyclohexanone condensation (+80.06)

In [None]:
df.loc[df["delta_T"].between(80.055, 80.065)].sort_values(["monomer", "terminator"])

The mass difference +80.0622 (compared to terminator) occurs for many peaks and occurs across most I, M, and T. Something interesting may be going on here.

In [None]:
con.get_product_smiles(79851)

In [None]:
con.get_smiles("T22")

In [None]:
con.get_smiles("T23")

In [None]:
con.get_smiles("T25")

This seems to be the terminator, condensed with cyclohexanone to form something like `Sc1c(/N=C2CCCCC/2)cccc1` or `Sc1c(NC2=CCCCC2)cccc1` or `Nc1c(SC2=CCCCC2)cccc1`.
Under certain circumstances (reaction with the T-dimer) `c1(NC2CCCCC2S3)c3cccc1` may also be possible.
(In theory, reaction with hexylKAT would explain the mass, too – but it is highly improbable that we would somehow produce that)

The expected mass difference for any of these with the terminator is 80.0626 (C6H8).

In [None]:
delta_cyclohexanone_condensation = (80.0626 - 0.002, 80.0626 + 0.002)

In [None]:
# find and count occurences
peaks_cyclohexanone_condensation = df.loc[df["delta_T"].between(*delta_cyclohexanone_condensation), "id"].to_numpy()
print(len(peaks_cyclohexanone_condensation))

In [None]:
# commit to DB: one (peak id, label) pair per selected peak
assignment = ["T cyclohexanone condensate"] * len(peaks_cyclohexanone_condensation)
labelled_peaks = tuple(zip(peaks_cyclohexanone_condensation.tolist(), assignment))
with con.con:
    con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", labelled_peaks)

In [None]:
count_assigned_peaks()

#### Monomer - H<sub>4</sub>O<sub>2</sub> (−36.02)

In [None]:
# let's try to be systematic and find the most common differences
# round once: the rounded frame is the same for every species, so it does not need to be
# recomputed in each iteration
df_rounded = df.round(2)
for s in "IMTABCDEFGH":
    # number of peaks per (rounded) difference value, most frequent first
    mz_delta_count = df_rounded.groupby(f"delta_{s}")["id"].count().sort_values(ascending=False)
    print("Frequent (>1000 examples) of mass differences for", s)
    print(mz_delta_count[mz_delta_count > 1000].index.tolist())

In [None]:
peaks = df.loc[df.delta_M.round(2) == -36.02]
peaks

In [None]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

In [None]:
# let's check which monomers are involved
for short in peaks.monomer.unique():
    print(short, con.get_long_name(short))

In [None]:
con.get_product_smiles(14739)

In [None]:
con.get_product_smiles(58109)

In [None]:
con.get_smiles("M27")

This peak comes from the monomer only.
Actually, the mz value can typically be found in the QC LCMS data of the individual monomer (I checked for M36, M27 and M13 and found it in 3 out of 4 QC traces as a small but defined peak).
We conclude that the contaminant is carried over from the monomer stock solutions.

The m/z corresponds to monomer −H4O2 (−36.0211).
We don't have a plausible structure, but we mark this peak M−H4O2

In [None]:
delta_h4o2 = (-36.0211 - 0.002, -36.0211 + 0.002)

In [None]:
# find and count occurences
peaks_m_h4o2 = df.loc[df["delta_M"].between(*delta_h4o2), "id"].to_numpy()
print(len(peaks_m_h4o2))

In [None]:
# commit to DB: one (peak id, label) pair per selected peak
assignment = ["M - H4O2"] * len(peaks_m_h4o2)
labelled_peaks = tuple(zip(peaks_m_h4o2.tolist(), assignment))
with con.con:
    con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", labelled_peaks)

#### Boc -> carbamic acid MS fragmentation (-56.06)


In [None]:
peaks = df.loc[df.delta_M.round(2) == -56.06]
peaks

In [None]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

In [None]:
# let's check which monomers are involved
for short in peaks.monomer.unique():
    print(short, con.get_long_name(short))

In [None]:
# let's check if all of these have boc
print("M (#boc, #cbz, #tbu, #tms)")
for short in peaks.monomer.unique():
    print(short, con.list_pg(short))

In [None]:
# M52 doesn't have a boc. Check number of peaks for M52 to determine whether this is a random collision
peaks.monomer.value_counts()

The -56.06 difference to monomer occurs from loss of the tert-butyl group (part of the Boc group).
The expected mass difference (-C4H8) is 56.0636.

In [None]:
# window of ±0.002 around the mass difference used for the loss of the tert-butyl group (-C4H8)
# NOTE(review): with C6H8 = 80.0626 (as used for the cyclohexanone condensate), C4H8 works out to
# 80.0626 - 2 * 12.0000 = 56.0626, so the centre 56.0636 may be off by 0.001 — confirm
delta_tert_butyl = (-56.0636 - 0.002, -56.0636 + 0.002)

In [None]:
# does the collision with M52 still occur in a more precise mass range?
df.loc[df["delta_M"].between(*delta_tert_butyl)].monomer.value_counts()

No, no more collision with M52 or M16. For the rest, the mass difference is safe to apply.
However, one question is left: Does this degradation occur in the MS, or do we actually have the carbamic acid species (usually not stable)?

To answer this, we need to manually inspect MS traces.


In [None]:
def get_lab_journal_number_well(experiment_id):
    """Look up the lab journal number and well of one experiment.

    Args:
        experiment_id: Value matched against ``experiments.id``.

    Returns:
        The result of ``fetchone()`` for the query, i.e. the first matching
        (``lab_journal_number``, ``well``) row.
    """
    cursor = con.con.execute("SELECT lab_journal_number, well FROM experiments WHERE id = ?", (experiment_id,))
    return cursor.fetchone()

In [None]:
# let's sample a few of the larger peaks (more than 20% area) for manual inspection
is_tert_butyl_loss = df["delta_M"].between(*delta_tert_butyl)
is_large = df["%area"] > 20
peaks_to_inspect = df.loc[is_tert_butyl_loss & is_large, ["experiment_id", "peak_nr", "monomer"]].sample(3, random_state=1)
# add lab journal number and well so that the traces can be found
peaks_to_inspect["identifier"] = peaks_to_inspect["experiment_id"].apply(get_lab_journal_number_well)
peaks_to_inspect

Outcome of manual inspection:
- JG248/A16: Peak 48 contains both monomer and the carbamic acid derivative (ca. 3:5 int.), but not the fully deprotected amine. There is no other monomer peak.
- JG366/F16: Same picture here for peak 17. Contains a minuscule amount of fully deprotected amine. No other monomer peak.
- JG280/F11: Exact same picture again. Only other peaks are shoulders of peak 17.

_In conclusion_, our assumption that the conversion to carbamic acid occurs in the LCMS and does not indicate formation of a separate stable species is confirmed by the data.

In [None]:
# find and count occurences
peaks_m_carbamic_acid = df.loc[df["delta_M"].between(*delta_tert_butyl), "id"].to_numpy()
print(len(peaks_m_carbamic_acid))

In [None]:
# commit to DB: one (peak id, label) pair per selected peak
assignment = ["M (Boc->carbamic acid fragmentation in MS)"] * len(peaks_m_carbamic_acid)
labelled_peaks = tuple(zip(peaks_m_carbamic_acid.tolist(), assignment))
with con.con:
    con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", labelled_peaks)

#### Terminator +C<sub>6</sub>H<sub>6</sub>O<sub>2</sub> (+110.04)

In [None]:
peaks = df.loc[df.delta_T.round(2) == 110.04]
peaks

In [None]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

In [None]:
# let's check which terminators are involved
for short in peaks.terminator.unique():
    print(short, con.get_long_name(short))

In [None]:
# obviously all are ABTs. Let's see where we have frequent occurences
peaks.terminator.value_counts()

In [None]:
con.get_smiles("T17")

In [None]:
con.get_smiles("T8")

The mass difference is only observed for ABT terminators.
It corresponds to +C<sub>6</sub>H<sub>6</sub>O<sub>2</sub> (+110.0368).
I don't find a good structure to explain this mass difference.
We will leave the peaks unassigned for now.

#### Terminator delta only with methyl-ABT (+215.08)

In [None]:
peaks = df.loc[df.delta_T.round(2) == 215.08]
peaks

In [None]:
peaks.mz_max.median()

In [None]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

In [None]:
# let's check which terminators are involved
for short in peaks.terminator.unique():
    print(short, con.get_long_name(short))

In [None]:
# Let's see where we have frequent occurrences
peaks.terminator.value_counts()

In [None]:
con.show_image("T18")

In [None]:
con.show_image("T3")

In [None]:
con.show_image("T7")

T18, T3, T7 are all isomers of each other (Me-substituted ABTs).
Since they all have the same mass, the fact that the same delta_T comes up for all of them does not tell us anything about whether the terminator is part of the unknown species.
We don't investigate this one further.

#### Terminator +77.04 / product E +78.05 (C6H6)
Initially, we only considered T+77.04 here, but analysis of the spectra showed that this is actually a frequent fragment arising from the product E + C6H6 species.

In [None]:
peaks = df.loc[df.delta_T.round(2) == 77.04]
peaks

In [None]:
get_lab_journal_number_well(41780)

In [None]:
con.get_product_smiles(26661)

In [None]:
peaks.delta_T.describe()

In [None]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

In [None]:
# let's check which terminators are involved
for short in peaks.terminator.unique():
    print(short, con.get_long_name(short))

In [None]:
# Let's see where we have frequent occurrences
peaks.terminator.value_counts()

In [None]:
con.get_smiles("T1")

In [None]:
con.show_image("T7")

Again, with only two terminators forming this, we have very little evidence to determine the structure of the unknown species.
Looking into the spectrum reveals that the actual base peak has a different m/z (327.0984 for T1 and 355.1296 for T7, difference between the two is C2H4). In both cases, a difference +78.0468 (C6H6) is observed to terminator dimer E.
It should be noted that we also observe delta_E +78.05 frequently for other terminators.
Let's check those!

In [None]:
peaks = df.loc[df.delta_E.round(2) == 78.05]
peaks

In [None]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

In [None]:
# let's check which terminators are involved
for short in peaks.terminator.unique():
    print(short, con.get_long_name(short))

In [None]:
# Let's see where we have frequent occurrences
peaks.terminator.value_counts()

So this occurs mainly with T1-3, T7, T18, all small ABTs (plain, Me- and F- substituted)

In [None]:
con.get_smiles("T18")

The mass difference of +78.048 clearly corresponds to +C6H6.
A possible explanation is pi-stacking with benzene, but it's not evident where the benzene could come from.
In the end, we don't have any good hypothesis for this one.

#### Terminator +73.01

In [None]:
peaks = df.loc[df.delta_T.round(2) == 73.01]
peaks

In [None]:
peaks.delta_T.describe()

In [None]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

In [None]:
# print each involved terminator's short name next to its long name
for short in peaks["terminator"].unique():
    long_name = con.get_long_name(short)
    print(short, long_name)

In [None]:
# Let's see where we have frequent occurrences
peaks.terminator.value_counts()

In [None]:
con.get_smiles("T4")

In [None]:
con.show_image("T3")

These are all particularly electron-rich ABTs, raising electrophilic aromatic substitution as a possible mechanism.
One way to explain the difference would be +C3H3NCl -O, but T4 does not have an oxygen atom.
We don't find an acceptable explanation for this mass difference.

#### Product F −56.06 / Product G -28.07
(these are the same, as the difference between F and G is constant)

In [None]:
# select every peak whose delta_F, rounded to two decimals, equals -56.06
is_delta_f_56 = df["delta_F"].round(2) == -56.06
peaks = df.loc[is_delta_f_56]
peaks

In [None]:
# number of distinct initiators / monomers / terminators among the selected peaks
for prefix, column in (("I:", "initiator"), ("M:", "monomer"), ("T:", "terminator")):
    print(prefix, len(peaks[column].unique()))

In [None]:
# print each involved monomer's short name next to its long name
for short in peaks["monomer"].unique():
    long_name = con.get_long_name(short)
    print(short, long_name)

In [None]:
# Let's see where we have frequent occurrences
peaks.monomer.value_counts()

In [None]:
con.show_image("M20")

All of these monomers have a Boc or tert-butyl group. We already know this mass difference from the Boc -> carbamic acid fragmentation seen for monomers. For the tert-butyl esters the same fragmentation can occur.

In [None]:
# find and count occurrences: ids of all peaks whose delta_F lies between the delta_tert_butyl bounds
in_tert_butyl_window = df["delta_F"].between(*delta_tert_butyl)
peaks_f_carbamic_acid = df.loc[in_tert_butyl_window, "id"].to_numpy()
print(len(peaks_f_carbamic_acid))

In [None]:
# commit to DB: every selected peak id gets the same assignment label
# (plain string literal: the original f-string had no placeholders)
assignment = ["F (-tert-butyl fragmentation in MS for Boc or tert-butyl ester)"] * len(peaks_f_carbamic_acid)
# NOTE(review): con.con is presumably a sqlite3 connection, in which case the
# `with` block commits on success and rolls back on error - confirm
with con.con:
    con.con.executemany(
        "INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);",
        # .tolist() converts the NumPy integers to plain Python ints before binding
        tuple(zip(peaks_f_carbamic_acid.tolist(), assignment)),
    )

Curiously, we don't usually see this fragmentation peak for the other products, only for F and unreacted M.

#### Product F −71.985 / Product G -43.990
(these are the same)
I found this one while analysing the fate of M58.
It corresponds to loss of CO2 from G, which is a plausible decarboxylation for beta-2-monomers.

In [None]:
# select every peak whose delta_G lies between -43.991 and -43.989
delta_g_window = (-43.991, -43.989)
peaks = df.loc[df["delta_G"].between(*delta_g_window)]
peaks

In [None]:
# number of distinct initiators / monomers / terminators among the selected peaks
for prefix, column in (("I:", "initiator"), ("M:", "monomer"), ("T:", "terminator")):
    print(prefix, len(peaks[column].unique()))

In [None]:
# print each involved monomer's short name next to its long name
for short in peaks["monomer"].unique():
    long_name = con.get_long_name(short)
    print(short, long_name)

In [None]:
# Let's see where we have frequent occurrences
peaks.monomer.value_counts()

The ones with M11, M68, M73 are likely random m/z collisions. We only consider the ones with M57 and M58.

In [None]:
# find and count occurrences, keeping only the peaks from reactions with M57 or M58
is_m57_or_m58 = peaks["monomer"].isin(["M57", "M58"])
peaks_g_decarbox = peaks.loc[is_m57_or_m58, "id"].to_numpy()
print(len(peaks_g_decarbox))

In [None]:
# commit to DB: every selected peak id gets the same assignment label
# (plain string literal: the original f-string had no placeholders)
assignment = ["G_decarboxylated"] * len(peaks_g_decarbox)
# NOTE(review): con.con is presumably a sqlite3 connection, in which case the
# `with` block commits on success and rolls back on error - confirm
with con.con:
    con.con.executemany(
        "INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);",
        # .tolist() converts the NumPy integers to plain Python ints before binding
        tuple(zip(peaks_g_decarbox.tolist(), assignment)),
    )

## Summary

In [None]:
df.loc[df["experiment_id"] == 20589]

In [None]:
# let's get a new version from DB and see what unassigned peaks we have left!
reaction_ids = con.get_reaction_ids_for_building_block(filter_exp_nr=(4, 29))
peak_frames = [
    con.get_lcms_peaks(i, with_delta=True, with_assignment=True, with_building_blocks=True)
    for i in reaction_ids
]
df = pd.concat(peak_frames).reset_index(drop=True)
# filter a bit: keep only peaks that are unidentified, have a retention time > 4 min (240 s),
# and have an m/z over 200
is_unassigned = df["assignment"].isna()
is_late_eluting = df["retention_time_s"] > 240
is_above_mz_200 = df["mz_max"] > 200
df_unassigned = df.loc[is_unassigned & is_late_eluting & is_above_mz_200]
df_unassigned

In [None]:
# number of peaks whose "%area" value is exactly 100
con.con.execute('SELECT COUNT(*) FROM lcms_peaks WHERE "%area" = 100').fetchone()[0]


In [None]:
# number of distinct peaks with "%area" = 100 that have at least one assignment;
# DISTINCT keeps a peak with several assignment rows from being counted more than once
con.con.execute(
    'SELECT COUNT(DISTINCT lcms_peaks.id) FROM lcms_peaks '
    'JOIN lcms_peaks_assignment ON lcms_peaks.id = lcms_peaks_assignment.peak_id '
    'WHERE assignment IS NOT NULL AND "%area" = 100;'
).fetchone()[0]


In [None]:
# number of peaks in the "interesting" region (retention time > 240 s and m/z > 200)
in_interesting_region = (df["retention_time_s"] > 240) & (df["mz_max"] > 200)
len(df.loc[in_interesting_region])

In [None]:
# number of unassigned peaks in the "interesting" region
len(df_unassigned)

In [None]:
# total number of peaks with "%area" above 5
cursor = con.con.execute('SELECT COUNT(*) FROM lcms_peaks WHERE "%area" > 5;')
cursor.fetchone()[0]

In [None]:
# total number of assigned peaks >5% area;
# DISTINCT counts each peak once even if it has several rows in lcms_peaks_assignment
con.con.execute(
    'SELECT COUNT(DISTINCT lcms_peaks.id) FROM lcms_peaks '
    'JOIN lcms_peaks_assignment ON lcms_peaks.id = lcms_peaks_assignment.peak_id '
    'WHERE assignment IS NOT NULL AND "%area" > 5;'
).fetchone()[0]