# Extract MoBiAS PDF reports

The PDF reports are a great source of additional information as they contain data for peaks that may not have been assigned to any known product.
Mining these reports is much faster than reprocessing of the raw LCMS data.

Here, we are in particular interested to identify
1. leftover starting materials
2. systematic side products that we have not been previously looking for


In [1]:
import sys
import pathlib
import re
from datetime import datetime

sys.path.insert(0, str(pathlib.Path().resolve().parents[1]))

from pypdf import PdfReader
import pandas as pd
import numpy as np

from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR, PLATE_LIST_PATH
from src.util.rdkit_util import smiles_to_lcms_mass

In [3]:
con = SynFermDatabaseConnection()

In [81]:
def count_assigned_peaks():
    assigned_peaks = con.con.execute("SELECT peak_id, assignment, lp.experiment_id FROM lcms_peaks_assignment JOIN lcms_peaks lp on lcms_peaks_assignment.peak_id = lp.id").fetchall()
    df = pd.DataFrame(assigned_peaks, columns=["peak_id", "assignment", "experiment_id"])
    # count the number of reaction each species is found in
    return df[["assignment", "experiment_id"]].drop_duplicates().groupby(["assignment"]).count()

## Extract PDF data

In [3]:
def import_lcms_full_report(path):
    # set up pdf reader
    reader = PdfReader(path)
    number_of_pages = len(reader.pages)
    lines = []
    found_data = False
    # iterate from second page until entire peak summary table is read completely
    for i in range(1, number_of_pages):
        page = reader.pages[i]
        text = page.extract_text()
        line_list = text.splitlines()
        if (line_list[0] != '# RT [min] Area I S/N Max. m/z FWHM [min] Area % Int. %') and found_data:  # stop when not encountering another data header
            break
        else:
            found_data = True
        lines += line_list
    
    if len(lines) == 0:
        raise RuntimeError("No data extracted")
        
    # remove spaces in header
    cleaned_lines = ['# RT[min] Area I S/N max_m/z FWHM[min] Area% Int%',]
    # remove footers and data headers on not-first page
    cleaned_lines += [line for line in lines if line[0].isnumeric()] 

    # split the lines into individual fields
    data = [line.split() for line in cleaned_lines]
    
    # assemble DataFrame from data
    df = pd.DataFrame(data[1:], columns=data[0]).astype("float")

    return df

In [None]:
%%capture output
# import the plate list to obtain LCMS identifier - plate_nr relation
plate_list = pd.read_csv(PLATE_LIST_PATH)
# we will record any files that cause exceptions for manual inspection
files_with_exceptions = []

# iterate over all experiments/plates
for exp_nr in range(1, 30):
    for plate_nr in range(1, 7):
        # print progress indicator
        now = datetime.now()
        print(f"exp {exp_nr}-{plate_nr}, started {now.strftime('%H:%M:%S')}")
 
        
        lcms_id = plate_list.loc[(plate_list["exp_nr"] == exp_nr) & (plate_list["plate_nr"] == plate_nr), "results_file_name"].item().split("_")[0]
        exp_path = DATA_DIR / "pdf_reports" / lcms_id
        full_report_paths = list(exp_path.glob("*_LCMS_Fullreport.pdf"))
        
        for path in full_report_paths:
            try:
                # get well from filename
                regex = r'_P\d{1}-[A-Z]-\d{1,2}_'
                match = re.search(regex, path.name)
                well = "".join(match.group().strip("_").split("-")[1:])
                reaction_id = con.get_reaction_id((exp_nr, plate_nr, well))
                df = import_lcms_full_report(path)
                # we persist this to the database for re-use
                # first reformat df to fit DB
                df.insert(0, "reaction_id", reaction_id)
                df.insert(2, "retention_time_s", (df["RT[min]"] * 60).astype("int"))
                df = df.drop(columns="RT[min]")
                df = df.rename(columns={"#": "peak_number", 
                           "Area": "area", 
                           "I": "intensity",
                           "S/N": "signal_to_noise",
                           "max_m/z": "mz_max",
                           "FWHM[min]": "fwhm_min", 
                           "Area%": "%area",
                           "Int%": "%intensity"
                          }).astype({"peak_number": "int",
                                     "area": "int",
                                     "intensity": "int"})
                # write all extracted peaks to DB
                with con.con:
                    con.con.executemany(
                        'INSERT INTO lcms_peaks (experiment_id, peak_nr, retention_time_s, area, intensity, signal_to_noise, mz_max, fwhm_min, "%area", "%intensity") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);', 
                        [tuple(row) for row in df.to_numpy()]
                    )
            except Exception as e:
                print(f"Something went wrong for {str(path)}")
                files_with_exceptions.append(path)
                print(e)

Note:
We have later found one failure mode for the extraction:
Sometimes values (intensity) are missing in the PDF table, in which case, that specific row is parsed incorrectly.
We can remove these rows by checking for NaN values in the last column (%intensity).


In [125]:
len(con.con.execute('SELECT * FROM lcms_peaks WHERE "%intensity" IS NULL').fetchall())

38630

In [126]:
with con.con:
    con.con.execute('DELETE FROM lcms_peaks WHERE "%intensity" IS NULL')

## Peak assignments - common contaminants

In [24]:
# define some expected contaminants
mz_dmso = [79.0212, 101.0032, 157.0351]  # M+H+, M+Na+ 2M+H+
mz_lock_tmp = 142.1590  # tetramethylpiperidine
mz_lock_hmp = 322.0481  # hexamethoxyphosphazene
mz_is = 361.1201  # fenofibrate

In [22]:
# write DMSO assignments to DB
# the median retention time for DMSO is 48 seconds, but it's a broad peak, so we just use <120s
for mz in mz_dmso:
    with con.con:
        con.con.execute("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'DMSO' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ? AND retention_time_s < 120", (mz - 0.02, mz + 0.02))


In [27]:
# write lock mass assignments to DB
# since lock molecules are part of the solvent, we do not put a retention time constraint
with con.con:
    con.con.execute("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'lock mass tetramethylpiperidine' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ?", (mz_lock_tmp - 0.02, mz_lock_tmp + 0.02))
with con.con:
    con.con.execute("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'lock mass hexamethoxyphosphazene' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ?", (mz_lock_hmp - 0.02, mz_lock_hmp + 0.02))

In [28]:
# write IS assignments to DB
# the median retention time for fenofibrate was 368 s. We allow a 20 s window around this (the latest picked peak is at 388 so the upper bound is not necessary)
with con.con:
    con.con.execute("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'IS fenofibrate' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ? AND retention_time_s > 348", (mz_is - 0.02, mz_is + 0.02))

## Peak assignments - Known reactants/products

We read extracted data from the DB, compare it with the expected masses and save the results back to the db

In [20]:
def calculate_lcms_peak_differences(identifier):

    # import relevant reactants and products
    # import starting material mzs
    sms = con.get_starting_materials_for_reaction(identifier)
    mz_i, mz_m, mz_t = [smiles_to_lcms_mass(smi) for smi in sms]
    mz_iacid = mz_i - 52.0096  # mass difference of degradation KAT-H to carboxylic acid
    mz_baa = mz_m - 124.05243  # mass difference of degradation to beta-amino acid
    mzs = {"I": mz_i, "M": mz_m, "T": mz_t, "I_acid": mz_iacid, "M_bAA": mz_baa}

    # import product mzs
    known_product_smiles = con.get_product_smiles(identifier)
    known_product_mzs = {s: smiles_to_lcms_mass(smi) for s, smi in zip("ABCDEFGH", known_product_smiles) if smi}  # "if smi" bc we sometimes have None for product H
    mzs.update(known_product_mzs)

    # assign known peaks
    for s, mz in mzs.items():
        with con.con:
            con.con.execute("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, ? AS assignment FROM lcms_peaks WHERE experiment_id=? AND mz_max BETWEEN ? AND ?", (s, identifier, mz - 0.02, mz + 0.02))

    # some peaks are still unexplained
    # for these, we calculate the difference to known mzs to see if a pattern emerges
    lcms_peaks = con.get_lcms_peaks(identifier, with_assignment=True)
    # ignore peaks that are already explained or that are insignificant
    unexplained_peaks = lcms_peaks.loc[lcms_peaks.assignment.isna() & (lcms_peaks["%area"] > 5)]
    unexplained_mzs = unexplained_peaks.mz_max.to_numpy()
    mzs_arr = np.array(list(mzs.values()))
    # calculate the differences with all the considered masses
    delta = pd.concat([unexplained_peaks[["experiment_id", "id"]].reset_index(drop=True), pd.DataFrame(unexplained_mzs[:, None] - mzs_arr, columns=list(mzs.keys()))], axis=1)

    # add "H" column if it does not exist
    if "H" not in delta.columns:
        delta["H"] = np.nan
    
    with con.con:
        # persist differences to DB
        con.con.executemany("INSERT INTO lcms_peaks_differences (experiment_id, peak_id, delta_I, delta_M, delta_T, delta_Iacid, delta_bAA, delta_A, delta_B, delta_C, delta_D, delta_E, delta_F, delta_G, delta_H) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);",
                           delta.to_numpy())
    return delta

In [21]:
# we will record any files that cause exceptions for manual inspection
records_with_exceptions = []
# iterate over all experiments/plates
for exp_nr in range(1, 30):
    for plate_nr in range(1, 7):
        # print progress indicator
        now = datetime.now()
        print(f"exp {exp_nr}-{plate_nr}, started {now.strftime('%H:%M:%S')}")

        identifiers = con.get_reaction_ids_for_plate((exp_nr, plate_nr))
        for i in identifiers:
            try:
                calculate_lcms_peak_differences(i)
            except Exception as e:
                records_with_exceptions.append(i)

exp 1-1, started 14:22:14
exp 1-2, started 14:22:17
exp 1-3, started 14:22:19
exp 1-4, started 14:22:21
exp 1-5, started 14:22:24
exp 1-6, started 14:22:26
exp 2-1, started 14:22:29
exp 2-2, started 14:22:31
exp 2-3, started 14:22:33
exp 2-4, started 14:22:36
exp 2-5, started 14:22:38
exp 2-6, started 14:22:41
exp 3-1, started 14:22:43
exp 3-2, started 14:22:45
exp 3-3, started 14:22:48
exp 3-4, started 14:22:51
exp 3-5, started 14:22:54
exp 3-6, started 14:22:56
exp 4-1, started 14:22:59
exp 4-2, started 14:23:01
exp 4-3, started 14:23:04
exp 4-4, started 14:23:07
exp 4-5, started 14:23:09
exp 4-6, started 14:23:12
exp 5-1, started 14:23:14
exp 5-2, started 14:23:17
exp 5-3, started 14:23:19
exp 5-4, started 14:23:22
exp 5-5, started 14:23:25
exp 5-6, started 14:23:28
exp 6-1, started 14:23:30
exp 6-2, started 14:23:33
exp 6-3, started 14:23:35
exp 6-4, started 14:23:38
exp 6-5, started 14:23:40
exp 6-6, started 14:23:42
exp 7-1, started 14:23:45
exp 7-2, started 14:23:47
exp 7-3, sta

In [30]:
len(df["experiment_id"].unique())

55477

## Peak assignments - Find more contaminants

In [31]:
## get all peaks to identify common contaminants
res = con.con.execute('SELECT lcms_peaks.id, lcms_peaks.experiment_id, lcms_peaks.peak_nr, retention_time_s, area, intensity, signal_to_noise, mz_max, fwhm_min, "%area", "%intensity", a.assignment FROM lcms_peaks LEFT JOIN lcms_peaks_assignment a on lcms_peaks.id = a.peak_id;',
                ).fetchall()
df = pd.DataFrame(
    res,
    columns=[
       "peak_id",
        "reaction_id",
        "peak_nr",
        "retention_time_s",
        "area",
        "intensity",
        "signal_to_noise",
        "mz_max",
        "fwhm_min",
        "%area",
        "%intensity",
        "assignment",
    ]
)

In [32]:
df.loc[df.assignment.isna(), "mz_max"].round(2).value_counts()

100.11    194848
128.11     78793
111.04     70752
99.53      61966
84.08      54308
           ...  
553.83         1
756.27         1
766.21         1
79.08          1
253.75         1
Name: mz_max, Length: 27420, dtype: int64

#### m/z 128.11

In [33]:
# what mass exactly do we observe?
df.loc[df["mz_max"].between(128.105, 128.109), "mz_max"].describe()

count    78398.000000
mean       128.106494
std          0.000541
min        128.105000
25%        128.106200
50%        128.106400
75%        128.106700
max        128.109000
Name: mz_max, dtype: float64

In [34]:
# when does the 128.1064 peak occur?
df.loc[df["mz_max"].between(128.105, 128.109), "retention_time_s"].describe()

count    78398.000000
mean       202.097936
std         44.703646
min         44.000000
25%        183.000000
50%        185.000000
75%        189.000000
max        388.000000
Name: retention_time_s, dtype: float64

An m/z of 128.1064 corrsponds to C7H13NO (calc m/z 128.1070)
This could e.g. be 2-azacyclooctanone.

The peak usually occurs after 184 seconds. We will assign anything with ±15s and ±0.002 m/z to this formula


In [35]:
# assign
peak_ids = df.loc[df["mz_max"].between(128.105, 128.109) & df["retention_time_s"].between(169, 199), "peak_id"].to_numpy()
with con.con:
    con.con.executemany('INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, "common contaminant C7H13NO");', peak_ids[:, None].tolist())

In [36]:
# assign
df.loc[df["mz_max"].between(128.105, 128.109) & ~df["retention_time_s"].between(169, 199)]

Unnamed: 0,peak_id,reaction_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,%intensity,assignment
444,445,86392,50,302,1399,3947,1.7,128.1064,0.013,0.053290,4.06,
485,486,86413,32,249,4175,2389,21.6,128.1066,0.029,0.053340,0.13,
627,628,86201,13,201,29472,10757,135.0,128.1066,0.037,0.204459,0.24,
657,658,86201,43,310,476,1841,23.1,128.1065,0.004,0.003304,0.04,
659,660,86201,45,314,2394,1152,15.5,128.1063,0.037,0.016613,0.03,
...,...,...,...,...,...,...,...,...,...,...,...,...
3116160,3116161,70085,41,268,1512,1794,17.3,128.1077,0.017,0.006253,0.06,
3116402,3116403,70001,17,208,2132,1199,14.3,128.1060,0.022,0.009843,0.03,
3117415,3117416,69925,18,200,32,2,209.7,128.1065,0.061,0.000152,0.00,
3117549,3117550,69916,22,200,33,2,297.4,128.1064,0.141,0.000170,0.00,


#### m/z 185.11

In [37]:
# what mass exactly do we observe?
df.loc[df["mz_max"].between(185.10, 185.12), "mz_max"].describe()

count    53431.000000
mean       185.114160
std          0.000348
min        185.106300
25%        185.113900
50%        185.114100
75%        185.114400
max        185.118500
Name: mz_max, dtype: float64

In [38]:
# when does the 185.1141 peak occur?
df.loc[df["mz_max"].between(185.1121, 185.1161), "retention_time_s"].describe()

count    53303.000000
mean       196.797460
std          4.850153
min         64.000000
25%        193.000000
50%        198.000000
75%        200.000000
max        326.000000
Name: retention_time_s, dtype: float64

An m/z of 185.1141 could correspond to C8H18O3Na+ (calc m/z 185.1148)
This could e.g. be diethyl-diethyleneglycol (or other short PEG chains) which are common LCMS contaiminants

The peak usually occurs after 198 seconds. We will assign anything with ±15s and ±0.002 m/z to this formula


In [39]:
# assign
peak_ids = df.loc[df["mz_max"].between(185.1128, 185.1168) & df["retention_time_s"].between(183, 213), "peak_id"].to_numpy()
with con.con:
    con.con.executemany('INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, "common PEG contaminant");', peak_ids[:, None].tolist())

#### m/z 239.09

In [40]:
# what mass exactly do we observe?
df.loc[df["mz_max"].between(239.08, 239.10), "mz_max"].describe()

count    37169.000000
mean       239.088666
std          0.000433
min        239.080700
25%        239.088400
50%        239.088600
75%        239.088900
max        239.099500
Name: mz_max, dtype: float64

In [41]:
# when does the 239.0885 peak occur?
df.loc[df["mz_max"].between(239.0865, 239.0905), "retention_time_s"].describe()

count    36976.000000
mean       196.820532
std          3.800570
min         46.000000
25%        196.000000
50%        197.000000
75%        198.000000
max        297.000000
Name: retention_time_s, dtype: float64

An m/z of 239.0885 could correspond to many different compounds.
The peak usually occurs after 196 seconds. We will assign anything with ±15s and ±0.002 m/z to "common contaminant"


In [42]:
# assign
peak_ids = df.loc[df["mz_max"].between(239.0865, 239.0905) & df["retention_time_s"].between(181, 211), "peak_id"].to_numpy()
with con.con:
    con.con.executemany('INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, "common contaminant");', peak_ids[:, None].tolist())

## Identify systematic mass differences
Here's an idea how to go about identifying systematic mass differences:
Identify those records, where a certain mass difference (e.g. delta_T) is within a narrow span, but which use different (relevant) building blocks, in the example different terminators.
Rationale: If the observed m/z is dependent on terminator mass, the responsible species contains the terminator in some form.

In [89]:
reaction_ids = con.get_reaction_ids_for_building_block(filter_exp_nr=(4, 29))
df = pd.concat([con.get_lcms_peaks(i, with_delta=True, with_assignment=True, with_building_blocks=True) for i in reaction_ids]).reset_index(drop=True)
# filter a bit: only those with a retention time > 3 min (180 s), and only with an m/z over 200
df = df.loc[(df["retention_time_s"] > 180) & (df["mz_max"] > 200)]
df

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
0,522635,10578,72,196,597544,103890,39.8,239.0890,0.048,9.17523,...,20.950312,-259.072270,-305.077750,-287.067185,-59.951427,-52.006049,-145.031845,-117.036930,-104.058479,common contaminant
1,522719,10578,156,321,746204,245426,95.3,342.1919,0.040,11.45789,...,124.053212,-155.969370,-201.974850,-183.964285,43.151473,51.096851,-41.928945,-13.934030,-0.955579,M
2,522750,10578,187,343,5609629,2104344,826.3,259.1262,0.041,86.13517,...,40.987512,-239.035070,-285.040550,-267.029985,-39.914227,-31.968849,-124.994645,-96.999730,-84.021279,T cyclohexanone condensate
3,524124,10579,83,196,453060,88700,44.1,239.0889,0.052,6.79321,...,20.950212,-223.047218,-269.052698,-251.042133,-23.926375,20.044155,-145.031945,-117.037030,-68.033427,common contaminant
4,524156,10579,115,245,371927,72549,35.9,274.2743,0.088,5.57669,...,56.135612,-187.861818,-233.867298,-215.856733,11.259025,55.229555,-109.846545,-81.851630,-32.848027,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205656,401424,85521,28,267,444307,136591,59.2,475.1570,0.043,25.84048,...,291.080167,55.998207,9.992728,28.003293,221.057200,198.074229,131.027734,159.022648,205.082271,
205657,401428,85521,32,274,140629,47175,20.2,473.1438,0.041,8.17887,...,289.066967,53.985007,7.979528,25.990093,219.044000,196.061029,129.014534,157.009448,203.069071,
205658,401431,85521,35,293,124932,41486,17.4,691.2559,0.043,7.26593,...,507.179067,272.097107,226.091628,244.102193,437.156100,414.173129,347.126634,375.121548,421.181171,
205659,401434,85521,38,305,131011,48852,20.3,711.2244,0.043,7.61949,...,527.147567,292.065607,246.060128,264.070693,457.124600,434.141629,367.095134,395.090048,441.149671,


In [90]:
# get unique differences
t_diff = df.round(2).groupby("delta_T")["terminator"].unique()
t_diff

delta_T
-92.04         [T26]
-91.91         [T26]
-91.50         [T26]
-90.93         [T26]
-90.07         [T26]
             ...    
 803.27    [T21, T2]
 804.26        [T39]
 807.24        [T18]
 821.26         [T1]
 841.22         [T1]
Name: terminator, Length: 8984, dtype: object

In [91]:
t_diff.loc[t_diff.apply(lambda x: len(x) > 20)]

delta_T
80.06     [T25, T34, T18, T22, T36, T1, T39, T28, T32, T...
101.03    [T20, T31, T32, T25, T22, T36, T17, T1, T37, T...
103.01    [T18, T12, T2, T23, T11, T29, T19, T33, T5, T2...
110.02    [T25, T39, T34, T17, T36, T18, T22, T1, T23, T...
153.02    [T25, T36, T1, T8, T18, T22, T7, T3, T40, T12,...
166.92    [T31, T40, T20, T7, T3, T34, T39, T17, T36, T1...
173.05    [T27, T41, T9, T37, T10, T14, T35, T12, T2, T1...
Name: terminator, dtype: object

#### <sup>81</sup>Br isotope

In [92]:
df.loc[df["delta_T"].between(166.915, 166.925)]

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
6397,435466,13160,42,344,681948,277515,108.0,349.9781,0.043,21.03909,...,219.891842,-109.070389,-155.075868,-137.065304,1.998025,50.893221,8.964951,36.959866,90.888136,D 81Br
6398,435468,13160,44,344,895042,284261,3795.0,349.9781,0.046,27.61334,...,219.891842,-109.070389,-155.075868,-137.065304,1.998025,50.893221,8.964951,36.959866,90.888136,D 81Br
6403,452354,13161,48,274,417192,112765,1181.0,360.9690,0.053,6.36450,...,230.882742,-109.070321,-155.075801,-137.065236,1.998093,39.902457,19.955851,47.950766,90.888204,D 81Br
6404,452355,13161,49,274,398529,116612,48.7,360.9689,0.053,6.07978,...,230.882642,-109.070421,-155.075901,-137.065336,1.997993,39.902357,19.955751,47.950666,90.888104,D 81Br
6408,443039,13162,55,335,403455,172618,56.5,325.9238,0.042,5.99284,...,195.837542,-109.070545,-155.076025,-137.065460,1.997869,74.947209,-15.089349,12.905566,90.88798,D 81Br
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203114,379448,84915,51,371,414345,142573,2415.8,387.9546,0.044,6.21591,...,114.810102,-252.128952,-298.134431,-280.123867,1.997705,12.916081,-96.116792,-68.121878,-52.170427,D 81Br
203125,376386,84917,45,343,825214,330219,124.2,349.9777,0.044,11.96499,...,76.833202,-252.129032,-298.134511,-280.123947,1.997625,50.892821,-134.093692,-106.098778,-52.170507,D 81Br
203126,376387,84917,46,343,1073677,340251,2920.8,349.9778,0.046,15.56751,...,76.833302,-252.128932,-298.134411,-280.123847,1.997725,50.892921,-134.093592,-106.098678,-52.170407,D 81Br
204428,393234,85237,48,344,662514,261112,74.3,349.9777,0.044,9.35856,...,165.900867,-163.061367,-209.066846,-191.056282,1.997625,50.892821,-45.026027,-17.031113,36.897158,D 81Br


These are actually all caused by bromine isotope 81Br. The mass difference to 79Br is 1.9979535, which coincides with delta_T.

In [93]:
delta_br_isotopes = (1.9959, 1.9999)

In [94]:
# obtain a list of building blocks that contain bromine
building_blocks = [x[0] for x in con.con.execute("SELECT short FROM building_block_shorts;").fetchall()]
has_bromine = ["Br" in con.get_smiles(bb) for bb in building_blocks]
building_blocks_with_bromine = [x[0] for x in zip(building_blocks, has_bromine) if x[1]]
building_blocks_with_bromine

['I4', 'I11', 'I23', 'I53', 'I68', 'I78', 'M7', 'T5', 'T15', 'I53']

In [95]:
# select based on the isotope mass difference, but only if the relevant building blocks actually contain bromine
peaks_br_d = df.loc[df["delta_D"].between(*delta_br_isotopes) & (df["initiator"].isin(building_blocks_with_bromine) | df["terminator"].isin(building_blocks_with_bromine)), "id"].to_numpy()
with con.con:
    con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, 'D 81Br');", peaks_br_d[:, None].tolist())

Now obviously, the same problem may occur for other products, so we check the other differences as well

In [96]:
# select based on the isotope mass difference, but only if the relevant building blocks actually contain bromine
peaks_br_a = df.loc[df["delta_A"].between(1.9976, 1.9982) & (df["initiator"].isin(building_blocks_with_bromine) | df["monomer"].isin(building_blocks_with_bromine) | df["terminator"].isin(building_blocks_with_bromine)), "id"].to_numpy()
peaks_br_b = df.loc[df["delta_B"].between(1.9976, 1.9982) & (df["initiator"].isin(building_blocks_with_bromine) | df["monomer"].isin(building_blocks_with_bromine) | df["terminator"].isin(building_blocks_with_bromine)), "id"].to_numpy()
peaks_br_c = df.loc[df["delta_C"].between(1.9976, 1.9982) & (df["initiator"].isin(building_blocks_with_bromine) | df["monomer"].isin(building_blocks_with_bromine) | df["terminator"].isin(building_blocks_with_bromine)), "id"].to_numpy()
peaks_br_e = df.loc[df["delta_E"].between(1.9976, 1.9982) & df["terminator"].isin(building_blocks_with_bromine), "id"].to_numpy()
peaks_br_f = df.loc[df["delta_F"].between(1.9976, 1.9982) & (df["initiator"].isin(building_blocks_with_bromine) | df["monomer"].isin(building_blocks_with_bromine)), "id"].to_numpy()
peaks_br_g = df.loc[df["delta_G"].between(1.9976, 1.9982) & (df["initiator"].isin(building_blocks_with_bromine) | df["monomer"].isin(building_blocks_with_bromine)), "id"].to_numpy()
peaks_br_h = df.loc[df["delta_H"].between(1.9976, 1.9982) & (df["monomer"].isin(building_blocks_with_bromine) | df["terminator"].isin(building_blocks_with_bromine)), "id"].to_numpy()

# check how many we found
for s, i in zip("ABCDEFGH", [peaks_br_a, peaks_br_b, peaks_br_c, peaks_br_d, peaks_br_e, peaks_br_f, peaks_br_g, peaks_br_h]):
    print(f"{s}: {len(i)}")

A: 41
B: 40
C: 0
D: 585
E: 215
F: 46
G: 5
H: 0


In [97]:
# commit to DB
with con.con:
    for s, i in zip("ABCEFGH", [peaks_br_a, peaks_br_b, peaks_br_c, peaks_br_e, peaks_br_f, peaks_br_g, peaks_br_h]):
        assignment = [f"{s} 81Br" for _ in range(len(i))]
        con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", tuple(zip(i.tolist(), assignment)))

In [98]:
count_assigned_peaks()

Unnamed: 0_level_0,experiment_id
assignment,Unnamed: 1_level_1
A,22943
A 81Br,25
A hydroxyquinone lost oxygen substituent,36
B,14541
B 81Br,25
B hydroxyquinone lost oxygen substituent,61
C,3630
D,25151
D 81Br,340
D hydroxyquinone lost oxygen substituent,202


#### 4-hydroxy version of 8-Quin-4-alkoxy-KATS

In [99]:
df.loc[df["delta_T"].between(153.015, 153.025)]

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
551,532420,10688,48,279,954653,290147,92.7,332.0860,0.048,12.83589,...,86.098735,-240.984287,-286.989766,-268.979202,-14.014863,40.990951,-126.943862,-98.948947,-38.910059,D hydroxyquinone lost oxygen substituent
581,528619,10693,25,239,670829,240257,95.0,325.0761,0.046,9.70599,...,79.088835,-240.984351,-286.989830,-268.979265,-14.014927,48.000723,-133.953762,-105.958847,-38.910123,
584,530939,10694,38,269,2741743,880861,297.3,279.0588,0.046,37.88788,...,33.071535,-240.984938,-286.990417,-268.979853,-14.015514,30.007330,-179.971062,-151.976147,-38.91071,D hydroxyquinone lost oxygen substituent
589,526455,10695,45,317,788845,159523,69.4,329.0752,0.068,10.96065,...,83.087935,-240.984188,-286.989667,-268.979103,-14.014764,-20.007571,-129.954662,-101.959747,-38.90996,D hydroxyquinone lost oxygen substituent
591,524027,10696,34,294,1140407,354130,135.7,293.0747,0.048,15.20169,...,47.087435,-240.984688,-286.990167,-268.979603,-14.015264,15.991929,-165.955162,-137.960247,-38.91046,D hydroxyquinone lost oxygen substituent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198475,482313,84027,38,326,1014524,60218,20.8,373.9919,0.234,15.39831,...,177.895077,-191.094347,-237.099826,-219.089262,-14.015368,-0.915960,-35.147516,-7.152602,10.979881,D hydroxyquinone lost oxygen substituent
198501,481948,84030,33,239,364263,117819,45.2,325.0753,0.050,6.42974,...,128.978477,-191.094705,-237.100185,-219.089620,-14.015727,47.999923,-84.064116,-56.069202,10.979522,
198509,484162,84031,39,269,1058978,432597,101.2,279.0581,0.041,22.47033,...,82.961277,-191.095193,-237.100672,-219.090107,-14.016214,30.006630,-130.081316,-102.086402,10.979035,D hydroxyquinone lost oxygen substituent
198520,492769,84033,39,294,695881,219059,63.2,293.0740,0.047,10.21235,...,96.977177,-191.094943,-237.100422,-219.089857,-14.015964,15.991229,-116.065416,-88.070502,10.979285,D hydroxyquinone lost oxygen substituent


In [100]:
# these all use either I21 or I22. What's unique about these?
con.get_smiles("I21")

'COc1ccnc2c(C(=O)[B-](F)(F)F)cccc12.[K+]'

In [101]:
con.get_smiles("I22")

'O=C(c1cccc2c(OCc3ccccc3)ccnc12)[B-](F)(F)F.[K+]'

Both contain a 4-hydroxyquinone moiety. The other 8-Quin KATs we have are substituted on the 5-position.

The mass difference corresponds to absence of the side on the oxygen atom (i.e. the methyl group for 8-Quin003/I21 and the benzyl group for 8-Quin004/I22).
The mass differences for OR-->OH are:
- 8-Quin003/I21: -14.01565
- 8-Quin004/I22: -90.04695

In [102]:
delta_quin003 = (-14.01565 - 0.002, -14.01565 + 0.002)
delta_quin004 = (-90.04695 - 0.002, -90.04695 + 0.002)

In [103]:
# add assignment to DB
# select based on the isotope mass difference, but only if the relevant building blocks actually contain bromine
for s in "ABCDEFGH":
    peaks_hydroxy_quin = df.loc[(df[f"delta_{s}"].between(*delta_quin003) & (df["initiator"] == "I21")) | (df[f"delta_{s}"].between(*delta_quin004) & (df["initiator"] == "I22")), "id"].to_numpy()
    print(f"Found {len(peaks_hydroxy_quin)} peaks for product {s}")
    assignment = [f"{s} hydroxyquinone lost oxygen substituent" for _ in range(len(peaks_hydroxy_quin))]
    with con.con:
        con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", tuple(zip(peaks_hydroxy_quin.tolist(), assignment)))

Found 47 peaks for product A
Found 95 peaks for product B
Found 1 peaks for product C
Found 229 peaks for product D
Found 0 peaks for product E
Found 234 peaks for product F
Found 7 peaks for product G
Found 0 peaks for product H


#### F->OH exchange on aromatic F-containing KATs (−1.9957)
(mostly I6, containing 3-fluoropyridine)

In [104]:
df.loc[df["delta_T"].between(110.015, 110.025)]

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
447,519534,10668,67,345,450027,294935,74.8,289.0799,0.030,6.28280,...,43.092635,-299.008450,-345.013929,-327.003365,-72.039026,-2.015149,-184.968025,-156.973110,-81.916159,
448,519535,10668,68,345,882828,297775,2990.4,289.0799,0.043,12.32510,...,43.092635,-299.008450,-345.013929,-327.003365,-72.039026,-2.015149,-184.968025,-156.973110,-81.916159,
451,520207,10669,34,245,362719,104112,38.6,253.0544,0.050,5.19895,...,7.067135,-299.008798,-345.014277,-327.003713,-72.039374,34.009655,-220.993525,-192.998610,-81.916507,
465,518670,10671,55,349,464172,140949,1888.8,306.1067,0.048,14.12194,...,60.119435,-299.008199,-345.013678,-327.003114,-72.038775,-19.041448,-167.941225,-139.946310,-81.915909,
466,518671,10671,56,349,215996,140566,30.4,306.1067,0.049,6.57146,...,60.119435,-299.008199,-345.013678,-327.003114,-72.038775,-19.041448,-167.941225,-139.946310,-81.915909,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198313,483843,84006,38,245,335360,103763,1004.1,253.0537,0.047,5.09938,...,56.956877,-249.119053,-295.124532,-277.113967,-72.040074,34.008955,-171.103779,-143.108865,-32.026762,
198335,479601,84008,45,350,125573,36802,559.3,306.1058,0.050,12.88179,...,110.008977,-249.118654,-295.124133,-277.113568,-72.039675,-19.042348,-118.051679,-90.056765,-32.026363,
198344,477892,84010,48,312,569737,153714,50.2,282.0692,0.044,8.58492,...,85.972377,-249.118868,-295.124348,-277.113783,-72.039890,4.993823,-142.088279,-114.093365,-32.026578,
198345,477893,84010,49,312,490992,155109,1908.7,282.0692,0.046,7.39837,...,85.972377,-249.118868,-295.124348,-277.113783,-72.039890,4.993823,-142.088279,-114.093365,-32.026578,


In [105]:
con.get_smiles("I43")

'C[Si](C)(C)C#Cc1ccc(C(=O)[B-](F)(F)F)cc1.[K+]'

This mass difference (occurring exclusively with I43) corresponds to TMS->H exchange

In [106]:
df.loc[df["delta_T"].between(103.005, 103.015)]

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
39845,1295922,30023,21,296,1000213,66886,10.5,243.0582,0.241,5.958616,...,82.961381,-238.143440,-284.148919,-266.138354,-97.064461,-34.024571,-163.113913,-135.118999,-3.036515,
58038,1783555,39902,29,270,2286821,764960,281.0,263.0034,0.044,18.167640,...,130.901492,-115.080369,-161.085848,-143.075284,-1.996305,-53.970126,-20.105465,7.889449,24.958222,
58039,1783556,39902,30,270,834886,271477,2515.0,263.0034,0.045,6.632750,...,130.901492,-115.080369,-161.085848,-143.075284,-1.996305,-53.970126,-20.105465,7.889449,24.958222,
58042,1796373,39903,22,241,1854356,408540,145.3,247.0330,0.059,21.268640,...,114.931092,-115.080320,-161.085799,-143.075234,-1.996256,-37.999627,-36.075865,-8.080951,24.958271,
58051,1789198,39905,27,231,2750829,605496,279.7,287.0594,0.060,34.862830,...,119.977893,-150.059653,-196.065132,-178.054568,-1.995990,-14.015977,-31.029064,-3.034149,-10.021062,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146653,1740584,70953,17,241,3691226,782140,276.9,247.0330,0.062,58.894230,...,-78.122811,-308.134226,-354.139706,-336.129141,-1.996256,-37.999627,-229.129772,-201.134857,-168.095635,
146666,1753433,70955,32,231,1016229,232625,81.9,287.0595,0.059,10.308200,...,115.001511,-155.036039,-201.041518,-183.030953,-1.995890,-14.015877,-36.005450,-8.010535,-14.997448,
146676,1754259,70956,26,285,1768655,658128,205.3,297.0302,0.041,16.125710,...,124.972211,-155.036011,-201.041490,-183.030925,-1.995862,-87.996040,-26.034750,1.960165,-14.99742,
146683,1753308,70957,50,333,428707,99893,32.3,395.1178,0.056,5.009430,...,223.059811,-155.035253,-201.040733,-183.030168,-1.995105,-122.072606,72.052850,100.047765,-14.996662,


In [107]:
con.get_smiles("I6")

'O=C(c1ccc(F)cn1)[B-](F)(F)F.[K+]'

This mass difference (occurring exclusively with I6) corresponds to F->OH exchange (expected difference -1.99566)

In [108]:
delta_f_to_oh = (-1.99566 - 0.002, -1.99566 + 0.002)

In [109]:
# obtain a list of building blocks that contain fluorine, other than the BF3 group
# so for initiators, there need to be at least 4 fluorines, for M and T at least 1
building_blocks = [x[0] for x in con.con.execute("SELECT short FROM building_block_shorts;").fetchall()]
has_arom_fluorine = ["c(F)" in con.get_smiles(bb) for bb in building_blocks]  # not 100% safe, but good enough heuristic
building_blocks_with_arom_fluorine = [x[0] for x in zip(building_blocks, has_arom_fluorine) if x[1]]
list(set(building_blocks_with_arom_fluorine))

['I65', 'I3', 'T2', 'I5', 'I66', 'I6', 'M65', 'T16', 'I48', 'M62']

In [110]:
# add assignment to DB
# select based on the isotope mass difference, but only if the relevant building blocks actually contain aromatic fluorine
for s in "ABCDEFGH":
    peaks_f_to_oh = df.loc[(df[f"delta_{s}"].between(*delta_f_to_oh) & (df["initiator"].isin(building_blocks_with_arom_fluorine) | df["monomer"].isin(building_blocks_with_arom_fluorine) | df["terminator"].isin(building_blocks_with_arom_fluorine))), "id"].to_numpy()
    print(f"Found {len(peaks_f_to_oh)} peaks for product {s}")
    assignment = [f"{s} fluorine to OH exchange" for _ in range(len(peaks_f_to_oh))]
    with con.con:
        con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", tuple(zip(peaks_br_f.tolist(), assignment)))

Found 17 peaks for product A
Found 46 peaks for product B
Found 1 peaks for product C
Found 306 peaks for product D
Found 0 peaks for product E
Found 24 peaks for product F
Found 0 peaks for product G
Found 0 peaks for product H


#### Terminator + cyclohexanone condensation (+80.06)

In [111]:
df.loc[df["delta_T"].between(80.055, 80.065)].sort_values(["monomer", "terminator"])

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
183427,776139,79851,42,332,474388,209429,100.0,206.0989,0.036,8.09507,...,-133.056166,-356.101737,-402.107216,-384.096651,-35.964515,-42.952570,-295.087860,-267.092946,-205.038408,T cyclohexanone condensate
183522,779825,79871,42,332,354264,154346,49.9,206.0990,0.035,5.20091,...,-133.056066,-425.134334,-471.139813,-453.129248,-104.997112,-42.952470,-364.120458,-336.125543,-205.038308,T cyclohexanone condensate
183835,774359,79931,38,332,582202,255373,95.4,206.0990,0.037,9.24560,...,-133.056066,-340.106722,-386.112201,-368.101637,-19.969500,-42.952470,-279.092846,-251.097931,-205.038308,T cyclohexanone condensate
183950,772304,79951,51,332,1013855,412097,142.4,206.0989,0.039,15.87101,...,-133.056166,-440.145333,-486.150812,-468.140247,-120.008111,-42.952570,-379.131457,-351.136542,-205.038408,T cyclohexanone condensate
184246,786389,80011,40,332,1028077,411293,152.0,206.0988,0.039,14.54953,...,-133.056266,-316.081770,-362.087249,-344.076685,4.055452,-42.952670,-255.067894,-227.072979,-205.038508,T cyclohexanone condensate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182205,924561,79540,50,299,277172,87293,22.5,252.1165,0.046,5.35046,...,80.058511,-139.024512,-185.029991,-167.019426,14.015637,-24.958877,-31.993923,-3.999008,-37.940448,T cyclohexanone condensate
182316,914469,79560,24,299,3729885,1093574,321.8,252.1161,0.049,75.98351,...,80.058111,-249.036539,-295.042018,-277.031453,-95.996390,-24.959277,-142.005950,-114.011035,-37.940848,T cyclohexanone condensate
182657,914778,79640,20,299,304741,95732,31.9,252.1164,0.048,6.22826,...,80.058411,-226.980696,-272.986175,-254.975611,-73.940548,-24.958977,-119.950107,-91.955193,-37.940548,T cyclohexanone condensate
182748,928711,79660,29,299,415807,129593,49.4,252.1164,0.047,6.50729,...,80.058411,-177.979139,-223.984618,-205.974053,-24.938990,-24.958977,-70.948550,-42.953635,-37.940548,T cyclohexanone condensate


The mass difference +80.0622 (compared to terminator) occurs for many peaks and occurs across most I, M, and T. Something interesting may be going on here.

In [112]:
con.get_product_smiles(79851)

('COc1ccc(C(=O)N[C@H]2CN(C(=O)OCCc3ccc(OC)c(OC)c3)C[C@H]2c2nc3ccccc3s2)cc1',
 'COc1ccc(C(=O)N[C@H]2CN(C(=O)OCCc3ccc(OC)c(OC)c3)C[C@H]2C2(C(=O)O)Nc3ccccc3S2)cc1',
 'COc1ccc(C2=[N+]3c4ccccc4SC3(C(=O)[O-])[C@@H]3CN(C(=O)OCCc4ccc(OC)c(OC)c4)C[C@@H]3N2)cc1',
 'COc1ccc(-c2nc3ccccc3s2)cc1',
 'Nc1ccccc1SSc1ccccc1N',
 'COc1ccc(C(=O)N[C@H]2CN(C(=O)OCCc3ccc(OC)c(OC)c3)C[C@H]2C(=O)C(=O)O)cc1',
 'COc1ccc(C(=O)N[C@H]2CN(C(=O)OCCc3ccc(OC)c(OC)c3)C[C@H]2C(=O)O)cc1',
 'COc1ccc(CCOC(=O)N2CC=C(c3nc4ccccc4s3)C2)cc1OC')

In [113]:
con.get_smiles("T22")

'COc1ccc(C(=S)N[NH3+])cc1.[Cl-]'

In [114]:
con.get_smiles("T23")

'COc1ccc(C(=S)NN)cn1.Cl'

In [115]:
con.get_smiles("T25")

'Cl.NNC(=S)/C=C/c1ccccc1'

This seems to be the terminator, condensed with cyclohexanone to form something like `Sc1c(/N=C2CCCCC/2)cccc1` or `Sc1c(NC2=CCCCC2)cccc1` or `Nc1c(SC2=CCCCC2)cccc1`.
Under certain circumstances (reaction with the T-dimer) `c1(NC2CCCCC2S3)c3cccc1` may also be possible.
(In theory, reaction with hexylKAT would explain the mass, too – but it is highly improbable that we would somehow produce that)

The expected mass difference for any of these with the terminator is 80.0626 (C6H8).

In [116]:
delta_cyclohexanone_condensation = (80.0626 - 0.002, 80.0626 + 0.002)

In [117]:
# find and count occurences
peaks_cyclohexanone_condensation = df.loc[df["delta_T"].between(*delta_cyclohexanone_condensation), "id"].to_numpy()
print(len(peaks_cyclohexanone_condensation))

15339


In [118]:
assignment = [f"T cyclohexanone condensate" for _ in range(len(peaks_cyclohexanone_condensation))]
with con.con:
    con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", tuple(zip(peaks_cyclohexanone_condensation.tolist(), assignment)))

In [119]:
count_assigned_peaks()

Unnamed: 0_level_0,experiment_id
assignment,Unnamed: 1_level_1
A,22943
A 81Br,25
A hydroxyquinone lost oxygen substituent,47
B,14541
B 81Br,25
B hydroxyquinone lost oxygen substituent,93
C,3630
C hydroxyquinone lost oxygen substituent,1
D,25151
D 81Br,340


#### Monomer - H<sub>4</sub>O<sub>2</sub> (−36.02)

In [120]:
# let's try to be systematic and find the most common differences
for s in "IMTABCDEFGH":
    mz_delta_count = df.round(2).groupby(f"delta_{s}")["id"].count().sort_values(ascending=False)
    print("Frequent (>1000 examples) of mass differences for", s)
    print(mz_delta_count[mz_delta_count > 1000].index.tolist())

Frequent (>1000 examples) of mass differences for I
[]
Frequent (>1000 examples) of mass differences for M
[0.0, -124.05, -36.02, -56.06]
Frequent (>1000 examples) of mass differences for T
[80.06, 110.04, 215.08, 99.04, 77.04, 73.01]
Frequent (>1000 examples) of mass differences for A
[]
Frequent (>1000 examples) of mass differences for B
[]
Frequent (>1000 examples) of mass differences for C
[]
Frequent (>1000 examples) of mass differences for D
[-19.97]
Frequent (>1000 examples) of mass differences for E
[78.05, -35.96, -37.99, -9.96, -24.96, -5.95, -73.94]
Frequent (>1000 examples) of mass differences for F
[-56.06]
Frequent (>1000 examples) of mass differences for G
[-28.07]
Frequent (>1000 examples) of mass differences for H
[]


In [121]:
peaks = df.loc[df.delta_M.round(2) == -36.02]
peaks

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
1836,552157,10970,19,295,492813,36196,8.3,413.1888,0.266,6.86842,...,88.032985,-236.028876,-282.034355,-264.023790,70.109095,38.280940,-80.082045,-52.087131,,M - H4O2
2421,552613,11089,26,301,446999,29492,9.6,413.1891,0.247,6.79770,...,88.033285,-184.058295,-230.063774,-212.053209,122.079676,194.144355,-106.043022,-78.048107,,M - H4O2
3991,559383,11402,14,199,468603,157782,42.5,305.1315,0.038,6.68689,...,88.034343,-211.082055,-257.087534,-239.076969,-13.002742,-31.972400,-106.041964,-78.047049,-27.974114,
4006,555980,11404,14,199,1048669,420338,98.4,305.1314,0.041,14.12038,...,88.034243,-167.055940,-213.061419,-195.050855,31.023373,56.079930,-106.042064,-78.047149,16.052001,
4011,571266,11405,15,200,1349225,421482,130.7,305.1314,0.045,18.17676,...,88.034243,-217.071590,-263.077069,-245.066505,-18.992277,-43.951371,-106.042064,-78.047149,-33.963649,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191731,717205,81584,8,183,326540,38293,12.6,272.1077,0.150,5.22887,...,88.030867,-164.016104,-210.021583,-192.011019,1.042888,-26.977179,-45.980764,-17.985849,-40.972842,
197753,482395,83931,14,227,370394,104660,40.8,284.1279,0.062,5.13490,...,88.031077,-138.964957,-184.970436,-166.959871,38.114022,35.076430,-77.951080,-49.956166,16.048835,
200992,496010,84523,21,233,530792,210256,82.7,245.0916,0.039,7.72378,...,88.030828,-242.051856,-288.057335,-270.046771,-104.008929,-31.991171,-167.022330,-139.027415,2.032936,
201034,504339,84536,31,234,520440,145351,67.2,245.0915,0.061,7.89864,...,88.030728,-245.037703,-291.043182,-273.032618,-106.994776,26.046755,-167.022430,-139.027515,-0.952911,


In [122]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

I: 69
M: 22
T: 37


In [123]:
# let's check which monomers are involved
for short in peaks.monomer.unique():
    print(short, con.get_long_name(short))

M62 Spiro004
M52 Mon098
M29 Mon036
M10 Mon002
M27 Mon031
M35 Mon077
M41 Mon087
M9 Mon001
M13 Mon007
M36 Mon078
M21 Mon019
M48 Mon094
M46 Mon092
M45 Mon091
M68 Spiro011
M30 Mon049
M14 Mon011
M55 Mon101
M23 Mon021
M51 Mon097
M22 Mon020
M28 Mon033


In [278]:
con.get_product_smiles(14739)

('O=C(N[C@H](Cc1nnc(-c2ccccc2Cl)s1)C1(c2ccc(Cl)cc2)CC1)c1ccc(F)c(F)c1',
 'O=C(N[C@H](CC1(C(=O)O)NN=C(c2ccccc2Cl)S1)C1(c2ccc(Cl)cc2)CC1)c1ccc(F)c(F)c1',
 'O=C([O-])C12C[C@H](C3(c4ccc(Cl)cc4)CC3)NC(c3ccc(F)c(F)c3)=[N+]1N=C(c1ccccc1Cl)S2',
 'Fc1ccc(-c2nnc(-c3ccccc3Cl)s2)cc1F',
 'Clc1ccccc1-c1nnc(-c2ccccc2Cl)s1',
 'O=C(O)C(=O)C[C@@H](NC(=O)c1ccc(F)c(F)c1)C1(c2ccc(Cl)cc2)CC1',
 'O=C(O)C[C@@H](NC(=O)c1ccc(F)c(F)c1)C1(c2ccc(Cl)cc2)CC1',
 'Clc1ccc(C2(C=Cc3nnc(-c4ccccc4Cl)s3)CC2)cc1')

In [287]:
con.get_product_smiles(58109)

('CC(C)C[C@H](Cc1nnc(-c2ccccc2)s1)NC(=O)c1ccc(OCCCN=[N+]=[N-])cc1',
 'CC(C)C[C@H](CC1(C(=O)O)NN=C(c2ccccc2)S1)NC(=O)c1ccc(OCCCN=[N+]=[N-])cc1',
 'CC(C)C[C@@H]1CC2(C(=O)[O-])SC(c3ccccc3)=N[N+]2=C(c2ccc(OCCCN=[N+]=[N-])cc2)N1',
 '[N-]=[N+]=NCCCOc1ccc(-c2nnc(-c3ccccc3)s2)cc1',
 'c1ccc(-c2nnc(-c3ccccc3)s2)cc1',
 'CC(C)C[C@H](CC(=O)C(=O)O)NC(=O)c1ccc(OCCCN=[N+]=[N-])cc1',
 'CC(C)C[C@H](CC(=O)O)NC(=O)c1ccc(OCCCN=[N+]=[N-])cc1',
 'CC(C)CC=Cc1nnc(-c2ccccc2)s1')

In [289]:
con.get_smiles("M27")

'Cl.O=C1OC2(CCCCC2)O[C@@]12C[C@H](Cc1ccc(OCc3ccccc3)cc1)NO2'

This peak comes from the monomer only.
Actually, the mz value can typically be found in the QC LCMS data of the individual monomer (I checked for M36, M27 and M13 and found it in 3 out of 4 QC traces as a small but defined peak).
We conclude that the contaminant is carried over from the monomer stock solutions.

The m/z corresponds to monomer −H4O2 (−36.0211).
We don't have a plausible structure, but we mark this peak M−H4O2

In [305]:
delta_h4o2 = (-36.0211 - 0.002, -36.0211 + 0.002)

In [307]:
# find and count occurences
peaks_m_h4o2 = df.loc[df["delta_M"].between(*delta_h4o2), "id"].to_numpy()
print(len(peaks_m_h4o2))

1610


In [308]:
# commit to DB
assignment = [f"M - H4O2" for _ in range(len(peaks_m_h4o2))]
with con.con:
    con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", tuple(zip(peaks_m_h4o2.tolist(), assignment)))

#### Boc -> carbamic acid MS fragmentation (-56.06)


In [50]:
peaks = df.loc[df.delta_M.round(2) == -56.06]
peaks

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
3387,565445,11299,43,240,985674,187066,64.0,285.0944,0.091,13.99819,...,67.997243,-238.078687,-284.084166,-266.073602,-39.999374,66.049655,-160.063414,-132.068499,-20.986396,
3388,565451,11299,49,241,938194,131968,1236.8,285.0944,0.112,13.32390,...,67.997243,-238.078687,-284.084166,-266.073602,-39.999374,66.049655,-160.063414,-132.068499,-20.986396,
10457,641312,14868,29,322,1327257,478786,197.7,341.1710,0.042,16.27546,...,67.990113,-251.048990,-297.054469,-279.043905,3.114052,-33.867519,-95.036830,-67.041916,-98.990413,F (-tert-butyl fragmentation in MS for Boc or ...
10465,640518,14869,36,322,1062099,376018,123.3,341.1710,0.044,16.79558,...,67.990113,-217.022633,-263.028112,-245.017548,37.140409,34.185195,-95.036830,-67.041916,-64.964056,F (-tert-butyl fragmentation in MS for Boc or ...
10466,640519,14869,37,323,433626,136246,1975.1,341.1710,0.047,6.85718,...,67.990113,-217.022633,-263.028112,-245.017548,37.140409,34.185195,-95.036830,-67.041916,-64.964056,F (-tert-butyl fragmentation in MS for Boc or ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205582,395387,85507,49,322,765245,313410,103.3,341.1706,0.041,11.24804,...,67.989713,-210.098057,-256.103536,-238.092971,44.064986,42.085721,-92.062717,-64.067802,-61.013993,M (Boc->carbamic acid fragmentation in MS)
205590,400344,85508,58,322,493896,167821,62.7,341.1705,0.046,8.14605,...,67.989613,-221.088989,-267.094468,-249.083904,33.074054,20.103957,-92.062817,-64.067902,-72.004925,M (Boc->carbamic acid fragmentation in MS)
205597,396306,85509,46,323,818404,347601,152.7,341.1704,0.039,12.29023,...,67.989513,-186.044113,-232.049592,-214.039028,68.118930,90.193809,-92.062917,-64.068002,-36.960049,M (Boc->carbamic acid fragmentation in MS)
205616,396108,85512,46,322,652535,208172,87.8,341.1706,0.045,11.16905,...,67.989713,-171.062006,-217.067485,-199.056920,83.101037,120.157824,-92.062717,-64.067802,-21.977942,M (Boc->carbamic acid fragmentation in MS)


In [51]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

I: 53
M: 9
T: 41


In [52]:
# let's check which monomers are involved
for short in peaks.monomer.unique():
    print(short, con.get_long_name(short))

M52 Mon098
M34 Mon076
M19 Mon016
M16 Mon013
M18 Mon015
M17 Mon014
M5 Mon096
M44 Mon090
M68 Spiro011


In [53]:
# let's check if all of these have boc
print("M (#boc, #cbz, #tbu, #tms)")
for short in peaks.monomer.unique():
    print(short, con.list_pg(short))

M (#boc, #cbz, #tbu, #tms)
M52 (0, 0, 0, 0)
M34 (1, 0, 0, 0)
M19 (1, 0, 0, 0)
M16 (1, 0, 0, 0)
M18 (1, 0, 0, 0)
M17 (1, 0, 0, 0)
M5 (1, 0, 0, 0)
M44 (1, 0, 0, 0)
M68 (0, 0, 0, 0)


In [54]:
# M52 doesn't have a boc. Check number of peaks for M52 to determine whether this is a random collision
peaks.monomer.value_counts()

M18    440
M34    380
M5     189
M19    158
M17    117
M44     70
M68      3
M52      2
M16      1
Name: monomer, dtype: int64

The -56.06 difference to monomer occurs from loss of the tert-butyl group (part of the Boc group).
The expected mass difference (-C4H8) is 56.0636.

In [55]:
delta_tert_butyl = (-56.0636 - 0.002, -56.0636 + 0.002)

In [56]:
# does the collision with M52 still occur in a more precise mass range?
df.loc[df["delta_M"].between(*delta_tert_butyl)].monomer.value_counts()

M18    440
M34    380
M5     189
M19    158
M17    117
M44     70
Name: monomer, dtype: int64

No, no more collision with M52 or M16. For the rest, the mass difference is safe to apply.
However, one question is left: Does this degradation occur in the MS, or do we actually have the carbamic acid species (usually not stable)?

To answer this, we need to manually inspect MS traces.


In [57]:
def get_lab_journal_number_well(experiment_id):
    return con.con.execute("SELECT lab_journal_number, well FROM experiments WHERE id = ?", (experiment_id,)).fetchone()

In [58]:
# let's sample a few of the alrger peaks to inspect manually peaks
peaks_to_inspect = df.loc[df["delta_M"].between(*delta_tert_butyl) & (df["%area"] > 20), ["experiment_id", "peak_nr", "monomer"]].sample(3, random_state=1)
peaks_to_inspect["identifier"] = peaks_to_inspect["experiment_id"].apply(lambda x: get_lab_journal_number_well(x))
peaks_to_inspect

Unnamed: 0,experiment_id,peak_nr,monomer,identifier
204313,85208,48,M18,"(JG248, A16)"
81016,51068,17,M34,"(JG366, F16)"
22101,22583,17,M19,"(JG280, F11)"


Outcome of manual inspection:
- JG248/A16: Peak 48 contains both monomer and the carbamic acid derivative (ca. 3:5 int.), but not the fully deprotected amine. There is no other monomer peak.
- JG366/F16: Same picture here for peak 17. Contains a miniscule amount of fully deprotected amine. No other monomer peak
- JG280/F11: Exact same picture again. Only other peaks are shoulders of peak 17.

_In conclusion_, our assumption that the conversion to carbamic acid occurs in the LCMS and does not indicate formation of a separate stable species is confirmed by the data.

In [59]:
# find and count occurences
peaks_m_carbamic_acid = df.loc[df["delta_M"].between(*delta_tert_butyl), "id"].to_numpy()
print(len(peaks_m_carbamic_acid))

1354


In [60]:
# commit to DB
assignment = [f"M (Boc->carbamic acid fragmentation in MS)" for _ in range(len(peaks_m_carbamic_acid))]
with con.con:
    con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", tuple(zip(peaks_m_carbamic_acid.tolist(), assignment)))

#### Terminator +C<sub>6</sub>H<sub>6</sub>O<sub>2</sub> (+110.04)

In [351]:
peaks = df.loc[df.delta_T.round(2) == 110.04]
peaks

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
43,518222,10584,99,242,373640,145574,55.6,236.0739,0.038,5.15718,...,17.935212,-209.060821,-255.066301,-237.055736,-9.939978,-12.977570,-148.046945,-120.052030,-54.04703,
48,521089,10585,153,288,390494,150981,54.4,286.0899,0.044,5.36774,...,67.951212,-209.060471,-255.065951,-237.055386,-9.939628,-62.992871,-98.030945,-70.036030,-54.04668,
132,527359,10605,32,288,448336,151736,56.3,286.0900,0.045,6.02827,...,67.951312,-189.114994,-235.120473,-217.109908,10.005850,-62.992771,-78.085468,-50.090553,-54.04658,
310,521315,10645,24,288,414054,138359,54.2,286.0900,0.046,5.80533,...,67.951312,-211.193244,-257.198723,-239.188159,-12.072401,-62.992771,-100.163718,-72.168803,-54.04658,
401,517036,10662,21,261,1362267,448566,161.1,280.1004,0.040,18.51341,...,61.961712,-271.139035,-317.144515,-299.133950,-72.018192,-57.003500,-166.098944,-138.104030,-54.046745,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205556,394890,85500,18,259,1396384,530515,211.1,250.0894,0.042,22.24137,...,66.012567,-107.053743,-153.059222,-135.048657,58.005250,-26.993371,-32.024216,-4.029302,-19.985329,
205558,388986,85501,21,260,1591041,620143,238.9,250.0895,0.040,23.38512,...,66.012667,-107.053643,-153.059122,-135.048557,58.005350,-26.993271,-32.024116,-4.029202,-19.985229,
205603,395448,85510,20,259,479836,182845,64.6,250.0894,0.041,8.42067,...,-23.091487,-258.173443,-304.178922,-286.168358,-4.010400,-26.993371,-183.143917,-155.149002,-109.089379,
205606,400692,85511,17,260,377169,179951,68.6,250.0894,0.030,5.56862,...,-23.091487,-258.173443,-304.178922,-286.168358,-4.010400,-26.993371,-183.143917,-155.149002,-109.089379,


In [352]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

I: 69
M: 72
T: 13


In [353]:
# let's check which terminators are involved
for short in peaks.terminator.unique():
    print(short, con.get_long_name(short))

T1 TerABT001
T8 TerABT010
T17 TerABT022
T18 TerABT023
T13 TerABT015
T7 TerABT009
T3 TerABT005
T4 TerABT006
T12 TerABT014
T2 TerABT004
T15 TerABT017
T10 TerABT012
T6 TerABT008


In [356]:
# obviously all are ABTs. Let's see where we have frequent occurences
peaks.terminator.value_counts()

T17    490
T3     344
T8     303
T13    285
T18    271
T7     213
T4     208
T1     198
T2      29
T12     20
T15      2
T10      2
T6       2
Name: terminator, dtype: int64

In [357]:
con.get_smiles("T17")

'CCOc1ccc(N)c(S)c1'

In [358]:
con.get_smiles("T8")

'Nc1c(S)ccc2ccccc12'

The mass difference is only observed for ABT terminators.
It corresponds to +C<sub>6</sub>H<sub>6</sub>O<sub>2</sub> (+110.0368).
I don't find a good structure to explain this mass difference.
We will leave the peaks unassigned for now.

#### Terminator delta only with methyl-ABT (+215.08)

In [359]:
peaks = df.loc[df.delta_T.round(2) == 215.08]
peaks

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
55,519463,10586,181,382,5384517,2173071,938.2,355.1304,0.041,74.13447,...,136.991712,-104.019971,-150.025451,-132.014886,95.100872,78.047629,-28.990445,-0.995530,50.99382,
87,528763,10596,43,382,4087039,1700510,1036.0,355.1304,0.040,51.41997,...,109.143135,-131.868552,-177.874031,-159.863467,95.100872,78.047629,-56.839026,-28.844111,23.14524,
135,528874,10606,56,382,4872804,2095422,721.3,355.1305,0.040,71.86874,...,136.991812,-84.074494,-130.079973,-112.069408,115.046350,78.047729,-9.044968,18.949947,50.99392,
164,529114,10616,41,382,4074432,1868198,728.8,355.1306,0.039,58.47598,...,109.143335,-111.922974,-157.928454,-139.917889,115.046450,78.047829,-36.893448,-8.898533,23.14544,
261,533929,10636,36,382,3199862,1421639,658.6,355.1307,0.038,43.13149,...,109.143435,-154.965073,-200.970553,-182.959988,72.004350,78.047929,-79.935547,-51.940633,23.14554,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205466,396840,85481,46,378,1834166,888599,379.5,355.1292,0.035,26.58193,...,171.052367,-16.029593,-62.035072,-44.024507,149.029400,78.046429,58.999934,86.994848,85.054471,
205522,396361,85490,47,374,433704,222876,77.4,355.1295,0.038,6.24589,...,81.948613,-91.117693,-137.123172,-119.112608,163.045350,78.046729,-16.088167,11.906748,-4.049279,
205604,395473,85510,45,374,4812195,1792343,621.9,355.1291,0.044,84.44941,...,81.948213,-153.133743,-199.139222,-181.128658,101.029300,78.046329,-78.104217,-50.109302,-4.049679,
205609,400719,85511,44,378,1390123,1483025,332.2,355.1292,0.027,20.52410,...,81.948313,-153.133643,-199.139122,-181.128558,101.029400,78.046429,-78.104117,-50.109202,-4.049579,


In [368]:
peaks.mz_max.median()

355.1296

In [360]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

I: 68
M: 73
T: 5


In [361]:
# let's check which terminators are involved
for short in peaks.terminator.unique():
    print(short, con.get_long_name(short))

T18 TerABT023
T7 TerABT009
T3 TerABT005
T24 TerTH006
T1 TerABT001


In [363]:
# Let's see where we have frequent occurrences
peaks.terminator.value_counts()

T18    1041
T3      798
T7      282
T24      27
T1        1
Name: terminator, dtype: int64

In [364]:
con.show_image("T18")

In [365]:
con.show_image("T3")

In [366]:
con.show_image("T7")

T18, T3, T7 are all isomers of each other (Me-substituted ABTs).
Since they all have the same mass, the fact that the same delta_T comes up for all of them does not tell us anything about whether the terminator is part of the unknown species.
We don't investigate this one further.

#### Terminator +77.04 / product E +78.05 (C6H6)
Initially, we only considered T+77.04 here, but analysis of the spectra showed that this is actually a frequent fragment arising from the product E + C6H6 species.

In [369]:
peaks = df.loc[df.delta_T.round(2) == 77.04]
peaks

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
30742,1150175,26661,48,354,2312804,1387892,239.5,203.0758,0.036,16.347060,...,-30.073787,-257.069820,-303.075300,-285.064735,-42.938078,-45.97567,-196.055944,-168.061030,-102.056029,
30901,1149470,26701,40,354,3054552,1306252,231.4,203.0757,0.041,21.476330,...,-30.073887,-280.166742,-326.172221,-308.161657,-66.035000,-45.97577,-219.152866,-191.157951,-102.056129,
31014,1151191,26721,50,354,2526741,1079650,225.2,203.0757,0.039,19.073080,...,-30.073887,-259.202793,-305.208272,-287.197708,-45.071051,-45.97577,-198.188917,-170.194002,-102.056129,
31129,1150029,26741,58,354,694028,337756,68.9,203.0757,0.037,5.357470,...,-30.073887,-319.148420,-365.153899,-347.143334,-105.016677,-45.97577,-258.134543,-230.139629,-102.056129,
31186,1157254,26751,46,354,722334,389763,62.2,203.0757,0.035,5.535304,...,55.033025,-219.023449,-265.028928,-247.018363,-89.998614,-45.97577,-158.009573,-130.014658,-16.949221,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174293,1126463,76811,52,354,3049385,1163253,279.2,203.0759,0.043,20.396290,...,71.010377,-153.030442,-199.035922,-181.025357,-39.982764,-45.97557,-92.016566,-64.021652,-0.971865,
174326,1135174,76821,50,354,1896936,1110388,142.6,203.0759,0.037,12.833970,...,-40.036907,-264.077727,-310.083206,-292.072642,-39.982764,-45.97557,-203.063851,-175.068936,-112.019149,
174373,1133824,76831,47,354,1608932,976262,114.1,203.0759,0.036,11.288160,...,71.010377,-164.071579,-210.077058,-192.066494,-51.023900,-45.97557,-103.057703,-75.062788,-0.971865,
174465,1127497,76851,62,354,1740954,674977,141.8,203.0758,0.042,11.302403,...,71.010277,-245.056757,-291.062236,-273.051672,-132.009079,-45.97567,-184.042881,-156.047966,-0.971965,


In [382]:
get_lab_journal_number_well(41780)

('JG337', 'E8')

In [380]:
con.get_product_smiles(26661)

('CC(C)(C)OC(=O)NCC[C@H](Cc1nc2ccccc2s1)NC(=O)c1ccc(Cl)cc1',
 'CC(C)(C)OC(=O)NCC[C@H](CC1(C(=O)O)Nc2ccccc2S1)NC(=O)c1ccc(Cl)cc1',
 'CC(C)(C)OC(=O)NCC[C@@H]1CC2(C(=O)[O-])Sc3ccccc3[N+]2=C(c2ccc(Cl)cc2)N1',
 'Clc1ccc(-c2nc3ccccc3s2)cc1',
 'Nc1ccccc1SSc1ccccc1N',
 'CC(C)(C)OC(=O)NCC[C@H](CC(=O)C(=O)O)NC(=O)c1ccc(Cl)cc1',
 'CC(C)(C)OC(=O)NCC[C@H](CC(=O)O)NC(=O)c1ccc(Cl)cc1',
 'CC(C)(C)OC(=O)NCCC=Cc1nc2ccccc2s1')

In [378]:
peaks.delta_T.describe()

count    1031.000000
mean       77.038461
std         0.000167
min        77.037950
25%        77.038350
50%        77.038450
75%        77.038600
max        77.039000
Name: delta_T, dtype: float64

In [371]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

I: 53
M: 60
T: 5


In [372]:
# let's check which terminators are involved
for short in peaks.terminator.unique():
    print(short, con.get_long_name(short))

T1 TerABT001
T7 TerABT009
T10 TerABT012
T3 TerABT005
T2 TerABT004


In [373]:
# Let's see where we have frequent occurrences
peaks.terminator.value_counts()

T7     598
T1     426
T3       3
T10      2
T2       2
Name: terminator, dtype: int64

In [376]:
con.get_smiles("T1")

'Nc1ccccc1S'

In [375]:
con.show_image("T7")

Again, with only two terminators forming this, we have very little evidence to determine the structure of the unknown species.
Looking into the spectrum reveals that the actual base peak has a different m/z (327.0984 for T1 and 355.1296 for T7, difference between the two is C2H4). In both cases, a difference +78.0468 (C6H6) is observed to terminator dimer E.
It should be noted that we also observe delta_E +78.05 frequently for other terminators.
Let's check those!

In [383]:
peaks = df.loc[df.delta_E.round(2) == 78.05]
peaks

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
44,518314,10584,191,354,4999376,1875227,715.1,327.0988,0.043,69.00407,...,108.960112,-118.035921,-164.041401,-146.030836,81.084922,78.047330,-57.022045,-29.027130,36.97787,
55,519463,10586,181,382,5384517,2173071,938.2,355.1304,0.041,74.13447,...,136.991712,-104.019971,-150.025451,-132.014886,95.100872,78.047629,-28.990445,-0.995530,50.99382,
81,533032,10594,35,354,3276210,1215109,478.5,327.0989,0.041,48.48351,...,81.111635,-145.884402,-191.889881,-173.879316,81.085022,78.047430,-84.870526,-56.875611,9.12939,
87,528763,10596,43,382,4087039,1700510,1036.0,355.1304,0.040,51.41997,...,109.143135,-131.868552,-177.874031,-159.863467,95.100872,78.047629,-56.839026,-28.844111,23.14524,
129,519939,10604,54,354,4929430,1608085,615.0,327.0989,0.050,68.22611,...,108.960212,-98.090444,-144.095923,-126.085358,101.030400,78.047430,-37.076568,-9.081653,36.97797,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205466,396840,85481,46,378,1834166,888599,379.5,355.1292,0.035,26.58193,...,171.052367,-16.029593,-62.035072,-44.024507,149.029400,78.046429,58.999934,86.994848,85.054471,
205522,396361,85490,47,374,433704,222876,77.4,355.1295,0.038,6.24589,...,81.948613,-91.117693,-137.123172,-119.112608,163.045350,78.046729,-16.088167,11.906748,-4.049279,
205604,395473,85510,45,374,4812195,1792343,621.9,355.1291,0.044,84.44941,...,81.948213,-153.133743,-199.139222,-181.128658,101.029300,78.046329,-78.104217,-50.109302,-4.049679,
205609,400719,85511,44,378,1390123,1483025,332.2,355.1292,0.027,20.52410,...,81.948313,-153.133643,-199.139122,-181.128558,101.029400,78.046429,-78.104117,-50.109202,-4.049579,


In [384]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

I: 69
M: 73
T: 10


In [385]:
# let's check which terminators are involved
for short in peaks.terminator.unique():
    print(short, con.get_long_name(short))

T1 TerABT001
T18 TerABT023
T7 TerABT009
T3 TerABT005
T13 TerABT015
T2 TerABT004
T12 TerABT014
T10 TerABT012
T16 TerABT018
T41 TerTH028


In [386]:
# Let's see where we have frequent occurrences
peaks.terminator.value_counts()

T18    1041
T3      798
T1      770
T2      435
T7      282
T12      25
T16      17
T13       9
T10       8
T41       1
Name: terminator, dtype: int64

So this occurs mainly with T1-3, T7, T18, all small ABTs (plain, Me- and F- substituted)

In [390]:
con.get_smiles("T18")

'Cc1cccc(S)c1N'

The mass difference of +78.048 clearly corresponds to +C6H6.
A possible explanation is pi-stacking with benzene, but it's not evident where the benzene could come from.
In the end, we don't have any good hypothesis for this one.

#### Terminator +73.01

In [407]:
peaks = df.loc[df.delta_T.round(2) == 73.01]
peaks

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
76,526397,10592,35,330,670776,280693,125.3,243.0713,0.038,8.54829,...,-2.915965,-273.938217,-319.943696,-301.933131,-46.968793,-94.032600,-168.898126,-140.903211,-118.924425,
161,531418,10615,31,353,363557,142053,67.7,249.0609,0.040,5.02121,...,3.073635,-253.992674,-299.998154,-281.987589,-27.023250,-100.021871,-142.963148,-114.968233,-118.92426,
163,529104,10616,31,337,396970,167275,70.7,213.0606,0.037,5.69730,...,-32.926665,-253.992974,-299.998454,-281.987889,-27.023550,-64.022171,-178.963448,-150.968533,-118.92456,
191,527667,10622,45,330,728395,290757,125.2,243.0714,0.039,10.61125,...,24.932712,-269.186358,-315.191837,-297.181272,-70.065514,-94.032500,-164.146267,-136.151352,-91.075745,
252,530503,10632,44,330,486108,220762,68.3,243.0714,0.036,6.78165,...,-2.915865,-297.034938,-343.040417,-325.029853,-70.065514,-94.032500,-191.994847,-163.999933,-118.924325,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205373,389234,85463,32,314,725622,298162,139.6,229.0554,0.041,10.63225,...,44.978567,-158.098307,-204.103786,-186.093222,6.960685,-80.017200,-67.073866,-39.078952,-57.014243,
205459,386434,85480,31,333,491301,177351,63.7,213.0606,0.048,8.51142,...,28.983767,-158.098193,-204.103672,-186.093107,6.960800,-64.022171,-83.068666,-55.073752,-57.014129,
205465,396825,85481,31,335,403853,163037,72.2,213.0605,0.040,5.85290,...,28.983667,-158.098293,-204.103772,-186.093207,6.960700,-64.022271,-83.068766,-55.073852,-57.014229,
205562,388902,85503,31,314,639087,193320,63.1,229.0554,0.052,8.64167,...,44.978567,-206.098307,-252.103786,-234.093222,-41.039315,-80.017200,-115.073866,-87.078952,-57.014243,


In [408]:
peaks.delta_T.describe()

count    1017.000000
mean       73.007466
std         0.000385
min        73.006885
25%        73.007185
50%        73.007385
75%        73.007685
max        73.008685
Name: delta_T, dtype: float64

In [409]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

I: 67
M: 67
T: 9


In [410]:
# let's check which terminators are involved
for short in peaks.terminator.unique():
    print(short, con.get_long_name(short))

T17 TerABT022
T8 TerABT010
T18 TerABT023
T13 TerABT015
T7 TerABT009
T2 TerABT004
T4 TerABT006
T3 TerABT005
T14 TerABT016


In [411]:
# Let's see where we have frequent occurrences
peaks.terminator.value_counts()

T17    478
T13    313
T4     151
T18     18
T3      16
T8      14
T7      13
T2      10
T14      4
Name: terminator, dtype: int64

In [402]:
con.get_smiles("T4")

'Cc1cc(N)c(S)cc1C'

In [401]:
con.show_image("T3")

These are all particularly electron-rich ABTs, raising electrophilic aromatic substitution as a possible mechanism.
One way to explain the difference would be +C3H3NCl -O, but T4 does not have an oxygen atom.
We don't find an acceptable explanation for this mass difference.

#### Product F −56.06 / Product G -28.07
(these are the same, as the difference between F and G is constant)

In [43]:
peaks = df.loc[df.delta_F.round(2) == -56.06]
peaks

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
8896,631692,14610,19,244,1171680,400098,133.9,338.0628,0.040,18.08988,...,190.020125,-174.090510,-220.095989,-202.085424,-45.065675,38.977921,-56.055170,-28.060255,61.016415,
9713,647315,14752,25,267,361222,90143,33.9,385.1209,0.057,5.34745,...,111.940013,-150.043469,-196.048949,-178.038384,104.119573,134.144309,-56.062273,-28.067358,6.990451,
9718,646148,14753,146,259,1409229,340744,116.9,385.1208,0.060,21.45709,...,111.939913,-131.091899,-177.097379,-159.086814,123.071143,108.038029,-56.062373,-28.067458,25.942021,
9719,646150,14753,148,259,730870,157756,2160.9,385.1208,0.064,11.12831,...,111.939913,-131.091899,-177.097379,-159.086814,123.071143,108.038029,-56.062373,-28.067458,25.942021,
9725,640793,14754,20,267,709969,146616,1816.3,385.1208,0.063,10.10419,...,111.939913,-131.091899,-177.097379,-159.086814,123.071143,108.038029,-56.062373,-28.067458,25.942021,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205613,396084,85512,22,256,478256,101867,1342.3,377.1706,0.071,8.18602,...,103.989713,-135.062006,-181.067485,-163.056920,119.101037,156.157824,-56.062717,-28.067802,14.022058,
205614,396085,85512,23,256,455263,105184,43.3,377.1706,0.068,7.79246,...,103.989713,-135.062006,-181.067485,-163.056920,119.101037,156.157824,-56.062717,-28.067802,14.022058,
205620,400056,85513,19,262,1437655,350660,133.7,377.1705,0.055,21.57598,...,103.989613,-147.087258,-193.092737,-175.082172,107.075785,68.097900,-56.062817,-28.067902,1.996806,
205621,400057,85513,20,262,1388678,318975,3814.2,377.1705,0.058,20.84094,...,103.989613,-147.087258,-193.092737,-175.082172,107.075785,68.097900,-56.062817,-28.067902,1.996806,


In [44]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

I: 42
M: 10
T: 38


In [45]:
# let's check which monomers are involved
for short in peaks.monomer.unique():
    print(short, con.get_long_name(short))

M73 Spiro016
M34 Mon076
M19 Mon016
M33 Mon075
M18 Mon015
M31 Mon072
M17 Mon014
M20 Mon017
M44 Mon090
M5 Mon096


In [46]:
# Let's see where we have frequent occurrences
peaks.monomer.value_counts()

M18    464
M34    401
M17    272
M44    180
M20     42
M33     21
M5       8
M19      6
M73      1
M31      1
Name: monomer, dtype: int64

In [47]:
con.show_image("M20")

All of these monomers have a Boc or tert-butyl group. We already know this mass difference from the Boc -> carbamic acid fragmentation seen for monomers. For the tert-butyl esters the same fragmentation can occur.

In [48]:
# find and count occurences
peaks_f_carbamic_acid = df.loc[df["delta_F"].between(*delta_tert_butyl), "id"].to_numpy()
print(len(peaks_f_carbamic_acid))

1396


In [49]:
# commit to DB
assignment = [f"F (-tert-butyl fragmentation in MS for Boc or tert-butyl ester)" for _ in range(len(peaks_f_carbamic_acid))]
with con.con:
    con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", tuple(zip(peaks_f_carbamic_acid.tolist(), assignment)))

Curiously, we don't usually get the fragmentation peak for all other products, only F and unreacted M.

#### Product F −71.985 / Product G -43.990
(these are the same)
I found this one during analysing the fate of M58.
It corresponds to loss of CO2 from G, which is a plausible decarboxylation for beta-2-monomers.

In [65]:
peaks = df.loc[df.delta_G.between(-43.991, -43.989)]
peaks

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
60636,1878192,41480,13,131,510094,107270,10.7,250.1436,0.071,9.187080,...,104.026045,-147.014444,-193.019923,-175.009358,-19.914730,-26.939171,-71.984917,-43.990003,18.028149,
120982,408992,63352,27,277,1199084,399078,129.7,224.0850,0.045,13.826250,...,62.026674,-150.984246,-196.989725,-178.979160,-7.943761,3.072224,-71.984957,-43.990042,-27.940984,T cyclohexanone condensate
133651,2940090,66978,12,174,1286813,434814,105.0,251.1177,0.052,5.047331,...,84.036193,-204.994790,-251.000270,-232.989705,-56.931128,-77.916206,-71.984937,-43.990022,-59.942027,
133654,2940861,66979,9,165,1269506,186868,44.1,251.1177,0.087,5.317367,...,84.036193,-160.009712,-206.015191,-188.004627,-11.946049,12.053951,-71.984937,-43.990022,-14.956948,
133655,2940863,66979,11,175,1233923,400155,96.7,251.1176,0.053,5.168329,...,84.036093,-160.009812,-206.015291,-188.004727,-11.946149,12.053851,-71.985037,-43.990122,-14.957048,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202940,352907,84879,11,167,5912382,2178955,563.8,255.1488,0.040,83.596760,...,88.067293,-165.966333,-211.971813,-193.961248,-17.902670,4.172209,-71.985137,-43.990222,-16.882269,
202945,368213,84880,9,151,2850181,271802,93.6,255.1491,0.115,38.430050,...,88.067593,-147.014363,-193.019843,-175.009278,1.049300,-21.933671,-71.984837,-43.989922,2.069701,
202946,368214,84880,10,169,2777781,882640,306.5,255.1488,0.041,37.453860,...,88.067293,-147.014663,-193.020143,-175.009578,1.049000,-21.933971,-71.985137,-43.990222,2.069401,
202951,365555,84881,9,153,2309518,262576,83.5,255.1490,0.157,31.611970,...,88.067493,-147.014463,-193.019943,-175.009378,1.049200,-21.933771,-71.984937,-43.990022,2.069601,


In [66]:
# check for how many different building blocks this occurs
print("I:", len(peaks.initiator.unique()))
print("M:", len(peaks.monomer.unique()))
print("T:", len(peaks.terminator.unique()))

I: 47
M: 5
T: 40


In [67]:
# let's check which monomers are involved
for short in peaks.monomer.unique():
    print(short, con.get_long_name(short))

M11 Mon003
M68 Spiro011
M58 Mon104
M73 Spiro016
M57 Mon103


In [68]:
# Let's see where we have frequent occurrences
peaks.monomer.value_counts()

M58    407
M57    386
M11      1
M68      1
M73      1
Name: monomer, dtype: int64

The ones with M11, M68, M73 are likely random m/z collisions. We only consider the ones with M57 and M58.

In [69]:
# find and count occurences
peaks_g_decarbox = peaks.loc[peaks["monomer"].isin(["M57", "M58"]), "id"].to_numpy()
print(len(peaks_g_decarbox))

793


In [70]:
# commit to DB
assignment = [f"G_decarboxylated" for _ in range(len(peaks_g_decarbox))]
with con.con:
    con.con.executemany("INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, ?);", tuple(zip(peaks_g_decarbox.tolist(), assignment)))

## Summary

In [22]:
df.loc[df["experiment_id"] == 20589]

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
16707,841547,20589,5,48,16453408,570256,185.5,179.0164,0.475,100.0,...,-66.970865,-461.084853,-507.090332,-489.079767,-234.115429,-158.0875,-356.044762,-328.049847,-182.979325,
16708,841571,20589,29,294,1308911,539379,173.6,325.0833,0.044,7.95526,...,79.096035,-315.017953,-361.023432,-343.012867,-88.048529,-12.0206,-209.977862,-181.982947,-36.912425,


In [6]:
# let's get a new version from DB and see what unassigned peaks we have left!
reaction_ids = con.get_reaction_ids_for_building_block(filter_exp_nr=(4, 29))
df = pd.concat([con.get_lcms_peaks(i, with_delta=True, with_assignment=True, with_building_blocks=True) for i in reaction_ids]).reset_index(drop=True)
# filter a bit: only unidentified peaks, and only those with a retention time > 4 min (240 s), and only with an m/z over 200
df_unassigned = df.loc[df["assignment"].isna() & (df["retention_time_s"] > 240) & (df["mz_max"] > 200)]
df_unassigned

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
4,524156,10579,115,245,371927,72549,35.9,274.2743,0.088,5.57669,...,56.135612,-187.861818,-233.867298,-215.856733,11.259025,55.229555,-109.846545,-81.851630,-32.848027,
7,516525,10580,162,362,917843,223037,3555.3,588.0722,0.060,13.73421,...,369.933512,48.004524,1.999045,20.009610,247.125368,213.164340,203.951355,231.946270,203.018316,
8,516528,10580,165,362,520875,203617,89.5,588.0721,0.051,7.79416,...,369.933412,48.004424,1.998945,20.009510,247.125268,213.164240,203.951255,231.946170,203.018216,
10,532645,10581,110,274,362967,89435,31.9,405.1966,0.068,5.07251,...,187.057912,-109.991220,-155.996699,-137.986134,89.129624,80.048452,21.075755,49.070670,45.022572,
20,521701,10582,131,242,145022,57629,26.5,274.2743,0.048,11.76016,...,56.135612,-214.886636,-260.892115,-242.881551,-15.765793,-62.829600,-109.846545,-81.851630,-59.872845,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205656,401424,85521,28,267,444307,136591,59.2,475.1570,0.043,25.84048,...,291.080167,55.998207,9.992728,28.003293,221.057200,198.074229,131.027734,159.022648,205.082271,
205657,401428,85521,32,274,140629,47175,20.2,473.1438,0.041,8.17887,...,289.066967,53.985007,7.979528,25.990093,219.044000,196.061029,129.014534,157.009448,203.069071,
205658,401431,85521,35,293,124932,41486,17.4,691.2559,0.043,7.26593,...,507.179067,272.097107,226.091628,244.102193,437.156100,414.173129,347.126634,375.121548,421.181171,
205659,401434,85521,38,305,131011,48852,20.3,711.2244,0.043,7.61949,...,527.147567,292.065607,246.060128,264.070693,457.124600,434.141629,367.095134,395.090048,441.149671,


In [23]:
con.con.execute('SELECT COUNT(*) FROM lcms_peaks WHERE "%area" = 100').fetchone()[0]


55693

In [21]:
con.con.execute('SELECT COUNT(*) FROM lcms_peaks JOIN lcms_peaks_assignment ON lcms_peaks.id = lcms_peaks_assignment.peak_id WHERE assignment IS NOT NULL AND "%area" = 100;').fetchone()[0]


51499

In [14]:
# number of peaks in the "interesting" region
len(df.loc[(df["retention_time_s"] > 240) & (df["mz_max"] > 200)])

85613

In [8]:
# number of unassigned peaks in the "interesting" region
len(df_unassigned)

57363

In [15]:
# total number of peaks >5% area
con.con.execute('SELECT COUNT(*) FROM lcms_peaks WHERE "%area" > 5;').fetchone()[0]

502626

In [16]:
# total number of assigned peaks >5% area
con.con.execute('SELECT COUNT(*) FROM lcms_peaks JOIN lcms_peaks_assignment ON lcms_peaks.id = lcms_peaks_assignment.peak_id WHERE assignment IS NOT NULL AND "%area" > 5;').fetchone()[0]

383716