# Extract MoBiAS PDF reports

The PDF reports are a great source of additional information as they contain data for peaks that may not have been assigned to any known product.
Mining these reports is much faster than reprocessing of the raw LCMS data.

Here, we are in particular interested to identify
1. leftover starting materials
2. systematic side products that we have not been previously looking for


In [4]:
import sys
import pathlib
import re
from datetime import datetime

sys.path.insert(0, str(pathlib.Path().resolve().parents[1]))

from pypdf import PdfReader
import pandas as pd
import numpy as np

from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR, PLATE_LIST_PATH
from src.util.rdkit_util import smiles_to_lcms_mass

In [5]:
con = SynFermDatabaseConnection()

## Extract PDF data

In [3]:
def import_lcms_full_report(path):
    # set up pdf reader
    reader = PdfReader(path)
    number_of_pages = len(reader.pages)
    lines = []
    found_data = False
    # iterate from second page until entire peak summary table is read completely
    for i in range(1, number_of_pages):
        page = reader.pages[i]
        text = page.extract_text()
        line_list = text.splitlines()
        if (line_list[0] != '# RT [min] Area I S/N Max. m/z FWHM [min] Area % Int. %') and found_data:  # stop when not encountering another data header
            break
        else:
            found_data = True
        lines += line_list
    
    if len(lines) == 0:
        raise RuntimeError("No data extracted")
        
    # remove spaces in header
    cleaned_lines = ['# RT[min] Area I S/N max_m/z FWHM[min] Area% Int%',]
    # remove footers and data headers on not-first page
    cleaned_lines += [line for line in lines if line[0].isnumeric()] 

    # split the lines into individual fields
    data = [line.split() for line in cleaned_lines]
    
    # assemble DataFrame from data
    df = pd.DataFrame(data[1:], columns=data[0]).astype("float")

    return df

In [None]:
%%capture output
# import the plate list to obtain LCMS identifier - plate_nr relation
plate_list = pd.read_csv(PLATE_LIST_PATH)
# we will record any files that cause exceptions for manual inspection
files_with_exceptions = []

# iterate over all experiments/plates
for exp_nr in range(1, 30):
    for plate_nr in range(1, 7):
        # print progress indicator
        now = datetime.now()
        print(f"exp {exp_nr}-{plate_nr}, started {now.strftime('%H:%M:%S')}")
 
        
        lcms_id = plate_list.loc[(plate_list["exp_nr"] == exp_nr) & (plate_list["plate_nr"] == plate_nr), "results_file_name"].item().split("_")[0]
        exp_path = DATA_DIR / "pdf_reports" / lcms_id
        full_report_paths = list(exp_path.glob("*_LCMS_Fullreport.pdf"))
        
        for path in full_report_paths:
            try:
                # get well from filename
                regex = r'_P\d{1}-[A-Z]-\d{1,2}_'
                match = re.search(regex, path.name)
                well = "".join(match.group().strip("_").split("-")[1:])
                reaction_id = con.get_reaction_id((exp_nr, plate_nr, well))
                df = import_lcms_full_report(path)
                # we persist this to the database for re-use
                # first reformat df to fit DB
                df.insert(0, "reaction_id", reaction_id)
                df.insert(2, "retention_time_s", (df["RT[min]"] * 60).astype("int"))
                df = df.drop(columns="RT[min]")
                df = df.rename(columns={"#": "peak_number", 
                           "Area": "area", 
                           "I": "intensity",
                           "S/N": "signal_to_noise",
                           "max_m/z": "mz_max",
                           "FWHM[min]": "fwhm_min", 
                           "Area%": "%area",
                           "Int%": "%intensity"
                          }).astype({"peak_number": "int",
                                     "area": "int",
                                     "intensity": "int"})
                # write all extracted peaks to DB
                with con.con:
                    con.con.executemany(
                        'INSERT INTO lcms_peaks (experiment_id, peak_nr, retention_time_s, area, intensity, signal_to_noise, mz_max, fwhm_min, "%area", "%intensity") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);', 
                        [tuple(row) for row in df.to_numpy()]
                    )
            except Exception as e:
                print(f"Something went wrong for {str(path)}")
                files_with_exceptions.append(path)
                print(e)

## Peak assignments - common contaminants

In [24]:
# define some expected contaminants
mz_dmso = [79.0212, 101.0032, 157.0351]  # M+H+, M+Na+ 2M+H+
mz_lock_tmp = 142.1590  # tetramethylpiperidine
mz_lock_hmp = 322.0481  # hexamethoxyphosphazene
mz_is = 361.1201  # fenofibrate

In [22]:
# write DMSO assignments to DB
# the median retention time for DMSO is 48 seconds, but it's a broad peak, so we just use <120s
for mz in mz_dmso:
    with con.con:
        con.con.execute("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'DMSO' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ? AND retention_time_s < 120", (mz - 0.02, mz + 0.02))


In [27]:
# write lock mass assignments to DB
# since lock molecules are part of the solvent, we do not put a retention time constraint
with con.con:
    con.con.execute("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'lock mass tetramethylpiperidine' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ?", (mz_lock_tmp - 0.02, mz_lock_tmp + 0.02))
with con.con:
    con.con.execute("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'lock mass hexamethoxyphosphazene' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ?", (mz_lock_hmp - 0.02, mz_lock_hmp + 0.02))

In [28]:
# write IS assignments to DB
# the median retention time for fenofibrate was 368 s. We allow a 20 s window around this (the latest picked peak is at 388 so the upper bound is not necessary)
with con.con:
    con.con.execute("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, 'IS fenofibrate' FROM lcms_peaks WHERE mz_max BETWEEN ? AND ? AND retention_time_s > 348", (mz_is - 0.02, mz_is + 0.02))

## Peak assignments - Known reactants/products

We read extracted data from the DB, compare it with the expected masses and save the results back to the db

In [20]:
def calculate_lcms_peak_differences(identifier):

    # import relevant reactants and products
    # import starting material mzs
    sms = con.get_starting_materials_for_reaction(identifier)
    mz_i, mz_m, mz_t = [smiles_to_lcms_mass(smi) for smi in sms]
    mz_iacid = mz_i - 52.0096  # mass difference of degradation KAT-H to carboxylic acid
    mz_baa = mz_m - 124.05243  # mass difference of degradation to beta-amino acid
    mzs = {"I": mz_i, "M": mz_m, "T": mz_t, "I_acid": mz_iacid, "M_bAA": mz_baa}

    # import product mzs
    known_product_smiles = con.get_product_smiles(identifier)
    known_product_mzs = {s: smiles_to_lcms_mass(smi) for s, smi in zip("ABCDEFGH", known_product_smiles) if smi}  # "if smi" bc we sometimes have None for product H
    mzs.update(known_product_mzs)

    # assign known peaks
    for s, mz in mzs.items():
        with con.con:
            con.con.execute("INSERT INTO lcms_peaks_assignment (peak_id, assignment) SELECT id, ? AS assignment FROM lcms_peaks WHERE experiment_id=? AND mz_max BETWEEN ? AND ?", (s, identifier, mz - 0.02, mz + 0.02))

    # some peaks are still unexplained
    # for these, we calculate the difference to known mzs to see if a pattern emerges
    lcms_peaks = con.get_lcms_peaks(identifier, with_assignment=True)
    # ignore peaks that are already explained or that are insignificant
    unexplained_peaks = lcms_peaks.loc[lcms_peaks.assignment.isna() & (lcms_peaks["%area"] > 5)]
    unexplained_mzs = unexplained_peaks.mz_max.to_numpy()
    mzs_arr = np.array(list(mzs.values()))
    # calculate the differences with all the considered masses
    delta = pd.concat([unexplained_peaks[["experiment_id", "id"]].reset_index(drop=True), pd.DataFrame(unexplained_mzs[:, None] - mzs_arr, columns=list(mzs.keys()))], axis=1)

    # add "H" column if it does not exist
    if "H" not in delta.columns:
        delta["H"] = np.nan
    
    with con.con:
        # persist differences to DB
        con.con.executemany("INSERT INTO lcms_peaks_differences (experiment_id, peak_id, delta_I, delta_M, delta_T, delta_Iacid, delta_bAA, delta_A, delta_B, delta_C, delta_D, delta_E, delta_F, delta_G, delta_H) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);",
                           delta.to_numpy())
    return delta

In [21]:
# we will record any files that cause exceptions for manual inspection
records_with_exceptions = []
# iterate over all experiments/plates
for exp_nr in range(1, 30):
    for plate_nr in range(1, 7):
        # print progress indicator
        now = datetime.now()
        print(f"exp {exp_nr}-{plate_nr}, started {now.strftime('%H:%M:%S')}")

        identifiers = con.get_reaction_ids_for_plate((exp_nr, plate_nr))
        for i in identifiers:
            try:
                calculate_lcms_peak_differences(i)
            except Exception as e:
                records_with_exceptions.append(i)

exp 1-1, started 14:22:14
exp 1-2, started 14:22:17
exp 1-3, started 14:22:19
exp 1-4, started 14:22:21
exp 1-5, started 14:22:24
exp 1-6, started 14:22:26
exp 2-1, started 14:22:29
exp 2-2, started 14:22:31
exp 2-3, started 14:22:33
exp 2-4, started 14:22:36
exp 2-5, started 14:22:38
exp 2-6, started 14:22:41
exp 3-1, started 14:22:43
exp 3-2, started 14:22:45
exp 3-3, started 14:22:48
exp 3-4, started 14:22:51
exp 3-5, started 14:22:54
exp 3-6, started 14:22:56
exp 4-1, started 14:22:59
exp 4-2, started 14:23:01
exp 4-3, started 14:23:04
exp 4-4, started 14:23:07
exp 4-5, started 14:23:09
exp 4-6, started 14:23:12
exp 5-1, started 14:23:14
exp 5-2, started 14:23:17
exp 5-3, started 14:23:19
exp 5-4, started 14:23:22
exp 5-5, started 14:23:25
exp 5-6, started 14:23:28
exp 6-1, started 14:23:30
exp 6-2, started 14:23:33
exp 6-3, started 14:23:35
exp 6-4, started 14:23:38
exp 6-5, started 14:23:40
exp 6-6, started 14:23:42
exp 7-1, started 14:23:45
exp 7-2, started 14:23:47
exp 7-3, sta

In [27]:
# count the assigned peaks
assigned_peaks = con.con.execute("SELECT peak_id, assignment, lp.experiment_id FROM lcms_peaks_assignment JOIN lcms_peaks lp on lcms_peaks_assignment.peak_id = lp.id").fetchall()
df = pd.DataFrame(assigned_peaks, columns=["peak_id", "assignment", "experiment_id"])
# count the number of reaction each species is found in
df[["assignment", "experiment_id"]].drop_duplicates().groupby(["assignment"]).count()

Unnamed: 0_level_0,experiment_id
assignment,Unnamed: 1_level_1
A,22943
B,14541
C,3630
D,25151
DMSO,54892
E,27215
F,30292
G,13861
H,5662
I,582


In [30]:
len(df["experiment_id"].unique())

55477

## Peak assignments - Find more contaminants

In [31]:
## get all peaks to identify common contaminants
res = con.con.execute('SELECT lcms_peaks.id, lcms_peaks.experiment_id, lcms_peaks.peak_nr, retention_time_s, area, intensity, signal_to_noise, mz_max, fwhm_min, "%area", "%intensity", a.assignment FROM lcms_peaks LEFT JOIN lcms_peaks_assignment a on lcms_peaks.id = a.peak_id;',
                ).fetchall()
df = pd.DataFrame(
    res,
    columns=[
       "peak_id",
        "reaction_id",
        "peak_nr",
        "retention_time_s",
        "area",
        "intensity",
        "signal_to_noise",
        "mz_max",
        "fwhm_min",
        "%area",
        "%intensity",
        "assignment",
    ]
)

In [32]:
df.loc[df.assignment.isna(), "mz_max"].round(2).value_counts()

100.11    194848
128.11     78793
111.04     70752
99.53      61966
84.08      54308
           ...  
553.83         1
756.27         1
766.21         1
79.08          1
253.75         1
Name: mz_max, Length: 27420, dtype: int64

#### m/z 128.11

In [33]:
# what mass exactly do we observe?
df.loc[df["mz_max"].between(128.105, 128.109), "mz_max"].describe()

count    78398.000000
mean       128.106494
std          0.000541
min        128.105000
25%        128.106200
50%        128.106400
75%        128.106700
max        128.109000
Name: mz_max, dtype: float64

In [34]:
# when does the 128.1064 peak occur?
df.loc[df["mz_max"].between(128.105, 128.109), "retention_time_s"].describe()

count    78398.000000
mean       202.097936
std         44.703646
min         44.000000
25%        183.000000
50%        185.000000
75%        189.000000
max        388.000000
Name: retention_time_s, dtype: float64

An m/z of 128.1064 corrsponds to C7H13NO (calc m/z 128.1070)
This could e.g. be 2-azacyclooctanone.

The peak usually occurs after 184 seconds. We will assign anything with ±15s and ±0.002 m/z to this formula


In [35]:
# assign
peak_ids = df.loc[df["mz_max"].between(128.105, 128.109) & df["retention_time_s"].between(169, 199), "peak_id"].to_numpy()
with con.con:
    con.con.executemany('INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, "common contaminant C7H13NO");', peak_ids[:, None].tolist())

In [36]:
# assign
df.loc[df["mz_max"].between(128.105, 128.109) & ~df["retention_time_s"].between(169, 199)]

Unnamed: 0,peak_id,reaction_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,%intensity,assignment
444,445,86392,50,302,1399,3947,1.7,128.1064,0.013,0.053290,4.06,
485,486,86413,32,249,4175,2389,21.6,128.1066,0.029,0.053340,0.13,
627,628,86201,13,201,29472,10757,135.0,128.1066,0.037,0.204459,0.24,
657,658,86201,43,310,476,1841,23.1,128.1065,0.004,0.003304,0.04,
659,660,86201,45,314,2394,1152,15.5,128.1063,0.037,0.016613,0.03,
...,...,...,...,...,...,...,...,...,...,...,...,...
3116160,3116161,70085,41,268,1512,1794,17.3,128.1077,0.017,0.006253,0.06,
3116402,3116403,70001,17,208,2132,1199,14.3,128.1060,0.022,0.009843,0.03,
3117415,3117416,69925,18,200,32,2,209.7,128.1065,0.061,0.000152,0.00,
3117549,3117550,69916,22,200,33,2,297.4,128.1064,0.141,0.000170,0.00,


#### m/z 185.11

In [37]:
# what mass exactly do we observe?
df.loc[df["mz_max"].between(185.10, 185.12), "mz_max"].describe()

count    53431.000000
mean       185.114160
std          0.000348
min        185.106300
25%        185.113900
50%        185.114100
75%        185.114400
max        185.118500
Name: mz_max, dtype: float64

In [38]:
# when does the 185.1141 peak occur?
df.loc[df["mz_max"].between(185.1121, 185.1161), "retention_time_s"].describe()

count    53303.000000
mean       196.797460
std          4.850153
min         64.000000
25%        193.000000
50%        198.000000
75%        200.000000
max        326.000000
Name: retention_time_s, dtype: float64

An m/z of 185.1141 could correspond to C8H18O3Na+ (calc m/z 185.1148)
This could e.g. be diethyl-diethyleneglycol (or other short PEG chains) which are common LCMS contaiminants

The peak usually occurs after 198 seconds. We will assign anything with ±15s and ±0.002 m/z to this formula


In [39]:
# assign
peak_ids = df.loc[df["mz_max"].between(185.1128, 185.1168) & df["retention_time_s"].between(183, 213), "peak_id"].to_numpy()
with con.con:
    con.con.executemany('INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, "common PEG contaminant");', peak_ids[:, None].tolist())

#### m/z 239.09

In [40]:
# what mass exactly do we observe?
df.loc[df["mz_max"].between(239.08, 239.10), "mz_max"].describe()

count    37169.000000
mean       239.088666
std          0.000433
min        239.080700
25%        239.088400
50%        239.088600
75%        239.088900
max        239.099500
Name: mz_max, dtype: float64

In [41]:
# when does the 239.0885 peak occur?
df.loc[df["mz_max"].between(239.0865, 239.0905), "retention_time_s"].describe()

count    36976.000000
mean       196.820532
std          3.800570
min         46.000000
25%        196.000000
50%        197.000000
75%        198.000000
max        297.000000
Name: retention_time_s, dtype: float64

An m/z of 239.0885 could correspond to many different compounds.
The peak usually occurs after 196 seconds. We will assign anything with ±15s and ±0.002 m/z to "common contaminant"


In [42]:
# assign
peak_ids = df.loc[df["mz_max"].between(239.0865, 239.0905) & df["retention_time_s"].between(181, 211), "peak_id"].to_numpy()
with con.con:
    con.con.executemany('INSERT INTO lcms_peaks_assignment (peak_id, assignment) VALUES (?, "common contaminant");', peak_ids[:, None].tolist())

## Identify systematic mass differences [DRAFT]
Here's an idea how to go about identifying systematic mass differences:
Identify those records, where a certain mass difference (e.g. delta_T) is within a narrow span, but which use different (relevant) building blocks, in the example different terminators.
Rationale: If the observed m/z is dependent on terminator mass, the responsible species contains the terminator in some form.

In [43]:
reaction_ids = con.get_reaction_ids_for_building_block(filter_exp_nr=(4, 29))
df = pd.concat([con.get_lcms_peaks(i, with_delta=True, with_assignment=True, with_building_blocks=True) for i in reaction_ids]).reset_index(drop=True)

In [44]:
# filter a bit: only unidentified peaks, and only those with a retention time > 4 min (240 s), and only with an m/z over 200
df = df.loc[df["assignment"].isna() & (df["retention_time_s"] > 240) & (df["mz_max"] > 200)]
df

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
2,522750,10578,187,343,5609629,2104344,826.3,259.1262,0.041,86.13517,...,40.987512,-239.035070,-285.040550,-267.029985,-39.914227,-31.968849,-124.994645,-96.999730,-84.021279,
4,524156,10579,115,245,371927,72549,35.9,274.2743,0.088,5.57669,...,56.135612,-187.861818,-233.867298,-215.856733,11.259025,55.229555,-109.846545,-81.851630,-32.848027,
7,516525,10580,162,362,917843,223037,3555.3,588.0722,0.060,13.73421,...,369.933512,48.004524,1.999045,20.009610,247.125368,213.164340,203.951355,231.946270,203.018316,
8,516528,10580,165,362,520875,203617,89.5,588.0721,0.051,7.79416,...,369.933412,48.004424,1.998945,20.009510,247.125268,213.164240,203.951255,231.946170,203.018216,
10,532645,10581,110,274,362967,89435,31.9,405.1966,0.068,5.07251,...,187.057912,-109.991220,-155.996699,-137.986134,89.129624,80.048452,21.075755,49.070670,45.022572,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205656,401424,85521,28,267,444307,136591,59.2,475.1570,0.043,25.84048,...,291.080167,55.998207,9.992728,28.003293,221.057200,198.074229,131.027734,159.022648,205.082271,
205657,401428,85521,32,274,140629,47175,20.2,473.1438,0.041,8.17887,...,289.066967,53.985007,7.979528,25.990093,219.044000,196.061029,129.014534,157.009448,203.069071,
205658,401431,85521,35,293,124932,41486,17.4,691.2559,0.043,7.26593,...,507.179067,272.097107,226.091628,244.102193,437.156100,414.173129,347.126634,375.121548,421.181171,
205659,401434,85521,38,305,131011,48852,20.3,711.2244,0.043,7.61949,...,527.147567,292.065607,246.060128,264.070693,457.124600,434.141629,367.095134,395.090048,441.149671,


In [45]:
# get unique differences
t_diff = df.round(2).groupby("delta_T")["terminator"].unique()
t_diff

delta_T
-91.91         [T26]
-90.93         [T26]
-84.94         [T26]
-81.89         [T26]
-80.98         [T26]
             ...    
 803.27    [T21, T2]
 804.26        [T39]
 807.24        [T18]
 821.26         [T1]
 841.22         [T1]
Name: terminator, Length: 6404, dtype: object

In [46]:
t_diff.loc[t_diff.apply(lambda x: len(x) > 20)]

delta_T
80.06     [T25, T34, T18, T22, T36, T1, T28, T32, T31, T...
103.01    [T18, T12, T2, T11, T29, T19, T33, T5, T26, T4...
110.02    [T25, T39, T34, T17, T36, T18, T22, T1, T23, T...
153.02    [T25, T1, T8, T18, T22, T7, T3, T12, T2, T11, ...
166.92    [T31, T40, T20, T7, T3, T34, T39, T17, T36, T1...
Name: terminator, dtype: object

In [47]:
df.loc[df["delta_T"].between(166.915, 166.925)]

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
6397,435466,13160,42,344,681948,277515,108.0,349.9781,0.043,21.03909,...,219.891842,-109.070389,-155.075868,-137.065304,1.998025,50.893221,8.964951,36.959866,90.888136,
6398,435468,13160,44,344,895042,284261,3795.0,349.9781,0.046,27.61334,...,219.891842,-109.070389,-155.075868,-137.065304,1.998025,50.893221,8.964951,36.959866,90.888136,
6403,452354,13161,48,274,417192,112765,1181.0,360.9690,0.053,6.36450,...,230.882742,-109.070321,-155.075801,-137.065236,1.998093,39.902457,19.955851,47.950766,90.888204,
6404,452355,13161,49,274,398529,116612,48.7,360.9689,0.053,6.07978,...,230.882642,-109.070421,-155.075901,-137.065336,1.997993,39.902357,19.955751,47.950666,90.888104,
6408,443039,13162,55,335,403455,172618,56.5,325.9238,0.042,5.99284,...,195.837542,-109.070545,-155.076025,-137.065460,1.997869,74.947209,-15.089349,12.905566,90.88798,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203114,379448,84915,51,371,414345,142573,2415.8,387.9546,0.044,6.21591,...,114.810102,-252.128952,-298.134431,-280.123867,1.997705,12.916081,-96.116792,-68.121878,-52.170427,
203125,376386,84917,45,343,825214,330219,124.2,349.9777,0.044,11.96499,...,76.833202,-252.129032,-298.134511,-280.123947,1.997625,50.892821,-134.093692,-106.098778,-52.170507,
203126,376387,84917,46,343,1073677,340251,2920.8,349.9778,0.046,15.56751,...,76.833302,-252.128932,-298.134411,-280.123847,1.997725,50.892921,-134.093592,-106.098678,-52.170407,
204428,393234,85237,48,344,662514,261112,74.3,349.9777,0.044,9.35856,...,165.900867,-163.061367,-209.066846,-191.056282,1.997625,50.892821,-45.026027,-17.031113,36.897158,


These are actually all caused by bromine isotope 81Br. The mass difference to 79Br is 1.9979535, which coincides with delta_T.

In [62]:
building_blocks = [x[0] for x in con.con.execute("SELECT short FROM building_block_shorts;").fetchall()]
has_bromine = ["Br" in con.get_smiles(bb) for bb in building_blocks]
building_blocks_with_bromine = [x[0] for x in zip(building_blocks, has_bromine) if x[1]]
building_blocks_with_bromine

['I4', 'I11', 'I23', 'I53', 'I68', 'I78', 'M7', 'T5', 'T15', 'I53']

In [63]:
df.loc[df["delta_D"].between(1.9976, 1.9982) & (df["initiator"].isin(building_blocks_with_bromine) | df["terminator"].isin(building_blocks_with_bromine))]

Unnamed: 0,id,experiment_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_bAA,delta_A,delta_B,delta_C,delta_D,delta_E,delta_F,delta_G,delta_H,assignment
6397,435466,13160,42,344,681948,277515,108.0,349.9781,0.043,21.03909,...,219.891842,-109.070389,-155.075868,-137.065304,1.998025,50.893221,8.964951,36.959866,90.888136,
6398,435468,13160,44,344,895042,284261,3795.0,349.9781,0.046,27.61334,...,219.891842,-109.070389,-155.075868,-137.065304,1.998025,50.893221,8.964951,36.959866,90.888136,
6403,452354,13161,48,274,417192,112765,1181.0,360.9690,0.053,6.36450,...,230.882742,-109.070321,-155.075801,-137.065236,1.998093,39.902457,19.955851,47.950766,90.888204,
6404,452355,13161,49,274,398529,116612,48.7,360.9689,0.053,6.07978,...,230.882642,-109.070421,-155.075901,-137.065336,1.997993,39.902357,19.955751,47.950666,90.888104,
6408,443039,13162,55,335,403455,172618,56.5,325.9238,0.042,5.99284,...,195.837542,-109.070545,-155.076025,-137.065460,1.997869,74.947209,-15.089349,12.905566,90.88798,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203114,379448,84915,51,371,414345,142573,2415.8,387.9546,0.044,6.21591,...,114.810102,-252.128952,-298.134431,-280.123867,1.997705,12.916081,-96.116792,-68.121878,-52.170427,
203125,376386,84917,45,343,825214,330219,124.2,349.9777,0.044,11.96499,...,76.833202,-252.129032,-298.134511,-280.123947,1.997625,50.892821,-134.093692,-106.098778,-52.170507,
203126,376387,84917,46,343,1073677,340251,2920.8,349.9778,0.046,15.56751,...,76.833302,-252.128932,-298.134411,-280.123847,1.997725,50.892921,-134.093592,-106.098678,-52.170407,
204428,393234,85237,48,344,662514,261112,74.3,349.9777,0.044,9.35856,...,165.900867,-163.061367,-209.066846,-191.056282,1.997625,50.892821,-45.026027,-17.031113,36.897158,


In [52]:
[print(con.get_smiles(s)) for s in ["I11"]]


O=C(c1ccc(Br)nc1)[B-](F)(F)F.[K+]


[None]

In [48]:

con.get_product_smiles(13160)

('COc1ccccc1-c1nnc([C@@H]2CCC[C@@H]2NC(=O)c2ccc(Br)cn2)s1',
 'COc1ccccc1C1=NNC(C(=O)O)([C@@H]2CCC[C@@H]2NC(=O)c2ccc(Br)cn2)S1',
 'COc1ccccc1C1=N[N+]2=C(c3ccc(Br)cn3)N[C@H]3CCC[C@H]3C2(C(=O)[O-])S1',
 'COc1ccccc1-c1nnc(-c2ccc(Br)cn2)s1',
 'COc1ccccc1-c1nnc(-c2ccccc2OC)s1',
 'O=C(O)C(=O)[C@@H]1CCC[C@@H]1NC(=O)c1ccc(Br)cn1',
 'O=C(N[C@H]1CCC[C@H]1C(=O)O)c1ccc(Br)cn1',
 'COc1ccccc1-c1nnc(C2=CCCC2)s1')

In [93]:
# There is an inconsistency here!
# The reaction with id 20838 uses initiator I3, and according to the SMILES, this is 2-Pyr005.
# However, according to the building blocks table, I3 is Ph002
# The culprit is a change that was made after the initial experiment planning that is not reflected in the experiments table

# let's check where the things don't align
# this pretty confusing query gets the latest version of each building block assignment
bb_mapping = con.con.execute(
"""
SELECT b.short, b.long
FROM building_block_shorts AS b
JOIN (
  SELECT short, MAX(first_use_exp_nr) AS max_exp
  FROM building_block_shorts
  GROUP BY short
) AS max_exp_table
ON b.short = max_exp_table.short AND b.first_use_exp_nr = max_exp_table.max_exp;
"""
).fetchall()
experiment_bb_mapping = con.con.execute("SELECT id, exp_nr, plate_nr, initiator, monomer, terminator, initiator_long, monomer_long, terminator_long FROM experiments;").fetchall()

In [94]:
len(bb_mapping)

193

In [95]:
bb_mapping = dict(bb_mapping)

In [96]:
bb_mapping

{'I1': '2-Pyr002',
 'I2': '2-Pyr003',
 'I4': '2-Pyr006',
 'I5': '2-Pyr007',
 'I6': '2-Pyr008',
 'I7': '2-Pyr009',
 'I8': '2-Pyr010',
 'I9': '2-Thio001',
 'I10': '3-Fur001',
 'I11': '3-Pyr002',
 'I12': '3-Pyr003',
 'I13': '3-Pyr004',
 'I14': '3-Thio001',
 'I15': '4-Pym001',
 'I17': '4-Pyrazole001',
 'I18': '4-Pyrazole002',
 'I19': '5-Quin001',
 'I20': '6-Quin001',
 'I21': '8-Quin003',
 'I22': '8-Quin004',
 'I23': '8-Quin005',
 'I24': '8-Quin008',
 'I25': 'Al001',
 'I26': 'Al002',
 'I27': 'Al003',
 'I28': 'Al004',
 'I29': 'Al005',
 'I30': 'Al007',
 'I31': 'Al013',
 'I32': 'Al036',
 'I33': 'Al038',
 'I34': 'BiAl001',
 'I35': 'BiAl007',
 'I36': 'BiAl008',
 'I37': 'BiAl009',
 'I38': 'BiPh001',
 'I39': 'BiPh002',
 'I41': 'BiPh004',
 'I43': 'BiPh006',
 'I44': 'BiPh007',
 'I45': 'BiPh009',
 'I46': 'BiPh010',
 'I47': 'BiPh011',
 'I49': 'BiPyr002',
 'I51': 'BiPyr004',
 'I52': 'Ph001',
 'I54': 'Ph004',
 'I55': 'Ph005',
 'I56': 'Ph006',
 'I57': 'Ph007',
 'I58': 'Ph009',
 'I59': 'Ph010',
 'I60': 'P

In [77]:
misaligned = []
for exp in experiment_bb_mapping:
    for i in range(3, 6):
        true_long = bb_mapping[exp[i]]
        if true_long != exp[i+3]:
            misaligned.append([exp[0], exp[1], exp[2], exp[i], true_long, exp[i+3]])
misaligned_df = pd.DataFrame(misaligned, columns=["experiment_id", "exp_nr", "plate_nr", "short", "bb_long", "exp_long"])

In [78]:
misaligned_df.drop_duplicates(subset=["exp_nr", "plate_nr", "short"])

Unnamed: 0,experiment_id,exp_nr,plate_nr,short,bb_long,exp_long
0,2981,2,4,I42,Ph033,BiPh005
20,3301,2,5,I42,Ph033,BiPh005
40,3621,2,6,I42,Ph033,BiPh005
60,6942,3,4,I61,Ph017,Ph015
80,7262,3,5,I61,Ph017,Ph015
...,...,...,...,...,...,...
5120,87494,2,2,I42,Ph033,BiPh005
5140,87814,2,3,I42,Ph033,BiPh005
5160,88254,3,1,I61,Ph017,Ph015
5180,88574,3,2,I61,Ph017,Ph015


In [79]:
# fortunately, this problem is isolated to the experiments table. The building blocks table is correct. The lcms submission files are also correct as they queried `compound_mapping.txt` and the virtuallibrary table at execution which are both correct.


In [94]:
## read data for individual building blocks
res = con.con.execute("SELECT experiment_id, peak_id, delta_I, delta_M, delta_T, delta_Iacid, delta_bAA, delta_IM, delta_IT, delta_MT, delta_IMT FROM lcms_peaks_differences WHERE experiment_id IN (SELECT id FROM experiments WHERE lab_journal_number=? AND well LIKE ?);",
                ("JG319", "J%")).fetchall()
df = pd.DataFrame(res, columns=["experiment_id", "peak_id", "delta_I", "delta_M", "delta_T", "delta_Iacid", "delta_bAA", "delta_IM", "delta_IT", "delta_MT", "delta_IMT"])           

In [95]:
# get the number of experiments
len(df.experiment_id.unique())

20

In [119]:
peaks = pd.concat([con.get_lcms_peaks(("JG319", well), with_delta=True, with_assignment=True) for well in [f"J{i}" for i in range(3,13)]]).reset_index(drop=True)
peaks.loc[peaks.assignment.isna()]

Unnamed: 0,peak_id,reaction_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_I,delta_M,delta_T,delta_Iacid,delta_bAA,delta_IM,delta_IT,delta_MT,delta_IMT,assignment
2,1566991,36115,39,377,602429,209255,49.2,284.2946,0.083,6.04212,...,29.339927,-55.859735,100.240687,81.349527,68.192695,-310.814408,-154.713986,-239.913648,-494.868321,
5,1567883,36116,16,248,446580,134828,33.2,455.1892,0.045,9.20807,...,200.234527,115.034865,261.164615,252.244127,239.087295,-139.919808,6.209942,-78.989719,-333.944393,
6,1567885,36116,18,267,512274,155375,38.3,475.1581,0.049,10.56261,...,220.203427,135.003765,281.133515,272.213027,259.056195,-119.950908,26.178842,-59.020819,-313.975493,
7,1567895,36116,28,366,248728,509717,13.6,644.4946,0.014,5.12854,...,389.539927,304.340265,450.470015,441.549527,428.392695,49.385592,195.515342,110.315681,-144.638993,
10,1569205,36117,25,326,3668452,173897,35.9,302.0956,0.264,31.352374,...,47.140927,-38.058735,9.984176,99.150527,85.993695,-293.013408,-244.970498,-330.170159,-585.124832,
11,1569219,36117,39,373,1239577,319141,4929.7,417.1383,0.054,10.594032,...,162.183627,76.983965,125.026876,214.193227,201.036395,-177.970708,-129.927798,-215.127459,-470.082132,
12,1569220,36117,40,373,913600,314930,62.2,417.1383,0.053,7.808075,...,162.183627,76.983965,125.026876,214.193227,201.036395,-177.970708,-129.927798,-215.127459,-470.082132,
15,1578084,36118,15,285,831409,300114,103.2,244.1076,0.042,5.62452,...,-10.847073,-96.046735,46.074423,41.162527,28.005695,-351.001408,-208.880251,-294.079912,-549.034585,
16,1578101,36118,32,336,914002,340060,115.3,278.0955,0.04,6.18326,...,23.140827,-62.058835,80.062323,75.150427,61.993595,-317.013508,-174.892351,-260.092012,-515.046685,
17,1578113,36118,44,382,907753,397487,82.8,341.2663,0.044,6.14099,...,86.311627,1.111965,143.233123,138.321227,125.164395,-253.842708,-111.721551,-196.921212,-451.875885,


In [30]:
peaks.loc[peaks["%area"] > 30]

Unnamed: 0,peak_id,reaction_id,peak_nr,retention_time_s,area,intensity,signal_to_noise,mz_max,fwhm_min,%area,...,delta_I,delta_M,delta_T,delta_Iacid,delta_bAA,delta_IM,delta_IT,delta_MT,delta_IMT,assignment
0,1567881,36116,14,183,4255522,385568,96.8,128.1065,0.172,87.74478,...,-126.848173,-212.047835,-65.918085,-74.838573,-87.995405,-467.002508,-320.872758,-406.072419,-661.027093,
1,1567882,36116,15,197,4849886,367399,92.4,239.0885,0.203,100.0,...,-15.866173,-101.065835,45.063915,36.143427,22.986595,-356.020508,-209.890758,-295.090419,-550.045093,
0,1569188,36117,8,181,3901239,296551,63.4,128.1066,0.209,33.341884,...,-126.848073,-212.047735,-164.004824,-74.838473,-87.995305,-467.002408,-418.959498,-504.159159,-759.113832,
1,1569190,36117,10,197,3941213,282067,60.3,185.1143,0.209,33.683523,...,-69.840373,-155.040035,-106.997124,-17.830773,-30.987605,-409.994708,-361.951798,-447.151459,-702.106132,
2,1569205,36117,25,326,3668452,173897,35.9,302.0956,0.264,31.352374,...,47.140927,-38.058735,9.984176,99.150527,85.993695,-293.013408,-244.970498,-330.170159,-585.124832,
0,1564024,36121,1,45,4960112,173123,46.6,84.0811,0.424,100.0,...,-170.873573,-256.073235,-58.957497,-118.863973,-132.020805,-511.027908,-313.912171,-399.111832,-654.066505,
1,1564038,36121,15,183,3863387,304394,81.3,128.1067,0.196,77.88911,...,-126.847973,-212.047635,-14.931897,-74.838373,-87.995205,-467.002308,-269.886571,-355.086232,-610.040905,
2,1564039,36121,16,197,4216812,306410,81.8,239.0885,0.202,85.01445,...,-15.866173,-101.065835,96.049903,36.143427,22.986595,-356.020508,-158.904771,-244.104432,-499.059105,
0,1568572,36103,8,182,3761834,283929,135.0,128.1066,0.205,40.13552,...,-126.848073,-212.047735,-15.921178,-74.838473,-87.995305,-467.002408,-270.875852,-356.075513,-611.030186,
2,1568574,36103,10,198,3417886,269441,127.9,185.1141,0.193,36.4659,...,-69.840573,-155.040235,41.086322,-17.830973,-30.987805,-409.994908,-213.868352,-299.068013,-554.022686,


In [21]:
for col in peaks.columns:
    if col.startswith("delta_"):
        print(peaks[col].value_counts().iloc[:2])

-126.848173    6
-69.840473     6
Name: delta_I, dtype: int64
-212.047835    6
-155.040135    6
Name: delta_M, dtype: int64
 125.026876    2
-18.833512     2
Name: delta_T, dtype: int64
-74.838573    6
-17.830873    6
Name: delta_Iacid, dtype: int64
-87.995405    6
-30.987705    6
Name: delta_bAA, dtype: int64
-467.002508    6
-409.994808    6
Name: delta_IM, dtype: int64
-129.927798    2
-273.788186    2
Name: delta_IT, dtype: int64
-215.127459    2
-358.987847    2
Name: delta_MT, dtype: int64
-470.082132    2
-613.942520    2
Name: delta_IMT, dtype: int64


In [19]:
(peaks.delta_M.round(3) == 0).value_counts()

False    48
Name: delta_M, dtype: int64

In [20]:
peaks.delta_T.round(2).value_counts()

 125.03    2
-18.83     2
 80.06     2
 25.12     2
 330.11    1
 276.11    1
-58.96     1
-14.93     1
 96.05     1
 350.13    1
 312.15    1
 332.12    1
-55.95     1
-31.89     1
 206.15    1
 51.13     1
-15.92     1
 168.19    1
 41.09     1
 110.04    1
 219.05    1
-75.84     1
 256.14    1
 258.15    1
 1.06      1
 274.15    1
 100.24    1
-65.92     1
 45.06     1
 261.16    1
 281.13    1
 450.47    1
-164.00    1
-107.00    1
 9.98      1
-69.93     1
-12.92     1
 46.07     1
 143.23    1
-24.94     1
 32.07     1
-68.93     1
 20.07     1
-3.75      1
Name: delta_T, dtype: int64