# Extract MoBiAS PDF reports

The PDF reports are a great source of additional information as they contain data for peaks that may not have been assigned to any known product.
Mining these reports is much faster than reprocessing the raw LCMS data.

Here, we are particularly interested in identifying:
1. leftover starting materials
2. systematic side products that we have not previously been looking for


In [1]:
import sys
import pathlib
import re
from datetime import datetime

sys.path.insert(0, str(pathlib.Path().resolve().parents[1]))

from pypdf import PdfReader
import pandas as pd
import numpy as np

from src.util.db_utils import SynFermDatabaseConnection
from src.definitions import DATA_DIR, PLATE_LIST_PATH
from src.util.rdkit_util import smiles_to_lcms_mass

## Extract PDF data

In [2]:
# database handle; used below to look up reaction IDs / SMILES and to store the extracted peaks
con = SynFermDatabaseConnection()

In [3]:
def import_lcms_full_report(path) -> pd.DataFrame:
    """
    Read the peak summary table from an LCMS full-report PDF.

    The first page of the PDF is skipped. The second page is always read; every further
    page is read only while its first text line is the peak-table header, so the first
    page without that header ends the table. From the collected text, only lines that
    begin with a numeric character are kept as data rows, which drops the repeated table
    headers and any other non-numeric lines such as footers.

    Args:
        path: Location of the PDF file, handed to ``PdfReader`` unchanged.

    Returns:
        pandas.DataFrame: One row per kept line, all values cast to float, with the columns
            ``#``, ``RT[min]``, ``Area``, ``I``, ``S/N``, ``max_m/z``, ``FWHM[min]``,
            ``Area%`` and ``Int%``. The frame is empty if text was found but none of its
            lines starts with a numeric character.

    Raises:
        RuntimeError: If no text lines at all could be extracted from the pages that were read.
    """
    # first line of every page that belongs to the peak summary table
    page_header = '# RT [min] Area I S/N Max. m/z FWHM [min] Area % Int. %'

    # set up pdf reader
    reader = PdfReader(path)
    lines = []
    found_data = False
    # iterate from second page until entire peak summary table is read completely
    for i in range(1, len(reader.pages)):
        # a page without extractable text gives an empty list instead of raising on [0]
        line_list = (reader.pages[i].extract_text() or "").splitlines()
        starts_with_header = bool(line_list) and line_list[0] == page_header
        if found_data and not starts_with_header:  # stop when not encountering another data header
            break
        found_data = True
        lines += line_list

    if not lines:
        raise RuntimeError("No data extracted")

    # column names are the table header with the inner spaces removed, so that they split cleanly
    column_names = '# RT[min] Area I S/N max_m/z FWHM[min] Area% Int%'.split()
    # keep data rows only (drops footers and repeated data headers);
    # slicing with [:1] keeps empty lines from raising IndexError
    rows = [line.split() for line in lines if line[:1].isnumeric()]

    # assemble DataFrame from data
    return pd.DataFrame(rows, columns=column_names).astype("float")

In [None]:
%%capture output
# Walk over every experiment/plate, parse all full-report PDFs found for it and
# store the extracted peaks in the database. Files that fail are collected, not fatal.

# import the plate list to obtain LCMS identifier - plate_nr relation
plate_list = pd.read_csv(PLATE_LIST_PATH)
# we will record any files that cause exceptions for manual inspection
files_with_exceptions = []

# iterate over all experiments/plates (experiments 1-29, plates 1-6)
for exp_nr in range(1, 30):
    for plate_nr in range(1, 7):
        # print progress indicator
        now = datetime.now()
        print(f"exp {exp_nr}-{plate_nr}, started {now.strftime('%H:%M:%S')}")
 
        # the LCMS identifier is the part of results_file_name before the first underscore.
        # NOTE(review): .item() requires exactly one matching row in the plate list and raises otherwise;
        # this line is outside the try block below, so such an error stops the whole loop - confirm this is intended
        lcms_id = plate_list.loc[(plate_list["exp_nr"] == exp_nr) & (plate_list["plate_nr"] == plate_nr), "results_file_name"].item().split("_")[0]
        exp_path = DATA_DIR / "pdf_reports" / lcms_id
        full_report_paths = list(exp_path.glob("*_LCMS_Fullreport.pdf"))
        
        for path in full_report_paths:
            try:
                # get well from filename, e.g. "_P1-A-3_" gives well "A3"
                # (if the pattern is not found, match is None and the next line raises;
                # that error is caught by the except clause below)
                regex = r'_P\d{1}-[A-Z]-\d{1,2}_'
                match = re.search(regex, path.name)
                well = "".join(match.group().strip("_").split("-")[1:])
                reaction_id = con.get_reaction_id((exp_nr, plate_nr, well))
                df = import_lcms_full_report(path)
                # we persist this to the database for re-use
                # first reformat df to fit DB: the column order must match the INSERT statement below
                df.insert(0, "reaction_id", reaction_id)
                # retention time in seconds; the cast to int drops the fractional part (no rounding)
                df.insert(2, "retention_time_s", (df["RT[min]"] * 60).astype("int"))
                df = df.drop(columns="RT[min]")
                df = df.rename(columns={"#": "peak_number", 
                           "Area": "area", 
                           "I": "intensity",
                           "S/N": "signal_to_noise",
                           "max_m/z": "mz_max",
                           "FWHM[min]": "fwhm_min", 
                           "Area%": "%area",
                           "Int%": "%intensity"
                          }).astype({"peak_number": "int",
                                     "area": "int",
                                     "intensity": "int"})
                # write all extracted peaks to DB
                # NOTE(review): values are bound by position, so the reaction_id column ends up in the
                # DB column named experiment_id - confirm against the lcms_peaks schema
                # NOTE(review): to_numpy() on a frame with mixed int/float columns presumably yields one
                # common float dtype, so the int casts above would not survive into the tuples - confirm
                # NOTE(review): presumably con.con is a sqlite3 connection, in which case the with-block
                # commits on success and rolls back on error - confirm
                with con.con:
                    con.con.executemany(
                        'INSERT INTO lcms_peaks (experiment_id, peak_nr, retention_time_s, area, intensity, signal_to_noise, mz_max, fwhm_min, "%area", "%intensity") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);', 
                        [tuple(row) for row in df.to_numpy()]
                    )
            except Exception as e:
                # best effort: report the file, remember it for manual inspection, and carry on
                print(f"Something went wrong for {str(path)}")
                files_with_exceptions.append(path)
                print(e)

In [6]:
# show what the import cell printed (it was captured into `output` by %%capture)
print(output)

exp 1-1, started 16:08:03
exp 1-2, started 16:08:21
exp 1-3, started 16:08:38
exp 1-4, started 16:08:55
exp 1-5, started 16:09:14
exp 1-6, started 16:09:33
exp 2-1, started 16:09:59
exp 2-2, started 16:10:17
exp 2-3, started 16:10:35
exp 2-4, started 16:10:53
exp 2-5, started 16:11:11
exp 2-6, started 16:11:31
exp 3-1, started 16:11:50
exp 3-2, started 16:12:08
exp 3-3, started 16:12:26
exp 3-4, started 16:12:45
exp 3-5, started 16:13:04
exp 3-6, started 16:13:22
exp 4-1, started 16:13:42
exp 4-2, started 16:13:59
exp 4-3, started 16:14:17
exp 4-4, started 16:14:35
exp 4-5, started 16:14:52
exp 4-6, started 16:15:10
exp 5-1, started 16:15:29
exp 5-2, started 16:15:48
exp 5-3, started 16:16:07
exp 5-4, started 16:16:26
exp 5-5, started 16:16:45
exp 5-6, started 16:17:04
exp 6-1, started 16:17:23
exp 6-2, started 16:17:41
exp 6-3, started 16:17:58
exp 6-4, started 16:18:17
exp 6-5, started 16:18:34
exp 6-6, started 16:18:53
exp 7-1, started 16:19:10
exp 7-2, started 16:19:26
exp 7-3, sta

In [6]:
# Which of the peaks are unexpected / not already explained?
# Collect every m/z value that we can already account for.
mz_dmso = [79.0212, 101.0032, 157.0351]  # DMSO: M+H+, M+Na+, 2M+H+
mz_lock = [142.1590, 322.0481]  # tetramethylpiperidine, hexamethoxyphosphazene
mz_is = 361.1201  # fenofibrate

# products recorded for well A3 of the plate that exp_nr / plate_nr currently refer to
known_product_smiles = con.get_product_smiles((exp_nr, plate_nr), "A3")
known_product_mzs = list(map(smiles_to_lcms_mass, known_product_smiles))

# one flat list of all explained m/z values
known_mzs = [*mz_dmso, *mz_lock, mz_is, *known_product_mzs]

In [7]:
# only significant peaks (more than 5 area-%)
_significant = df["Area%"] > 5
# a peak counts as explained if its m/z lies within 5 ppm (relative) of any known m/z;
# [:, None] compares every peak against every known value
_explained = np.isclose(
    df["max_m/z"].to_numpy()[:, None], known_mzs, rtol=5e-6, atol=0
).any(axis=1)

# keep the significant peaks that are not already explained
unexplained_peaks = df.loc[_significant & ~_explained]
unexplained_peaks

Unnamed: 0_level_0,RT[min],Area,I,S/N,max_m/z,FWHM[min],Area%,Int%
#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9,2.156,258954.7,62241.0,17.8,146.1175,0.059,5.39209,3.71
12,2.903,649078.1,146339.0,42.1,365.1712,0.081,13.51545,8.73
16,3.0,1114304.0,440712.0,130.7,365.1709,0.038,23.20264,26.28
17,3.395,2613391.0,1012323.0,306.9,332.0521,0.037,54.41742,60.36
22,3.592,1006776.0,445752.0,132.9,459.1523,0.034,20.96362,26.58
25,3.73,815462.1,335406.0,100.8,505.1579,0.037,16.97999,20.0
36,5.032,415270.5,143717.0,1209.5,250.9768,0.043,8.64698,8.57
37,5.033,390619.3,139087.0,41.9,250.9768,0.043,8.13368,8.29


## Identify starting material peaks

In [9]:
# starting materials for well K7 of the plate that exp_nr / plate_nr currently refer to
sms = con.get_starting_materials_for_reaction((exp_nr, plate_nr, "K7"))
sms

['O=C(c1ccc(C2=NCCO2)cc1)[B-](F)(F)F.[K+]',
 'CC(C)C[C@H]1C[C@]2(O[NH2+]1)OC1(CCCCC1)OC2=O.[Cl-]',
 '[Cl-].[NH3+]NC(=S)c1cccs1']

In [12]:
# m/z we would look for to find the third starting material (presumably the [M+H]+ ion - confirm)
smiles_to_lcms_mass(sms[2])

159.00451664409

In [13]:
# check which columns are available on the filtered peak table
unexplained_peaks.columns

Index(['RT[min]', 'Area', 'I', 'S/N', 'max_m/z', 'FWHM[min]', 'Area%', 'Int%'], dtype='object')