# Investigate Ph018

We suspect a problem with Ph018 that we want to look into. One hypothesis is that, instead of the previously assumed `CN(C)Cc1ccc(C(=O)[B-](F)(F)F)cc1.[K+]`, Ph018 is actually the quaternary ammonium compound `C[N+](C)(Cc1ccc(C(=O)[B-](F)(F)F)cc1)Cc1ccc(C(=O)[B-](F)(F)F)cc1.[K+]`.

Here, we want to check if the 50k data contains any products that may arise from this compound.

In [None]:
import sys 
import pathlib
sys.path.append(str(pathlib.Path().absolute().parents[1]))

from rdkit import Chem
import pandas as pd

from src.util.db_utils import SynFermDatabaseConnection

In [None]:
# open the project's database connection; the lookups and SQL queries in the
# following cells all go through this object
con = SynFermDatabaseConnection()

In [None]:
# previous assumption for Ph018: fetch the molecule registered in the database
# under the long name "Ph018" (presumably returned as an RDKit Mol -- confirm in db_utils)
mol_prior = con.get_mol(long="Ph018")
# bare expression as the last line of the cell so the notebook displays it
mol_prior

In [None]:
# alternative hypothesis for Ph018: the quaternary ammonium compound carrying
# two acyl trifluoroborate arms instead of one
smiles_alt = "C[N+](C)(Cc1ccc(C(=O)[B-](F)(F)F)cc1)Cc1ccc(C(=O)[B-](F)(F)F)cc1.[K+]"
mol_alt = Chem.MolFromSmiles(smiles_alt)
# bare expression as the last line of the cell so the notebook displays it
mol_alt

In [None]:
# Get reactants, peaks, mass differences and (where present) assignments for every
# experiment with exp_nr between 4 and 29. Note that the query itself does not
# filter on the initiator; the Ph018 rows are selected from df_all afterwards.
query = """
SELECT
    lp.experiment_id,
    e.initiator, e.monomer, e.terminator, e.lab_journal_number, e.well,
    d.peak_id,
    lp.mz_max, lp.retention_time_s,
    delta_I, delta_M, delta_T, delta_Iacid, delta_bAA, delta_A, delta_B, delta_C, delta_D, delta_E, delta_F, delta_G, delta_H,
    a.assignment
FROM lcms_peaks_differences as d
    LEFT OUTER JOIN lcms_peaks_assignment as a on d.peak_id = a.peak_id
    LEFT JOIN experiments as e on d.experiment_id = e.id
    LEFT JOIN lcms_peaks as lp on d.peak_id = lp.id
WHERE d.experiment_id IN (SELECT id FROM experiments WHERE exp_nr BETWEEN 4 and 29)
ORDER BY delta_A ASC;
"""

# column labels for the DataFrame, in the same order as the SELECT list above
columns = [
    "experiment_id",
    "initiator",
    "monomer",
    "terminator",
    "lab_journal_number",
    "well",
    "peak_id",
    "mz_max",
    "retention_time_s",
    "delta_I",
    "delta_M",
    "delta_T",
    "delta_Iacid",
    "delta_bAA",
    "delta_A",
    "delta_B",
    "delta_C",
    "delta_D",
    "delta_E",
    "delta_F",
    "delta_G",
    "delta_H",
    "assignment",
]

cursor = con.con.execute(query)
res = cursor.fetchall()

df_all = pd.DataFrame(res, columns=columns)
df_all.head()

In [None]:
# restrict to reactions whose initiator is Ph018 (short name I63)
is_ph018 = df_all["initiator"] == "I63"
df = df_all.loc[is_ph018]

In [None]:
# check the frequent assignments among the Ph018 peaks
# (value_counts drops NaN by default, so peaks without an assignment are not listed)
df["assignment"].value_counts()

In [None]:
# keep only the peaks that have no assignment yet; take a copy so that df is an
# independent frame rather than a slice of df_all
unassigned = df["assignment"].isna()
df = df.loc[unassigned].copy()

In [None]:
# round the numeric columns to 3 decimals so that near-identical differences
# collapse onto the same value and can be counted together
df_rounded = df.round(3)

In [None]:
# let's look for frequent differences: print the three most common rounded
# values of every delta_* column
delta_cols = [name for name in df_rounded.columns if name.startswith("delta_")]
for name in delta_cols:
    print(name)
    print(df_rounded[name].value_counts().head(3))

The most frequent differences we observe (in `delta_Iacid`) are I_acid + 295.056 and I_acid + 275.087.

In [None]:
# check I_acid + 295.056: show the unrounded rows whose rounded delta_Iacid
# equals that value
matches_295 = df_rounded["delta_Iacid"] == 295.056
df.loc[matches_295]

In [None]:
# ten most frequent mz_max values (rounded to 3 decimals) among the unassigned Ph018 peaks
df_rounded["mz_max"].value_counts().head(10)

In [None]:
# across all loaded experiments (not just Ph018): which initiators show a peak
# with mz_max inside the window 475.1575-475.1585 (bounds inclusive)?
in_window = df_all["mz_max"].between(475.1575, 475.1585)
df_all.loc[in_window, "initiator"].value_counts()

In [None]:
# Frequency of the combined difference 2 * delta_I + delta_M.
# The sum is computed from the unrounded values in df and rounded once afterwards.
# Adding values that were already rounded reintroduces floating-point noise
# (e.g. 0.1 + 0.2 != 0.3), which can split identical differences into separate
# value_counts bins, and it also accumulates the rounding error of each term.
(2 * df["delta_I"] + df["delta_M"]).round(3).value_counts()