# Product correlations
An obvious question to ask in exploratory data analysis: is there a correlation between the different products?
E.g., do we get product C in the cases where we do not get A?

In [4]:
import pathlib
import sys

# Make the `src` package importable. The directory added is two levels above
# the current working directory, so this relies on the kernel's working
# directory being the folder that contains this notebook.
# NOTE(review): assumes that directory is the repository root — confirm.
_repo_root = str(pathlib.Path().resolve().parents[1])
if _repo_root not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(_repo_root)

import pandas as pd
# Import the subpackage explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded on every SciPy release. This still binds the
# name `scipy`, so the `scipy.stats.pearsonr(...)` calls below work unchanged.
import scipy.stats

from src.definitions import DATA_DIR

In [3]:
# Read the curated dataset CSV into a DataFrame and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV — confirm whether it is needed.
curated_csv = DATA_DIR / "curated_data" / "synferm_dataset_2023-12-20_39486records.csv"
df = pd.read_csv(curated_csv)
df.head()

Unnamed: 0,I_long,M_long,T_long,product_A_smiles,I_smiles,M_smiles,T_smiles,reaction_smiles,reaction_smiles_atom_mapped,experiment_id,...,binary_H,scaled_A,scaled_B,scaled_C,scaled_D,scaled_E,scaled_F,scaled_G,scaled_H,major_A-C
0,2-Pyr003,Fused002,TerABT004,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3cccc(Cl)n3)[C...,O=C(c1cccc(Cl)n1)[B-](F)(F)F.[K+],COc1ccc(CCOC(=O)N2C[C@@H]3NO[C@]4(OC5(CCCCC5)O...,Nc1ccc(F)cc1S,O=C(c1cccc(Cl)n1)[B-](F)(F)F.COc1ccc(CCOC(=O)N...,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,56113,...,0,0.035906,0.003423,0.0,0.021553,0.002986,0.939121,0.90361,0.0,A
1,2-Pyr003,Fused002,TerABT007,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3cccc(Cl)n3)[C...,O=C(c1cccc(Cl)n1)[B-](F)(F)F.[K+],COc1ccc(CCOC(=O)N2C[C@@H]3NO[C@]4(OC5(CCCCC5)O...,Nc1cc(Br)ccc1S,O=C(c1cccc(Cl)n1)[B-](F)(F)F.COc1ccc(CCOC(=O)N...,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,56114,...,0,0.0,0.0,0.0,0.006328,0.367872,0.926031,1.093633,0.0,no_product
2,2-Pyr003,Fused002,TerABT013,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3cccc(Cl)n3)[C...,O=C(c1cccc(Cl)n1)[B-](F)(F)F.[K+],COc1ccc(CCOC(=O)N2C[C@@H]3NO[C@]4(OC5(CCCCC5)O...,Nc1cc(C(F)(F)F)ccc1S,O=C(c1cccc(Cl)n1)[B-](F)(F)F.COc1ccc(CCOC(=O)N...,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,56106,...,1,0.0,0.0,0.0,0.014604,2.187072,1.010519,0.531508,0.0566,no_product
3,2-Pyr003,Fused002,TerABT014,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3cccc(Cl)n3)[C...,O=C(c1cccc(Cl)n1)[B-](F)(F)F.[K+],COc1ccc(CCOC(=O)N2C[C@@H]3NO[C@]4(OC5(CCCCC5)O...,Nc1ccc(Cl)cc1S,O=C(c1cccc(Cl)n1)[B-](F)(F)F.COc1ccc(CCOC(=O)N...,F[B-](F)(F)[C:1](=[O:2])[c:15]1[cH:16][cH:18][...,56112,...,0,0.028822,0.005032,0.0,0.016007,0.508862,0.989601,0.880251,0.0,A
4,2-Pyr003,Fused002,TerTH001,COc1ccc(CCOC(=O)N2C[C@H](NC(=O)c3cccc(Cl)n3)[C...,O=C(c1cccc(Cl)n1)[B-](F)(F)F.[K+],COc1ccc(CCOC(=O)N2C[C@@H]3NO[C@]4(OC5(CCCCC5)O...,[Cl-].[NH3+]NC(=S)c1ccccc1,O=C(c1cccc(Cl)n1)[B-](F)(F)F.COc1ccc(CCOC(=O)N...,F[B-](F)(F)[C:1](=[O:2])[c:11]1[cH:12][cH:14][...,56109,...,0,0.348936,0.642356,0.0,0.032561,0.619445,0.108977,0.433894,0.0,B


In [6]:
# Pearson correlation between the `binary_A` and `binary_C` columns
# (presumably 0/1 flags for whether products A and C were observed — confirm).
scipy.stats.pearsonr(
    x=df["binary_A"],
    y=df["binary_C"],
)

PearsonRResult(statistic=0.21904446416298023, pvalue=0.0)

In [7]:
# Pearson correlation between the `binary_A` and `binary_B` columns
# (presumably 0/1 flags for whether products A and B were observed — confirm).
scipy.stats.pearsonr(
    x=df["binary_A"],
    y=df["binary_B"],
)

PearsonRResult(statistic=0.4155878778420521, pvalue=0.0)

In [8]:
# Pearson correlation between the `binary_B` and `binary_C` columns
# (presumably 0/1 flags for whether products B and C were observed — confirm).
scipy.stats.pearsonr(
    x=df["binary_B"],
    y=df["binary_C"],
)

PearsonRResult(statistic=0.36329029094728055, pvalue=0.0)

In [9]:
# Pearson correlation between the continuous `scaled_A` and `scaled_C` columns
# (presumably normalized response values for products A and C — confirm).
scipy.stats.pearsonr(
    x=df["scaled_A"],
    y=df["scaled_C"],
)

PearsonRResult(statistic=0.07355059390742487, pvalue=1.6881349080452412e-48)

In [10]:
# Pearson correlation between the continuous `scaled_A` and `scaled_B` columns
# (presumably normalized response values for products A and B — confirm).
scipy.stats.pearsonr(
    x=df["scaled_A"],
    y=df["scaled_B"],
)

PearsonRResult(statistic=0.39808509674269943, pvalue=0.0)

In [11]:
# Pearson correlation between the continuous `scaled_B` and `scaled_C` columns
# (presumably normalized response values for products B and C — confirm).
scipy.stats.pearsonr(
    x=df["scaled_B"],
    y=df["scaled_C"],
)

PearsonRResult(statistic=0.09103469513699995, pvalue=1.9634499121257127e-73)

## Conclusion
It seems that the formation of A and the formation of B are moderately correlated (r ≈ 0.42 for the binary outcomes, r ≈ 0.40 for the scaled values), which makes some sense as B is an intermediate en route to A.
A and C are only weakly correlated (r ≈ 0.22 for the binary outcomes, r ≈ 0.07 for the scaled values). This is probably the result of two opposing tendencies: formation of A depletes the mutual intermediate B, leading to a negative correlation, while the formation of both A and C is confounded by the formation of B, leading to a positive correlation.