# Analysis of recommendations

See the `summary-notebook` notebook for how the targets are generated. This notebook uses RDKit to run a few simple analyses of the kinds of molecules being recommended.

In [1]:
import re

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

from umda.utils import paths

In [2]:
# Load the recommended target SMILES from the `processed` directory.
with open(paths.get("processed").joinpath("targets.smi"), "r") as read_file:
    stripped_lines = (line.strip() for line in read_file)
    # Skip blank lines (e.g. a trailing newline at the end of the file) so that
    # an empty string is never treated as a SMILES entry further down.
    smiles = [smi for smi in stripped_lines if smi]

In [3]:
# Parse every SMILES string into an RDKit molecule, keeping the same order as `smiles`.
molecules = [Chem.MolFromSmiles(smi, sanitize=True) for smi in smiles]

# MolFromSmiles signals a parse failure by returning None instead of raising.
# Fail here with the offending strings rather than letting a later cell crash
# with an opaque AttributeError on a None entry.
unparsed_smiles = [smi for smi, mol in zip(smiles, molecules) if mol is None]
if unparsed_smiles:
    raise ValueError(
        f"RDKit could not parse {len(unparsed_smiles)} SMILES string(s); "
        f"first few: {unparsed_smiles[:5]}"
    )

In [4]:
# Refresh each molecule's property cache in non-strict mode. The call is made
# purely for its side effect, so a plain loop is used; this also avoids binding
# `_`, which IPython reserves for the most recent cell output.
for mol in molecules:
    mol.UpdatePropertyCache(strict=False)

In [5]:
# SMARTS queries for the structural motifs tallied in this notebook; the
# targets are unpacked in the same order as the pattern strings below.
cyanide, alkene, acetylenic, aromatic_c, sp2_nitrogen = map(
    Chem.MolFromSmarts,
    (
        "C#N",    # carbon triple-bonded to nitrogen
        "C=C",    # carbon-carbon double bond
        "C#C",    # carbon-carbon triple bond
        "c",      # aromatic carbon atom
        "[N^2]",  # sp2 nitrogen
    ),
)

In [6]:
def describe(mol, *smarts):
    """Build one row of descriptors for a molecule.

    Parameters
    ----------
    mol
        Molecule object; it is passed to the RDKit descriptor functions and
        must provide ``HasSubstructMatch``.
    *smarts
        Query patterns, each tested against ``mol`` in the order given.

    Returns
    -------
    list
        ``[MolWt, NumHeteroatoms, match_1, match_2, ...]`` where each
        ``match_i`` is the result of ``mol.HasSubstructMatch`` for the
        i-th pattern.
    """
    row = [Descriptors.MolWt(mol), Descriptors.NumHeteroatoms(mol)]
    row.extend(mol.HasSubstructMatch(pattern) for pattern in smarts)
    return row

In [7]:
# One descriptor row per molecule; the pattern order here fixes the order of
# the substructure flags in each row.
data = [
    describe(molecule, cyanide, alkene, acetylenic, aromatic_c, sp2_nitrogen)
    for molecule in molecules
]

In [8]:
# Column labels follow the layout of the rows in `data`: the two numeric
# descriptors first, then one flag per SMARTS pattern.
df = pd.DataFrame(
    data,
    columns=[
        "MW",
        "Heteroatoms",
        "Cyanide",
        "Alkene",
        "Acetylenic",
        "Aromatic Carbon",
        "sp2 Nitrogen",
    ],
)

In [9]:
# Show the per-molecule descriptor table built above.
df

Unnamed: 0,MW,Heteroatoms,Cyanide,Alkene,Acetylenic,Aromatic Carbon,sp2 Nitrogen
0,102.136,0,False,False,True,False,False
1,102.136,0,False,False,True,False,False
2,102.136,0,False,False,True,False,False
3,104.108,1,False,False,True,False,False
4,116.119,1,False,False,True,False,False
...,...,...,...,...,...,...,...
1577,84.118,1,False,False,True,False,False
1578,101.174,2,True,False,False,False,False
1579,85.106,2,True,False,False,False,False
1580,71.079,2,True,False,False,False,False


In [10]:
# "Saturated" here means none of the alkene, acetylenic or aromatic-carbon
# flags is set; other kinds of unsaturation are not considered by this filter.
saturated = df.loc[~(df["Alkene"] | df["Acetylenic"] | df["Aromatic Carbon"])]

In [11]:
# Number of molecules carrying at least one of the three unsaturation flags.
df.shape[0] - saturated.shape[0]

1231

In [12]:
# Rows whose molecule matched the sp2-nitrogen pattern; the boolean column is
# used directly as the row mask.
df.loc[df["sp2 Nitrogen"]]

Unnamed: 0,MW,Heteroatoms,Cyanide,Alkene,Acetylenic,Aromatic Carbon,sp2 Nitrogen
52,117.107,2,False,False,True,False,True
74,121.139,2,False,False,True,False,True
96,101.108,1,False,True,True,False,True
116,93.129,1,False,False,True,False,True
151,93.085,2,False,False,True,False,True
...,...,...,...,...,...,...,...
1533,159.188,2,False,True,False,True,True
1535,131.178,1,False,False,False,True,True
1537,159.232,1,False,True,False,True,True
1540,145.205,1,False,True,False,True,True


In [13]:
# Number of molecules without any aromatic carbon atom.
(~df["Aromatic Carbon"]).sum()

1476

In [14]:
# Tokenizer for the atoms of a SMILES string. A bracket atom such as `[SiH3]`
# or `[13CH4]` contributes the symbol that follows any isotope digits; the rest
# of the bracket (H count, charge, chirality, atom class) is consumed so it is
# never mistaken for another atom. Outside brackets only the organic-subset
# symbols are legal, with the two-letter halogens tried first so that `Cl` is
# never read as carbon.
_SMILES_ATOM = re.compile(
    r"\[\d*(?P<bracket>[A-Z][a-z]?|[a-z]{1,2})[^\]]*\]"
    r"|(?P<organic>Cl|Br|[BCNOPSFI]|[bcnops])"
)


def non_tmc1_element(smi: str, *elements) -> bool:
    """Report whether a SMILES string contains any of the given elements.

    The string is split into atom tokens and each token is compared to the
    requested symbols as a whole, so ``"P"`` does not match ``[Pt]``, ``"C"``
    does not match ``Cl`` and ``"S"`` does not match ``[SiH4]`` — all of which
    a plain substring test would report as hits.

    Parameters
    ----------
    smi
        SMILES string to inspect.
    *elements
        Element symbols to look for, spelled as they appear in SMILES.
        Matching is case-sensitive: aromatic atoms are written in lower case
        (``"c"``, ``"n"``) and must be requested in that form.

    Returns
    -------
    bool
        ``True`` if at least one atom token equals one of ``elements``;
        ``False`` otherwise, including when ``smi`` is empty or no elements
        are given.
    """
    wanted = set(elements)
    return any(
        (atom.group("bracket") or atom.group("organic")) in wanted
        for atom in _SMILES_ATOM.finditer(smi)
    )

In [15]:
# Flag every recommendation whose SMILES contains chlorine, phosphorus or
# silicon; values are assigned positionally, in the order of `smiles`.
df["New element"] = [
    non_tmc1_element(smiles_string, "Cl", "P", "Si")
    for smiles_string in smiles
]

In [16]:
# Count the recommendations flagged in the "New element" column.
df["New element"].sum()

78