In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors3D
import pandas as pd
from Hyprid import polFeature
from rdkit.Chem import Descriptors

In [2]:
def hybConn(mol):
    """Tally the bonds of ``mol`` by the hybridization labels of their two atoms.

    Every bond is classified by the pair of strings obtained from
    ``str(atom.GetHybridization())`` on its begin and end atoms; which atom is
    the begin atom and which is the end atom makes no difference. Pairs built
    from the labels SP3, SP2, SP and S each have their own counter, with the
    exception of S-S, which is counted as ``other`` together with every pair
    that contains any different label.

    Returns:
        A list of ten integers in the order
        [sp3sp3, sp3sp2, sp3sp, sp3s, sp2sp2, sp2sp, sp2s, spsp, sps, other].
    """
    # Position in the result list for each recognised unordered label pair.
    # A one-element frozenset stands for a bond between two identically
    # labelled atoms.
    slot_for_pair = {
        frozenset(("SP3",)): 0,
        frozenset(("SP3", "SP2")): 1,
        frozenset(("SP3", "SP")): 2,
        frozenset(("SP3", "S")): 3,
        frozenset(("SP2",)): 4,
        frozenset(("SP2", "SP")): 5,
        frozenset(("SP2", "S")): 6,
        frozenset(("SP",)): 7,
        frozenset(("SP", "S")): 8,
    }
    other_slot = 9

    tallies = [0] * 10
    for bond in mol.GetBonds():
        begin_atom = bond.GetBeginAtom()
        end_atom = bond.GetEndAtom()
        labels = frozenset((str(begin_atom.GetHybridization()),
                            str(end_atom.GetHybridization())))
        tallies[slot_for_pair.get(labels, other_slot)] += 1
    return tallies

In [3]:
def getMolDescriptors(mol):
    """Evaluate every descriptor in ``Descriptors._descList`` on ``mol``, except 'Ipc'.

    Args:
        mol: the molecule object handed unchanged to each descriptor function.

    Returns:
        A list with one entry per descriptor, in ``_descList`` order with the
        'Ipc' entry left out. When a descriptor function raises, its entry is
        ``None`` and the descriptor name is printed, so the list length never
        depends on which descriptors failed.
    """
    res = []
    for nm, fn in Descriptors._descList:
        if nm == 'Ipc':
            continue
        try:
            val = fn(mol)
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must still be able to stop a long featurisation run.
            val = None
            print(nm)
        res.append(val)
    return res

In [None]:
# Load and preprocess data
filename = "../Raw_Files/bindingdb_G12C.tsv"
# Keep only the rows that report an IC50 value. A dropna() without `subset`
# would throw away every row that has a NaN in *any* column; restricting it to
# 'IC50 (nM)' matches the filter used where this table is reloaded later in
# the notebook.
df = pd.read_csv(filename, sep='\t').dropna(subset=['IC50 (nM)'])

# Column names for the feature table. The last three entries of hybd_names
# (I1, I2, I3) therefore sit *before* the bond-pair columns in Cnames; any row
# written to DF has to follow exactly this order.
hybd_names = ['FC','N','H','Cl','Br',
                 'I','P','F','Se','Si',
                 'S_sp3','S_sp2','S_sp',
                 'C_sp3','C_sp2','C_sp',
                 'N_sp3','N_sp2','N_sp',
                 'O_sp3','O_sp2','O_sp',
                 'I1','I2','I3']
# Same order as the list returned by hybConn.
bondhybd_names = ['sp3sp3','sp3sp2','sp3sp','sp3s',
                  'sp2sp2','sp2sp','sp2s','spsp','sps','other']
# Descriptor names in Descriptors._descList order, leaving out 'Ipc' the same
# way getMolDescriptors does, so names and values stay aligned.
dcr_names = [nm for nm, fn in Descriptors._descList if nm != 'Ipc']
Cnames = ["ChEMBL ID","Smiles"]+hybd_names+bondhybd_names+dcr_names
print("Cnames", len(Cnames))
# Empty frame that fixes the column layout of the feature table.
DF = pd.DataFrame(columns=Cnames)

In [None]:
# Read the BindingDB export again and keep just the rows with an IC50 value.
filename = "../Raw_Files/bindingdb_G12C.tsv"
df = pd.read_csv(filename, sep='\t').dropna(subset=['IC50 (nM)'])
df

In [None]:
# Featurise every ligand in df: parse the SMILES, embed one 3D conformer,
# compute the descriptor blocks and write the resulting table to CSV.
# Note: the "ChEMBL ID" column is filled from "BindingDB Reactant_set_id".

# Fixed seed so the embedded conformers, and with them the PMI values, are the
# same on every run.
EMBED_SEED = 42
embed_params = AllChem.ETKDG()
embed_params.randomSeed = EMBED_SEED

row_labels = []   # position of each kept molecule within df, used as DF index
rows = []         # one list of values per kept molecule, in Cnames order
for i, (chembl_id, smiles) in enumerate(zip(df["BindingDB Reactant_set_id"], df["Ligand SMILES"])):
    try:
        # Progress line once every 100 molecules (i = 99, 199, ...).
        if (i + 1) % 100 == 0:
            print("Processing molecule number", i)

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print(f"Skipping {chembl_id}: Invalid SMILES '{smiles}'")
            continue

        mol = Chem.AddHs(mol)  # Add hydrogens
        status = AllChem.EmbedMolecule(mol, embed_params)
        if status != 0:
            print(f"Skipping {chembl_id}: 3D conformer generation failed")
            continue

        t = getMolDescriptors(mol)
        hyp = polFeature(mol)
        bhyp = hybConn(mol)
        pmi = [Descriptors3D.PMI1(mol), Descriptors3D.PMI2(mol), Descriptors3D.PMI3(mol)]

        # Values must follow the column order of Cnames, where I1..I3 come
        # right after the polFeature columns and *before* the bond-pair
        # columns. Appending the bond counts first would store them under the
        # I1..I3 headers and shift every following value.
        # NOTE(review): assumes polFeature returns exactly the values named in
        # hybd_names ahead of I1 — confirm against Hyprid.polFeature.
        row = [chembl_id, smiles, *hyp, *pmi, *bhyp, *t]
        if len(row) != len(Cnames):
            raise ValueError(
                f"feature row has {len(row)} values but {len(Cnames)} columns are defined"
            )

        row_labels.append(i)
        rows.append(row)
        print(f"Processed {chembl_id} successfully")

    except Exception as e:
        # Best effort: one bad ligand is reported and skipped, the run goes on.
        print(f"Error processing {chembl_id} ({smiles}): {repr(e)}")

# Build the table in one go rather than enlarging it one row at a time.
DF = pd.DataFrame(rows, index=row_labels, columns=Cnames)

# Save to CSV
output_filename = "g12c_Hyb_Features.csv"
DF.to_csv(output_filename, index=False)
print(f"Saved output to {output_filename}")

In [None]:
import pandas as pd

# Attach the measured IC50 to every feature row and write the training table.

# Load the files
df_ic50 = pd.read_csv("../Raw_Files/bindingdb_G12C.tsv", sep="\t")  # Read TSV file
df_features = pd.read_csv("g12c_Hyb_Features.csv")

# Left-merge on the respective ID columns so every feature row is kept, then
# drop the right-hand ID column, which only repeats "ChEMBL ID".
# NOTE(review): if "BindingDB Reactant_set_id" is not unique in the TSV, the
# merge repeats feature rows — verify against the raw file.
merged_df = (
    df_features
    .merge(df_ic50[['BindingDB Reactant_set_id', 'IC50 (nM)']],
           left_on="ChEMBL ID",
           right_on="BindingDB Reactant_set_id",
           how="left")
    .drop(columns=["BindingDB Reactant_set_id"])
)

# Save the merged file
training_filename = "G12C_training.csv"
merged_df.to_csv(training_filename, index=False)

# Report the path that was actually written.
print(f"Merge completed. File saved as {training_filename}")