In [None]:
!pip install -q condacolab # Applying the conda install always crashes and terminates the runtime, so the cells below must be run after the runtime has terminated
import condacolab
condacolab.install()

✨🍰✨ Everything looks OK!


In [None]:
!mamba install -y -c conda-forge rdkit # rdkit has to be installed from within conda so that the full rdkit package can be downloaded


Looking for: ['rdkit']

[?25l[2K[0G[+] 0.0s
conda-forge/linux-64  ⣾  [2K[1A[2K[0G[+] 0.1s
conda-forge/linux-64   1%
conda-forge/noarch     1%[2K[1A[2K[1A[2K[0G[+] 0.2s
conda-forge/linux-64   9%
conda-forge/noarch    19%[2K[1A[2K[1A[2K[0G[+] 0.3s
conda-forge/linux-64  17%
conda-forge/noarch    37%[2K[1A[2K[1A[2K[0G[+] 0.4s
conda-forge/linux-64  23%
conda-forge/noarch    51%[2K[1A[2K[1A[2K[0G[+] 0.5s
conda-forge/linux-64  29%
conda-forge/noarch    62%[2K[1A[2K[1A[2K[0G[+] 0.6s
conda-forge/linux-64  31%
conda-forge/noarch    66%[2K[1A[2K[1A[2K[0G[+] 0.7s
conda-forge/linux-64  36%
conda-forge/noarch    78%[2K[1A[2K[1A[2K[0G[+] 0.8s
conda-forge/linux-64  42%
conda-forge/noarch    90%[2K[1A[2K[1A[2K[0Gconda-forge/noarch                                
[+] 0.9s
conda-forge/linux-64  47%[2K[1A[2K[0G[+] 1.0s
conda-forge/linux-64  55%[2K[1A[2K[0G[+] 1.1s
conda-forge/linux-64  62%[2K[1A[2K[0G[+] 1.2s
conda-forge/linux-64  70%[2K

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, AllChem
from rdkit.Chem import rdMolDescriptors

# Load the candidate table from disk.
df = pd.read_csv('candidates_00_1.csv')

# The trailing 22 columns are taken as the motif columns; each one is coerced
# to a numeric dtype, and anything unparseable or missing becomes 0.
motif_cols = list(df.columns[-22:])
for motif_name in motif_cols:
    numeric_values = pd.to_numeric(df[motif_name], errors='coerce')
    df[motif_name] = numeric_values.fillna(0)


def calc_dipole(m):
    """Return the magnitude of a charge-weighted position sum for a molecule.

    Explicit hydrogens are added, one conformer is embedded with a fixed
    random seed (42) and relaxed with UFF, Gasteiger partial charges are
    computed, and the vector ``sum(q_i * r_i)`` over all atoms is formed from
    those charges and the conformer coordinates. No unit conversion is applied
    to the result.

    Parameters
    ----------
    m : rdkit.Chem.Mol
        Molecule to process; it is not modified (a hydrogen-added copy is used).

    Returns
    -------
    float
        Euclidean norm of the charge-weighted position sum, or ``0.0`` when
        embedding, optimisation or charge assignment fails, or when the
        result is not a finite number.
    """
    mH = Chem.AddHs(m)
    try:
        # EmbedMolecule reports failure through a negative conformer id
        # instead of raising, so check it explicitly.
        if AllChem.EmbedMolecule(mH, randomSeed=42) < 0:
            return 0.0
        AllChem.UFFOptimizeMolecule(mH)
        AllChem.ComputeGasteigerCharges(mH)
        conf = mH.GetConformer()
        dip = np.zeros(3)
        for atom in mH.GetAtoms():
            q = float(atom.GetProp('_GasteigerCharge'))
            pos = np.array(conf.GetAtomPosition(atom.GetIdx()))
            dip += q * pos
        magnitude = float(np.linalg.norm(dip))
    except Exception:
        # Best-effort feature: any RDKit failure maps to the 0.0 sentinel.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        return 0.0
    # A NaN/inf partial charge would otherwise leak a non-finite feature value;
    # treat it like any other failure.
    return magnitude if np.isfinite(magnitude) else 0.0

def featurize_env(smiles):
    """Compute a fixed set of RDKit descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string to parse. Non-string values (for example the float NaN
        that pandas uses for an empty CSV cell) are treated as unparseable.

    Returns
    -------
    pandas.Series
        Indexed by 'LogP', 'TPSA', 'MolWt', 'HBD', 'HBA', 'RotBonds',
        'AromRings', 'FracCSP3', 'LabuteASA', 'Dipole', 'MQN1', 'MQN2'.
        Every entry is 0.0 when the SMILES cannot be parsed.
    """
    feature_names = [
        'LogP', 'TPSA', 'MolWt', 'HBD', 'HBA', 'RotBonds', 'AromRings',
        'FracCSP3', 'LabuteASA', 'Dipole', 'MQN1', 'MQN2',
    ]

    # Only strings are handed to RDKit; anything else gets the same all-zero
    # row as a SMILES that fails to parse.
    m = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if m is None:
        return pd.Series({name: 0.0 for name in feature_names})

    logp = Crippen.MolLogP(m)
    tpsa = Descriptors.TPSA(m)
    # NOTE(review): the 'MolWt' feature is filled from ExactMolWt, not
    # Descriptors.MolWt -- confirm this is intended.
    mw = Descriptors.ExactMolWt(m)
    hbd = Descriptors.NumHDonors(m)
    hba = Descriptors.NumHAcceptors(m)
    rb = Descriptors.NumRotatableBonds(m)
    ar = Descriptors.NumAromaticRings(m)
    fsp3 = Descriptors.FractionCSP3(m)
    asa = Descriptors.LabuteASA(m)
    dip = calc_dipole(m)
    mqns = rdMolDescriptors.MQNs_(m)
    # The parentheses matter: without them the conditional applies only to the
    # second element, so the fallback would assign the tuple (0, 0) to mqn2
    # and still index mqns[0].
    mqn1, mqn2 = (mqns[0], mqns[1]) if len(mqns) > 1 else (0, 0)

    values = [logp, tpsa, mw, hbd, hba, rb, ar, fsp3, asa, dip, mqn1, mqn2]
    return pd.Series(dict(zip(feature_names, values)))

# Compute the descriptor row for every SMILES string and append the result to
# the candidate table as new columns.
env_df = df['Canonical_Smiles'].apply(featurize_env)
df = pd.concat([df, env_df], axis=1)

# pKa value assigned to each motif column that takes part in the pKa estimate.
pKa_map = {'imidazole':6.9,'1,2,4-triazole':2.5,'pyridine':5.2}

present_pka_motifs = [m for m in pKa_map.keys() if m in df.columns]
if present_pka_motifs:
    # pKa_pred is the mean pKa of the motifs flagged with the value 1 in a row,
    # or 7.4 when no motif is flagged. Computed column-wise instead of with a
    # row-wise apply.
    # NOTE(review): the preview of this table shows fractional motif values
    # (e.g. 0.7, 0.3), so an exact `== 1` match may never fire -- confirm the
    # intended presence test.
    motif_flags = df[present_pka_motifs].eq(1)
    motif_pka = pd.Series(pKa_map)[present_pka_motifs]
    n_flagged = motif_flags.sum(axis=1)
    pka_mean = motif_flags.astype(float).mul(motif_pka, axis=1).sum(axis=1) / n_flagged
    df['pKa_pred'] = pka_mean.where(n_flagged > 0, 7.4)
else:
    df['pKa_pred'] = 7.4
    print("Warning: None of the pKa related motif columns (imidazole, 1,2,4-triazole, pyridine) found. pKa_pred set to default 7.4.")

if 'pKa_pred' in df.columns and 'LogP' in df.columns:
    # NOTE(review): 10**(pH - pKa) is the ionisation correction for an acid;
    # the motifs in pKa_map look like basic heterocycles, for which the
    # exponent would presumably be (pKa - pH) -- confirm before relying on it.
    df['LogD_7.4'] = df['LogP'] - np.log10(1 + 10**(7.4 - df['pKa_pred']))
else:
    df['LogD_7.4'] = np.nan
    print("Warning: 'pKa_pred' or 'LogP' column not found. 'LogD_7.4' cannot be calculated.")


env_cols = ['LogP','TPSA','MolWt','HBD','HBA','RotBonds','AromRings','FracCSP3',
            'LabuteASA','Dipole','MQN1','MQN2',
            'pKa_pred','LogD_7.4']
present_env_cols = [col for col in env_cols if col in df.columns]

present_motif_cols = [col for col in motif_cols if col in df.columns]
if len(present_motif_cols) != len(motif_cols):
    print(f"Warning: Some initial motif columns were not found in the DataFrame and will be skipped for interaction features: {[col for col in motif_cols if col not in df.columns]}")


# Build every motif x descriptor product first and attach them in one concat.
# Inserting the columns one at a time fragments the frame and makes pandas
# emit a PerformanceWarning for each insertion.
interaction_cols = {
    f'{motif}_x_{env}': df[motif] * df[env]
    for motif in present_motif_cols
    for env in present_env_cols
}
if interaction_cols:
    # Drop any same-named columns first so a product replaces an existing
    # column (as plain assignment would) instead of duplicating it.
    clashing = [name for name in interaction_cols if name in df.columns]
    if clashing:
        df = df.drop(columns=clashing)
    df = pd.concat([df, pd.DataFrame(interaction_cols, index=df.index)], axis=1)


display(df.head())
df.to_csv('candidates_full_env.csv', index=False)

  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif}_x_{env}'] = df[motif] * df[env]
  df[f'{motif

Unnamed: 0,ID,Inhibition,Canonical_Smiles,pyridine,benzene,furan,pyrrole,tertiary_amine,caffeic_acid,hydroxyethylene,...,"1,2,4-triazole_x_HBA","1,2,4-triazole_x_RotBonds","1,2,4-triazole_x_AromRings","1,2,4-triazole_x_FracCSP3","1,2,4-triazole_x_LabuteASA","1,2,4-triazole_x_Dipole","1,2,4-triazole_x_MQN1","1,2,4-triazole_x_MQN2","1,2,4-triazole_x_pKa_pred","1,2,4-triazole_x_LogD_7.4"
0,TRAIN_0000,12.5,Cl.OC1(Cc2cccc(Br)c2)CCNCC1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TRAIN_0001,4.45,Brc1ccc2OCCc3ccnc1c23,0.7,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TRAIN_0002,4.92,CC1(CO)CC(=NO1)c2cc(c(F)cc2Cl)[N+](=O)[O-],0.0,0.0,0.0,0.0,0.3,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TRAIN_0003,71.5,Fc1ccc2nc(Nc3cccc(COc4cccc(c4)C(=O)N5CCOCC5)c3...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TRAIN_0004,18.3,CC(C)CC(=O)C1=C(Nc2c(Cl)ccc(Cl)c2C1=O)S(=O)C,0.7,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


✅ candidates_full_env.csv 생성 완료
