In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import AllChem, DataStructs, Descriptors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import os
import random

In [3]:
# Notebook-wide settings, read by the seeding call and the fingerprint helper.
CFG = dict(
    NBITS=2048,  # length of the Morgan fingerprint bit vector
    SEED=42,     # seed handed to seed_everything()
)

In [4]:
def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's legacy global RNG.

    Also exports ``seed`` as the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so the
    export presumably only influences processes launched afterwards -- confirm
    whether that is the intent.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

# Fix all random number generators once, using the configured seed.
seed_everything(seed=CFG['SEED'])

In [5]:
# Convert a SMILES string into a molecular fingerprint.
def smiles_to_fingerprint(smiles):
    """Return the Morgan fingerprint (radius 2) of ``smiles`` as a NumPy array.

    Args:
        smiles: SMILES string describing one molecule.

    Returns:
        A 1-D array of length ``CFG['NBITS']``. When RDKit cannot parse the
        SMILES, an all-zero array of the same length is returned instead.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # Unparsable input: fall back to an all-zero vector of the same length.
        return np.zeros((CFG['NBITS'],))
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(
        molecule, 2, nBits=CFG['NBITS']
    )
    return np.array(bit_vector)

In [6]:
def IC50_to_pIC50(ic50_nM):
    """Convert IC50 values given in nM to pIC50 (``9 - log10(IC50_nM)``).

    Inputs below ``1e-10`` (including zero and negative values) are raised to
    ``1e-10`` before the logarithm, so they map to a pIC50 of 19 instead of
    producing ``inf`` or ``nan``.

    Args:
        ic50_nM: Scalar or array-like of IC50 values in nanomolar.

    Returns:
        pIC50 value(s), with the same shape as the input.
    """
    floored = np.clip(ic50_nM, 1e-10, None)
    log_concentration = np.log10(floored)
    return 9 - log_concentration

In [None]:
def pIC50_to_IC50(pIC50):
    """Convert pIC50 back to an IC50 in nM (``10 ** (9 - pIC50)``).

    Args:
        pIC50: Scalar or array of pIC50 values.

    Returns:
        IC50 value(s) in nanomolar.
    """
    exponent = 9 - pIC50
    return 10 ** exponent

In [7]:
# Raw ChEMBL export; the file is semicolon-delimited.
chembl = pd.read_csv(
    "data/ChEMBL_ASK1(IC50).csv",
    sep=';',
)
# Raw PubChem export; low_memory=False makes pandas parse the file in one pass.
pubchem = pd.read_csv(
    "data/Pubchem_ASK1.csv",
    low_memory=False,
)

In [9]:
# Header names may carry surrounding whitespace and stray double quotes.
chembl.columns = chembl.columns.str.strip().str.replace('"', '')

# Keep IC50 rows only, reduce to (smiles, ic50_nM), drop incomplete rows, then
# coerce the value to numeric (unparsable entries become NaN) and derive pIC50.
# NOTE(review): assumes every 'Standard Value' is in nM and ignores any
# relation qualifier ('>' / '<') -- confirm against the export's other columns.
is_ic50_row = chembl['Standard Type'] == 'IC50'
chembl = (
    chembl.loc[is_ic50_row, ['Smiles', 'Standard Value']]
    .rename(columns={'Smiles': 'smiles', 'Standard Value': 'ic50_nM'})
    .dropna()
    .assign(ic50_nM=lambda frame: pd.to_numeric(frame['ic50_nM'], errors='coerce'))
    .assign(pIC50=lambda frame: IC50_to_pIC50(frame['ic50_nM']))
)

In [10]:
# Reduce the PubChem table to (smiles, ic50_nM), drop incomplete rows, coerce
# the value to numeric (unparsable entries become NaN) and derive pIC50.
# NOTE(review): 'Activity_Value' is treated as an IC50 in nM without checking
# its unit or activity type -- PubChem exports often report uM; confirm before
# trusting the resulting pIC50 values.
pubchem = (
    pubchem.loc[:, ['SMILES', 'Activity_Value']]
    .rename(columns={'SMILES': 'smiles', 'Activity_Value': 'ic50_nM'})
    .dropna()
    .assign(ic50_nM=lambda frame: pd.to_numeric(frame['ic50_nM'], errors='coerce'))
    .assign(pIC50=lambda frame: IC50_to_pIC50(frame['ic50_nM']))
)

In [12]:
# Merge both sources into a single table with a fresh 0..n-1 index.
total = pd.concat([chembl, pubchem], ignore_index=True)

# Discard unusable measurements (NaN or non-positive IC50, any missing field)
# BEFORE deduplicating. drop_duplicates keeps the first row per SMILES, so
# deduplicating first would throw away a molecule entirely whenever its first
# row happened to be invalid, even if a later duplicate carried a valid value.
total = total[total['ic50_nM'] > 0].dropna()
total = total.drop_duplicates(subset='smiles')

In [None]:
def calculate_rdkit_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` for one molecule.

    Args:
        smiles: SMILES string describing one molecule.

    Returns:
        A 1-D array with one entry per descriptor in ``Descriptors._descList``
        (in list order). When RDKit cannot parse the SMILES, an all-NaN array
        of the same length is returned so rows can still be stacked downstream.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Same length as a real descriptor vector; NaN marks "no molecule".
        return np.full((len(Descriptors._descList),), np.nan)
    descriptors = [desc_func(mol) for _, desc_func in Descriptors._descList]
    return np.array(descriptors)

In [31]:
# Recompute the regression target from the cleaned IC50 values.
total['pIC50'] = IC50_to_pIC50(total['ic50_nM'])
print("\n--- Feature Engineering ---")
total['fingerprint'] = total['smiles'].apply(smiles_to_fingerprint)
total['descriptors'] = total['smiles'].apply(calculate_rdkit_descriptors)

# Drop molecules RDKit could not parse. DataFrame.dropna() cannot do this:
# every cell in these columns is an ndarray object, which pandas never treats
# as missing. calculate_rdkit_descriptors() returns an all-NaN vector for an
# unparsable SMILES, so that is used as the marker instead.
parsed_ok = np.array(
    [not np.isnan(np.asarray(d, dtype=float)).all() for d in total['descriptors']],
    dtype=bool,
)
total = total[parsed_ok].copy()

desc_stack = np.stack(total['descriptors'].values).astype(float)
# Treat +/-inf as missing; left in place they would distort the scaler's
# mean and variance.
desc_stack[~np.isfinite(desc_stack)] = np.nan
# Impute every missing entry with its column mean over the training set.
desc_mean = np.nanmean(desc_stack, axis=0)
desc_stack = np.where(np.isnan(desc_stack), desc_mean, desc_stack)

# The scaler is fitted on training descriptors only; the test cell reuses it.
scaler = StandardScaler()
desc_scaled = scaler.fit_transform(desc_stack)
fp_stack = np.stack(total['fingerprint'].values)
# Final feature matrix: fingerprint bits followed by scaled descriptors.
X_train = np.hstack([fp_stack, desc_scaled])
y_train = total['pIC50'].values
assert X_train.shape[0] == y_train.shape[0], "feature/target row mismatch"


--- Feature Engineering ---


In [27]:
# Molecules to predict; later cells read the 'Smiles' column of this frame.
test = pd.read_csv("data/test.csv")

In [33]:
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)
test['descriptors'] = test['Smiles'].apply(calculate_rdkit_descriptors)
# No dropna() here: every cell in these columns is an ndarray object, which
# pandas never treats as missing, so the call removed nothing. All test rows
# are kept; an unparsable SMILES yields a zero fingerprint and imputed
# descriptors.

desc_stack = np.stack(test['descriptors'].values).astype(float)
# Treat +/-inf as missing rather than letting it through as a huge number.
desc_stack[~np.isfinite(desc_stack)] = np.nan
# Impute with the TRAINING column means stored in the fitted scaler, not with
# statistics of the test set, so train and test share one preprocessing.
desc_mean = scaler.mean_
desc_stack = np.where(np.isnan(desc_stack), desc_mean, desc_stack)

# Reuse the scaler fitted on the training descriptors (transform only).
desc_scaled = scaler.transform(desc_stack)
fp_stack = np.stack(test['Fingerprint'].values)
# Same column layout as X_train: fingerprint bits, then scaled descriptors.
X_test = np.hstack([fp_stack, desc_scaled])

In [None]:
# Persist the assembled matrices as .npy files in the working directory.
np.save(file="X_train.npy", arr=X_train)
np.save(file="y_train.npy", arr=y_train)
np.save(file="X_test.npy", arr=X_test)