In [137]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


**seed 고정**

seed 고정 이유:
재현성을 보장하기 위해/ 동일한 데이터와 동일한 코드로 항상 동일한 결과를 얻음. 

In [138]:
CFG = {
    'NBITS':2048,
    'SEED':42,

}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    return f"Seed {seed} has been set."
seed_everything(CFG['SEED'])

'Seed 42 has been set.'

In [139]:

# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))
    

In [140]:
# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('./open/train.csv')  # 예시 파일 이름
chembl_data.head()

train = chembl_data[['Smiles', 'pIC50']]
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)
train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)


In [141]:
# 학습 및 검증 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.25, random_state=42)

# 랜덤 포레스트 모델 학습
model = RandomForestRegressor( n_estimators=100, random_state=CFG['SEED'])
model.fit(train_x, train_y)

def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

In [142]:
# Validation 데이터로부터의 학습 모델 평가
val_y_pred = model.predict(val_x)

# IC50 단위로 변환된 값
val_y_ic50 = pIC50_to_IC50(val_y)
val_y_pred_ic50 = pIC50_to_IC50(val_y_pred)

mse = mean_squared_error(val_y_ic50, val_y_pred_ic50)
rmse = np.sqrt(mse)
mae = mean_absolute_error(val_y_ic50, val_y_pred_ic50)
r2 = r2_score(val_y_ic50, val_y_pred_ic50)

print(f'RMSE: {rmse}')
print(f'MSE: {mse}')
print(f'MAE: {mae}')
print(f'R² Score: {r2}')

#제출 할 파일 저장

test = pd.read_csv('./open/test.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

test_x = np.stack(test['Fingerprint'].values)

test_y_pred = model.predict(test_x)

submit = pd.read_csv('./open/sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.head()

submit.to_csv('./open/submit_file.csv', index=False)

RMSE: 1801.8774939383463


In [143]:
# IC50 단위로 변환된 값
val_y_ic50 = pIC50_to_IC50(val_y)
val_y_pred_ic50 = pIC50_to_IC50(val_y_pred)



# 결과 출력



RMSE: 1801.8774939383463
MSE: 3246762.5031615356
MAE: 527.0260580667718
R² Score: 0.1452898642943634
