In [3]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The data-loading cell further down reads its CSV files from
# '/content/drive/MyDrive/open/', so this mount has to succeed first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dataset Info.

# **train.csv [파일]**

ChEMBL IRAK4 IC50에 대한 학습용 데이터 1,952종

Smiles : 분자구조 데이터

IC50_nM : 예측 Target

pIC50 : IC50_nM과 같은 활성 정보를 로그 스케일로 나타낸 값 (pIC50 = 9 − log10(IC50_nM))


# test.csv [파일]

ID : 고유 ID

Smiles : 분자구조 데이터


# sample_submission.csv [파일] - 제출 양식
ID : 고유 ID

IC50_nM : 예측한 IC50 (nM단위)

In [20]:
!pip install rdkit-pypi
import pandas as pd
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [None]:
# Directory on the mounted Drive that holds the competition CSV files.
data_path = '/content/drive/MyDrive/open/'

# Read the three CSV files in one pass; the order of the file names matches
# the order of the variables they are unpacked into.
train, test, submission = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [18]:
# Print the column index of each loaded frame as a quick schema check.
print(f'train {train.columns}')
print(f'test {test.columns}')
print(f'submission {submission.columns}')

train Index(['Molecule ChEMBL ID', 'Standard Type', 'Standard Relation',
       'Standard Value', 'Standard Units', 'pChEMBL Value', 'Assay ChEMBL ID',
       'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type',
       'Document ChEMBL ID', 'IC50_nM', 'pIC50', 'Smiles'],
      dtype='object')
test Index(['ID', 'Smiles'], dtype='object')
submision Index(['ID', 'IC50_nM'], dtype='object')


In [29]:
# Display the (rows, columns) shape of the training and test frames.
tuple(frame.shape for frame in (train, test))

((1952, 15), (113, 2))

In [21]:
# Convert a SMILES string into a small set of molecular descriptors.
def smiles_to_features(smiles):
    """Compute six RDKit descriptors for one molecule.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    pandas.Series
        Values indexed by 'MolWt', 'MolLogP', 'NumRotatableBonds', 'TPSA',
        'NumHAcceptors' and 'NumHDonors', in that order.

    Raises
    ------
    TypeError
        If ``smiles`` is not a string (e.g. a missing value read from a CSV).
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    # Missing values read from a CSV arrive as float NaN; fail with a clear message.
    if not isinstance(smiles, str):
        raise TypeError(
            f'SMILES must be a string, got {type(smiles).__name__}: {smiles!r}'
        )

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None instead of raising;
    # without this check the descriptor calls below fail with an opaque error.
    if mol is None:
        raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')

    features = {
        'MolWt': Descriptors.MolWt(mol),
        'MolLogP': Descriptors.MolLogP(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
    }
    return pd.Series(features)

# Descriptor table for the training molecules, with the IC50_nM target
# attached as an extra column.
train_features = (
    train['Smiles']
    .apply(smiles_to_features)
    .assign(IC50_nM=train['IC50_nM'])
)

# Descriptor table for the test molecules (no target available).
test_features = test['Smiles'].apply(smiles_to_features)

# Separate the target from the predictors.
y = train_features['IC50_nM']
X = train_features.drop(columns=['IC50_nM'])

In [22]:
# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions on the held-out rows, scored in the next cell.
y_pred = model.predict(X_val)

In [23]:
# Validation score: root-mean-squared error on the held-out rows.
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
print(f'Validation RMSE: {rmse}')

# Predict IC50_nM for every test molecule.
test_predictions = model.predict(test_features)

# Build the submission table (ID + predicted IC50_nM) and write it next to the input data.
submission = test[['ID']].assign(IC50_nM=test_predictions)
submission.to_csv(data_path + 'submission.csv', index=False)
print('Submission saved successfully.')

Validation RMSE: 2486.5486282998827
Submission saved successfully.
