<a href="https://colab.research.google.com/github/jarammm/moledule/blob/main/dnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [None]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
CFG = {
    'NBITS':2048,
    'SEED':42,
}

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # Seed 고정

In [None]:
# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))

In [None]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

In [None]:
def normalized_rmse(y_true, y_pred):
    y_true_ic50 = pIC50_to_IC50(y_true)
    y_pred_ic50 = pIC50_to_IC50(y_pred)

    rmse = torch.sqrt(torch.mean((y_true_ic50 - y_pred_ic50) ** 2))
    range_ic50 = torch.max(y_true_ic50) - torch.min(y_true_ic50)
    return rmse / range_ic50

def correct_ratio(y_true, y_pred):
    absolute_error = torch.abs(y_true - y_pred)
    correct_predictions = torch.less_equal(absolute_error, 0.5)
    return torch.mean(correct_predictions.float())

def custom_score(y_true, y_pred):
    A = normalized_rmse(y_true, y_pred)
    B = correct_ratio(y_true, y_pred)
    total_score = 0.5 * (1 - min(A.item(), 1.0)) + 0.5 * B.item()
    return A, B, total_score

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class SimpleDNN(nn.Module):
    def __init__(self, input_dim, embed_dim):
        super(SimpleDNN, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.fc1 = nn.Linear(embed_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.output = nn.Linear(16, 1)

    def forward(self, x):
        # 원-핫 인코딩 벡터를 임베딩 벡터로 변환
        x = self.embedding(x)
        x = x.sum(dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.output(x)
        return x

class StackedDNN(nn.Module):
    def __init__(self, input_dim, embed_dim):
        super(SimpleDNN, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.encode = nn.Sequential(
            nn.Linear(embed_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU()
        )
        self.decode = nn.Sequential(
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, embed_dim),
            nn.ReLU()
        )
        self.fc1 = nn.Linear(embed_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.output = nn.Linear(16, 1)

    def forward(self, x):
        x = F.relu(self.embedding(x))
        x = x.sum(dim=1)
        x = self.encode(x)
        x = self.decode(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.output(x)
        return x

In [None]:
# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('train.csv')  # 예시 파일 이름
chembl_data.head()

Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles
0,CHEMBL4443947,IC50,'=',0.022,nM,10.66,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.022,10.66,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...
1,CHEMBL4556091,IC50,'=',0.026,nM,10.59,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.026,10.59,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
2,CHEMBL4566431,IC50,'=',0.078,nM,10.11,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.078,10.11,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
3,CHEMBL4545898,IC50,'=',0.081,nM,10.09,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.081,10.09,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
4,CHEMBL4448950,IC50,'=',0.099,nM,10.0,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.099,10.0,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...


In [None]:
train = chembl_data[['Smiles', 'pIC50']]
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)


In [None]:
train_x = torch.tensor(train_x, dtype=torch.int32)
val_x = torch.tensor(val_x, dtype=torch.int32)
train_y = torch.tensor(train_y, dtype=torch.float32).view(-1, 1)
val_y = torch.tensor(val_y, dtype=torch.float32).view(-1, 1)

In [None]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# 데이터 준비
dataset = TensorDataset(train_x, train_y)  # 입력과 타겟이 동일
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)  # 배치 사이즈 64

# 모델 초기화
input_dim = 2048
embed_dim = 256
dnn = SimpleDNN(input_dim, embed_dim)
# dnn.apply(init_weights)
criterion = CustomLoss()
optimizer = optim.Adam(dnn.parameters(), lr=0.0001)

In [None]:
num_epochs = 100

for epoch in range(num_epochs):
    dnn.train()
    epoch_loss = 0

    for idx, data in enumerate(dataloader):
        input, gt = data
        pred = dnn(input)

        loss = criterion(pred, gt)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item() * input.size(0)  # 전체 배치에서의 손실 합산

    epoch_loss /= len(dataloader.dataset)  # 평균 손실 계산
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

Epoch [1/10], Loss: 155.8619
Epoch [2/10], Loss: 45.1087
Epoch [3/10], Loss: 11.3606
Epoch [4/10], Loss: 6.8162
Epoch [5/10], Loss: 3.5607
Epoch [6/10], Loss: 2.0855
Epoch [7/10], Loss: 1.8519
Epoch [8/10], Loss: 1.7907
Epoch [9/10], Loss: 1.7976
Epoch [10/10], Loss: 1.7728


In [None]:
val_y_pred = dnn(val_x)
A, B, total_score = custom_score(val_y, val_y_pred)
print(A, B, total_score)

tensor(0.0800, grad_fn=<DivBackward0>) tensor(0.3242) 0.6221237480640411


In [None]:
test = pd.read_csv('./test.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)
test_x = np.stack(test['Fingerprint'].values)
test_x = torch.tensor(test_x, dtype=torch.int32)
test_y_pred = dnn(test_x)



In [None]:
submit = pd.read_csv('./sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred.detach().numpy())
submit.head()

Unnamed: 0,ID,IC50_nM
0,TEST_000,14.85296
1,TEST_001,17.27364
2,TEST_002,23.365192
3,TEST_003,12.772462
4,TEST_004,18.166157


In [None]:
submit.to_csv('./custom_submit.csv', index=False)