<a href="https://colab.research.google.com/github/kimjaehwankimjaehwan/Dacon/blob/main/Graph_Neural_Networks_(GNNs).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the CSV files
# this notebook reads live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install torch
!pip install torch-geometric
!pip install rdkit


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# 1. Convert SMILES strings into graph data
def smiles_to_graph(smiles):
    """Convert a SMILES string into a PyTorch Geometric ``Data`` graph.

    Every atom becomes a node whose single feature is its atomic number.
    Every bond becomes two directed edges (one per direction) whose single
    feature is the value of ``GetBondTypeAsDouble()``.

    Args:
        smiles: SMILES string describing one molecule. Non-string values
            (for example a NaN from a missing CSV cell) are treated as
            unparsable.

    Returns:
        A ``Data`` object with ``x`` of shape ``[num_atoms, 1]``,
        ``edge_index`` of shape ``[2, 2 * num_bonds]`` and ``edge_attr`` of
        shape ``[2 * num_bonds, 1]``; or ``None`` when the input is not a
        string, RDKit cannot parse it, or the parsed molecule has no atoms.
    """
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    # A molecule without atoms would produce a graph without nodes, which has
    # nothing to pool over, so it is rejected like an unparsable string.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Node features: the atomic number of each atom.
    node_features = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]

    # Edge list and edge features: each bond is stored in both directions.
    edge_pairs = []
    edge_features = []
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()
        order = bond.GetBondTypeAsDouble()
        edge_pairs += [(begin, end), (end, begin)]
        edge_features += [[order], [order]]

    x = torch.tensor(node_features, dtype=torch.float)
    # The explicit reshapes keep the tensors two-dimensional even when the
    # molecule has no bonds (e.g. a single atom): [2, 0] and [0, 1] rather
    # than the 1-D empty tensors torch.tensor([]) would give.
    edge_index = (
        torch.tensor(edge_pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()
    )
    edge_attr = torch.tensor(edge_features, dtype=torch.float).reshape(-1, 1)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

# 2. Load and preprocess the training data
# Read the CSV, build one graph per SMILES string, and discard the rows whose
# SMILES could not be converted (smiles_to_graph returned None for them).
chembl_data = (
    pd.read_csv('/content/drive/MyDrive/데이콘/train.csv')
    .assign(Graph=lambda frame: frame['Smiles'].apply(smiles_to_graph))
    .dropna(subset=['Graph'])
)

# Split into training (70%) and validation (30%) sets with a fixed seed.
train_data, val_data = train_test_split(chembl_data, test_size=0.3, random_state=42)

# Graph objects and their pIC50 targets, kept in the same row order.
train_dataset = train_data['Graph'].tolist()
train_y = torch.tensor(train_data['pIC50'].to_numpy(), dtype=torch.float)

val_dataset = val_data['Graph'].tolist()
val_y = torch.tensor(val_data['pIC50'].to_numpy(), dtype=torch.float)

# 3. Define the GNN model
class GNN(torch.nn.Module):
    """Graph-level regressor: two GCN layers, mean pooling, one linear head.

    Args:
        hidden_channels: Width of both graph-convolution layers.
        in_channels: Number of features per node. Defaults to 1, matching
            graphs whose only node feature is the atomic number.
    """

    def __init__(self, hidden_channels, in_channels=1):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = torch.nn.Linear(hidden_channels, 1)

    def forward(self, data):
        """Return the linear head's output for the pooled graph embeddings.

        Args:
            data: Object exposing ``x``, ``edge_index`` and ``batch``.
                Edge attributes are not used by this model.

        Returns:
            Tensor whose last dimension is 1 (one row per pooled graph).
        """
        x, edge_index = data.x, data.edge_index

        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)

        # Graph-level pooling: average node embeddings within each graph.
        x = global_mean_pool(x, data.batch)

        return self.lin(x)

# 4. Training and validation
# Seed PyTorch's global RNG so weight initialisation is repeatable across runs.
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNN(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()


def _attach_targets(graphs, targets):
    """Store each graph's regression target on the graph itself as ``y``.

    Keeping the label on the graph lets the loader batch (and shuffle) graphs
    and labels together, so every prediction is compared with its own target.

    Args:
        graphs: List of graph objects; modified in place.
        targets: 1-D tensor with one target per graph, in the same order.

    Returns:
        The same ``graphs`` list, for convenient inline use.

    Raises:
        ValueError: If the number of graphs and targets differ.
    """
    if len(graphs) != len(targets):
        raise ValueError(
            f'got {len(graphs)} graphs but {len(targets)} targets'
        )
    for graph, target in zip(graphs, targets):
        # Shape [1] so that batching concatenates the labels into shape [B].
        graph.y = target.reshape(1)
    return graphs


train_loader = DataLoader(
    _attach_targets(train_dataset, train_y), batch_size=32, shuffle=True
)
val_loader = DataLoader(
    _attach_targets(val_dataset, val_y), batch_size=32, shuffle=False
)


def train():
    """Run one optimisation pass over ``train_loader``.

    Returns:
        Mean squared error per training graph over the epoch, or NaN when
        the loader yields no graphs.
    """
    model.train()
    total_loss = 0.0
    seen_graphs = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        # Flatten [B, 1] -> [B] so predictions line up one-to-one with data.y.
        # Comparing against the full target vector instead would broadcast
        # the loss over every (prediction, target) pair.
        out = model(data).view(-1)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_graphs
        seen_graphs += data.num_graphs
    return total_loss / seen_graphs if seen_graphs else float('nan')


def validate():
    """Evaluate the model on ``val_loader`` without updating weights.

    Returns:
        Mean squared error per validation graph, or NaN when the loader
        yields no graphs.
    """
    model.eval()
    total_loss = 0.0
    seen_graphs = 0
    with torch.no_grad():
        for data in val_loader:
            data = data.to(device)
            out = model(data).view(-1)
            # Weight each batch by its size so a short final batch does not
            # count as much as a full one.
            total_loss += criterion(out, data.y).item() * data.num_graphs
            seen_graphs += data.num_graphs
    return total_loss / seen_graphs if seen_graphs else float('nan')

# 5. Train the model
NUM_EPOCHS = 200    # total passes over the training set
LOG_INTERVAL = 10   # print the validation loss every this many epochs

for epoch in range(1, NUM_EPOCHS + 1):
    train()
    val_loss = validate()
    is_report_epoch = (epoch % LOG_INTERVAL == 0)
    if is_report_epoch:
        print(f'Epoch: {epoch:03d}, Validation Loss: {val_loss:.4f}')

# 6. Process the test data
test_data = pd.read_csv('/content/drive/MyDrive/데이콘/test.csv')
test_data['Graph'] = test_data['Smiles'].apply(smiles_to_graph)

# smiles_to_graph returns None for SMILES it cannot convert. Those rows must
# not reach the DataLoader, but they still need a row in the submission, so
# remember which rows are valid and predict only for them.
valid_mask = test_data['Graph'].notna().to_numpy()
test_dataset = test_data.loc[valid_mask, 'Graph'].tolist()

test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
model.eval()
test_preds = []

with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        out = model(data)
        # Flatten [B, 1] -> [B] so the result is a plain 1-D column.
        test_preds.append(out.view(-1).cpu().numpy())

# Scatter the predictions back to their original row positions.
predictions = np.full(len(test_data), np.nan, dtype=np.float32)
if test_preds:
    predictions[valid_mask] = np.concatenate(test_preds)

num_invalid = int((~valid_mask).sum())
if num_invalid:
    print(f'Warning: {num_invalid} test SMILES could not be converted to graphs.')
    if test_preds:
        # Fall back to the mean prediction so the submission has no gaps.
        predictions[~valid_mask] = predictions[valid_mask].mean()

# Prepare the submission file
submit = pd.read_csv('/content/drive/MyDrive/데이콘/sample_submission.csv')
if len(submit) != len(predictions):
    raise ValueError(
        f'sample_submission.csv has {len(submit)} rows but '
        f'{len(predictions)} predictions were produced'
    )
submit['pIC50'] = predictions
submit.to_csv('gnn_submit.csv', index=False)


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 010, Validation Loss: 1.2730
Epoch: 020, Validation Loss: 1.2759
Epoch: 030, Validation Loss: 1.2601
Epoch: 040, Validation Loss: 1.2497
Epoch: 050, Validation Loss: 1.2499
Epoch: 060, Validation Loss: 1.2408
Epoch: 070, Validation Loss: 1.2309
Epoch: 080, Validation Loss: 1.2361
Epoch: 090, Validation Loss: 1.2309
Epoch: 100, Validation Loss: 1.2307
Epoch: 110, Validation Loss: 1.2310
Epoch: 120, Validation Loss: 1.2313
Epoch: 130, Validation Loss: 1.2324
Epoch: 140, Validation Loss: 1.2309
Epoch: 150, Validation Loss: 1.2354
Epoch: 160, Validation Loss: 1.2324
Epoch: 170, Validation Loss: 1.2327
Epoch: 180, Validation Loss: 1.2389
Epoch: 190, Validation Loss: 1.2309
Epoch: 200, Validation Loss: 1.2330


