In [None]:
from sklearn.metrics import jaccard_score
import argparse


import os
import pandas as pd
import numpy as np
import pickle
from io import StringIO


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader

from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
import zipfile
import os
from Bio.PDB import PDBParser
from rdkit import Chem
from rdkit.Chem import Descriptors

import warnings
# NOTE(review): this silences *every* warning for the whole notebook,
# including shape-mismatch warnings raised by PyTorch loss functions.
# Consider narrowing the filter once the pipeline is stable.
warnings.filterwarnings("ignore")

# Seed both random number generators used below. NumPy's seed alone does not
# affect PyTorch, which drives weight initialisation and DataLoader shuffling.
np.random.seed(1234)
torch.manual_seed(1234)


In [None]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)


In [None]:
def preprocess_structure(pdb_file):
    """Turn a protein structure into a one-element feature vector.

    Parameters
    ----------
    pdb_file : str, file-like, or parsed structure
        Either something ``PDBParser.get_structure`` can read (a path or an
        open text handle), or an object that is already parsed. Anything that
        exposes ``get_models`` is treated as already parsed and used as is,
        so callers holding a parsed structure no longer trigger a second
        (failing) parse.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1,)`` holding ``len(structure)``.

    Notes
    -----
    NOTE(review): ``Net.fc1`` in this notebook takes ``64*4*4*4`` inputs,
    which implies a volumetric ``(N, 1, D, H, W)`` input. This single-number
    feature does not match that - confirm the intended featurisation.
    """
    if hasattr(pdb_file, "get_models"):
        # Already a parsed structure: do not feed it back into the parser.
        structure = pdb_file
    else:
        structure = PDBParser(QUIET=True).get_structure('protein', pdb_file)

    return np.array([len(structure)])


In [None]:
class ProteinDataset(Dataset):
    """Dataset of protein structures stored as PDB files inside a zip archive.

    With ``labels_available=True`` the samples are driven by ``csv_file``:
    column 0 holds the PDB id (the archive member is ``"{pdb_id}_protein.pdb"``)
    and column 1 holds the numeric label.

    With ``labels_available=False`` there is no CSV, so the samples are the
    archive members whose name ends in ``"_protein.pdb"``, in archive order.

    Parameters
    ----------
    zip_file : path or file-like
        Archive handed to ``zipfile.ZipFile``.
    csv_file : path or file-like, optional
        Label table read with ``pandas.read_csv``; required when
        ``labels_available`` is true.
    labels_available : bool
        Whether samples carry a ``'labels'`` entry.
    transform : callable, optional
        Applied to the feature array returned by ``preprocess_structure``.
    """

    def __init__(self, zip_file, csv_file=None, labels_available=True, transform=None):
        if labels_available:
            if csv_file is None:
                raise ValueError("csv_file is required when labels_available=True")
            self.labels_frame = pd.read_csv(csv_file)
        else:
            self.labels_frame = None
        self.zip_file = zipfile.ZipFile(zip_file, 'r')
        self.transform = transform
        self.labels_available = labels_available
        # Without a label table the archive itself defines the samples. Keep
        # only protein files so __len__ and __getitem__ agree with each other.
        self._member_names = (
            []
            if labels_available
            else [name for name in self.zip_file.namelist()
                  if name.endswith("_protein.pdb")]
        )

    def __len__(self):
        if self.labels_available:
            return len(self.labels_frame)
        return len(self._member_names)

    def __getitem__(self, idx):
        """Return ``{'structure': features}`` plus ``'labels'`` when available."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if self.labels_available:
            pdb_id = self.labels_frame.iloc[idx, 0]
            member = f"{pdb_id}_protein.pdb"
        else:
            member = self._member_names[idx]

        # Close the member handle as soon as its text has been read.
        with self.zip_file.open(member) as f:
            pdb_data = f.read().decode()

        # preprocess_structure does the parsing itself, so hand it a text
        # handle rather than parsing here and passing a parsed object.
        structure = preprocess_structure(StringIO(pdb_data))

        if self.transform:
            structure = self.transform(structure)

        if self.labels_available:
            labels = self.labels_frame.iloc[idx, 1]
            labels = np.array([labels])
            # Shape (1, 1), float64 - unchanged from the original contract.
            labels = labels.astype('float').reshape(-1, 1)
            sample = {'structure': structure, 'labels': labels}
        else:
            sample = {'structure': structure}

        return sample


In [None]:
# Keep the full labelled dataset under its own name so that `train_dataset`
# always refers to the training subset produced by the split below.
full_dataset = ProteinDataset(
    'train.zip', csv_file='train.csv', labels_available=True)

# 80 / 20 train / validation split; the validation set takes the remainder
# so the two sizes always sum to the dataset length.
train_size = int(0.8 * len(full_dataset))
valid_size = len(full_dataset) - train_size

# random_split draws from PyTorch's RNG, which np.random.seed does not touch.
# An explicit generator makes the split identical on every run.
train_dataset, valid_dataset = random_split(
    full_dataset, [train_size, valid_size],
    generator=torch.Generator().manual_seed(1234))


In [None]:
class Net(nn.Module):
    """Two-stage 3-D CNN followed by a two-layer regression head.

    Each stage is ``Conv3d(kernel_size=5) -> ReLU -> max_pool3d(2)``. The
    first linear layer takes ``64 * 4 * 4 * 4`` features, so the spatial
    size left after both stages must be 4x4x4; for example a
    ``(N, 1, 28, 28, 28)`` input (28 -> 24 -> 12 -> 8 -> 4).
    The output has shape ``(N, 1)``.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor: 1 -> 32 -> 64 channels.
        self.conv1 = nn.Conv3d(in_channels=1, out_channels=32, kernel_size=5)
        self.conv2 = nn.Conv3d(in_channels=32, out_channels=64, kernel_size=5)
        # Regression head: flattened feature map -> 1024 -> scalar.
        self.fc1 = nn.Linear(in_features=64 * 4 * 4 * 4, out_features=1024)
        self.fc2 = nn.Linear(in_features=1024, out_features=1)

    def forward(self, x):
        """Map a batch of volumes to one scalar prediction per sample."""
        for conv in (self.conv1, self.conv2):
            x = F.max_pool3d(F.relu(conv(x)), 2)
        flat = x.view(x.size(0), -1)  # keep the batch axis, flatten the rest
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)


In [None]:
# Training hyperparameters, read by the optimizer setup and the training loop.
num_epochs = 10  # number of full passes over the training loader
lr = 1e-3        # learning rate handed to the optimizer


In [None]:
# Build the network on the selected device, with a mean-squared-error
# objective and an Adam optimizer over all of its parameters.
model = Net()
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


In [None]:
# Shuffle only the training data. Validation is evaluated in a fixed order so
# the reported loss (a mean of per-batch means) does not vary with how
# samples happen to fall into batches from one epoch to the next.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)


In [None]:
def _batch_loss(batch):
    """Run the model on one collated batch and return the loss tensor."""
    # Features come out of the dataset as integer arrays; the convolution
    # weights are floating point, so cast before the forward pass.
    structures = batch['structure'].to(device=device, dtype=torch.float32)
    outputs = model(structures)

    # The dataset yields each label as a (1, 1) float64 array, which collates
    # to (B, 1, 1). Match the model output's dtype and shape so MSELoss
    # compares element-wise instead of broadcasting across the batch.
    labels = batch['labels'].to(device=device, dtype=outputs.dtype)
    labels = labels.reshape(outputs.shape)
    return criterion(outputs, labels)


for epoch in range(1, num_epochs+1):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch in train_loader:
        loss = _batch_loss(batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for batch in valid_loader:
            valid_loss += _batch_loss(batch).item()

    # Average per-batch losses; guard against an empty loader (e.g. a
    # zero-sized validation split) instead of dividing by zero.
    mean_train_loss = train_loss / max(len(train_loader), 1)
    mean_valid_loss = valid_loss / max(len(valid_loader), 1)
    print(f"Epoch {epoch}/{num_epochs}, Train Loss: {mean_train_loss}, Valid Loss: {mean_valid_loss}")


In [None]:
# Persist only the model's state dict (not the module object) to disk.
torch.save(obj=model.state_dict(), f='model.pth')


In [None]:
# Unlabelled test set, served in a fixed order so predictions line up with
# the dataset's sample order.
test_dataset = ProteinDataset(zip_file='test.zip', labels_available=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)


In [None]:
# Inference over the test loader: evaluation mode, no gradient tracking.
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        # Features are collated as integers; cast to float32 to match the
        # model's weights before the forward pass.
        structures = batch['structure'].to(device=device, dtype=torch.float32)

        outputs = model(structures)

        # One array per sample, appended in loader order.
        predictions.extend(outputs.cpu().numpy())


In [None]:
# Collect the per-sample predictions into a single-column table and write it
# out without the row index.
# NOTE(review): rows carry no sample identifier; they follow test_loader order.
predictions_df = pd.DataFrame(data=predictions, columns=['prediction'])
predictions_df.to_csv(path_or_buf='predictions.csv', index=False)
