<a href="https://colab.research.google.com/github/kim9296/Dacon/blob/main/cyp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, sys
from google.colab import drive
drive.mount('/content/drive')

my_path = '/content/notebooks'
# Packages are kept in the my_env folder inside "Colab Notebooks" on Drive.
# Re-running this cell must not fail, so the link is only created when the
# path is free (lexists also catches a dangling link), and the path is only
# prepended to sys.path once.
if not os.path.lexists(my_path):
    os.symlink('/content/drive/My Drive/Colab Notebooks/my_env', my_path)
if my_path not in sys.path:
    sys.path.insert(0, my_path)

In [None]:
cd /content/drive/MyDrive/dacon_cyp

In [None]:
# !pip install --target=$my_path utils

In [None]:
import pandas as pd
import numpy as np
import random

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from rdkit import DataStructs
from rdkit.Chem import PandasTools, AllChem


from rdkit import Chem
from rdkit.Chem import Draw

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# print(device)


In [None]:
seed = 42


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch, exports PYTHONHASHSEED, and
    switches cuDNN to deterministic kernels so that repeated GPU runs with the
    same seed give the same numbers.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Explicit CUDA seeding is ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Without these two flags cuDNN may auto-tune and pick non-deterministic
    # algorithms, which defeats the purpose of fixing the seed.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(seed)  # fix the seed

In [None]:
# Load the two tables from the data/ folder. The MLM / HLM columns are later
# read from the train frame only; the test frame is the one to predict.
train_csv = pd.read_csv('data/train.csv')
test_csv = pd.read_csv('data/test.csv')

In [None]:
# Parse the 'SMILES' column of each frame with RDKit and store the resulting
# molecule objects in a new 'Molecule' column (done in place).
PandasTools.AddMoleculeColumnToFrame(train_csv,'SMILES','Molecule')
PandasTools.AddMoleculeColumnToFrame(test_csv,'SMILES','Molecule')

In [None]:
 # @title Phase1 / 2 rules

# Each SMIRKS rule is split at '>>' and only the left-hand side is kept; that
# side is compiled as a SMARTS query so it can be matched against molecules.
glory_rules = pd.read_csv('data/gloryx_reactionrules.csv')
rxn_positions_mols = [Chem.MolFromSmarts(x.split('>>')[0]) for x in glory_rules.SMIRKS]
# Number of compiled rules = length of every feature vector built by mol2vec.
print(len(rxn_positions_mols))

In [None]:
# Largest per-rule match count seen so far; mol2vec raises it as it goes.
vocab_size = 0


def mol2vec(mol):
    """Turn a molecule into a vector of reaction-rule match counts.

    Element i is the number of substructure matches of
    ``rxn_positions_mols[i]`` in ``mol``. As a side effect the module-level
    ``vocab_size`` is raised to the largest count seen so far.

    Args:
        mol: RDKit molecule exposing ``GetSubstructMatches``.

    Returns:
        1-D ``np.ndarray`` with one count per reaction rule.
    """
    global vocab_size
    fp1 = [len(mol.GetSubstructMatches(x)) for x in rxn_positions_mols]
    # default=0 keeps an empty rule list from raising in max().
    vocab_size = max(vocab_size, max(fp1, default=0))
    # A hashed Morgan fingerprint used to be computed here as well, but it was
    # never part of the returned vector, so that per-molecule work was dropped.
    return np.array(fp1)

In [None]:
# Add an "FPs" column holding the mol2vec vector of every molecule. Running
# this on both frames also lets mol2vec update vocab_size from both of them.
train_csv["FPs"] = train_csv.Molecule.apply(mol2vec)
test_csv["FPs"] = test_csv.Molecule.apply(mol2vec)

In [None]:
# Keep only the columns that are used downstream: the feature vectors plus
# the two targets for training, the feature vectors alone for the test set.
train_data = train_csv[['FPs','MLM', 'HLM']]
test_data = test_csv[['FPs']]

In [None]:
vocab_size

In [None]:
# def mol2fp(mol):
#     fp = AllChem.GetHashedMorganFingerprint(mol, 6, nBits=4096)
#     ar = np.zeros((1,), dtype=np.int8)
#     DataStructs.ConvertToNumpyArray(fp, ar)
#     return ar

In [None]:
# # FPs column 추가
# train["FPs"] = train.Molecule.apply(mol2fp)
# test["FPs"] = test.Molecule.apply(mol2fp)

In [None]:
# # 사용할 column만 추출
# train = train_csv[['FPs','MLM', 'HLM']]
# test = test_csv[['FPs']]

In [None]:
# @title Hyperparameter

# Hyperparameters shared by the dataset, model and training cells.
CFG = dict(
    BATCH_SIZE=256,
    EPOCHS=1000,
    HIDDEN_SIZE=1024,
    OUTPUT_SIZE=1,
    DROPOUT_RATE=0.8,
    LEARNING_RATE=0.001,
    # Targets and outputs are only rescaled when SIGMOID and BCE are both True.
    SIGMOID=True,
    BCE=False,
)

In [None]:
# @title GraphGCN DataEmbedding

# Element symbols recognised by atom_feature. The order is significant:
# char_to_ix encodes a symbol as its position in this list plus one, and
# anything not listed becomes 0.
LIST_SYMBOLS = ['C', 'N', 'O', 'S', 'F', 'H', 'Si', 'P', 'Cl', 'Br',
            'Li', 'Na', 'K', 'Mg', 'Ca', 'Fe', 'As', 'Al', 'I', 'B',
            'V', 'Tl', 'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn',
            'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'Mn', 'Cr', 'Pt', 'Hg', 'Pb']


def atom_feature(atom):
    """Encode one RDKit atom as five categorical indices.

    The indices are, in order: element symbol, degree, total hydrogen count,
    implicit valence and aromaticity. Each one comes from ``char_to_ix``, so 0
    means "not in the allowed set".

    Returns:
        ``np.ndarray`` of length 5.
    """
    encoded_parts = (
        char_to_ix(atom.GetSymbol(), LIST_SYMBOLS),
        char_to_ix(atom.GetDegree(), [0, 1, 2, 3, 4, 5]),
        char_to_ix(atom.GetTotalNumHs(), [0, 1, 2, 3, 4]),
        char_to_ix(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5]),
        char_to_ix(int(atom.GetIsAromatic()), [0, 1]),
    )  # allowed-set sizes: (40, 6, 5, 6, 2)
    return np.array([index for part in encoded_parts for index in part])


def char_to_ix(x, allowable_set):
    """Return ``[position + 1]`` of ``x`` in ``allowable_set``, or ``[0]``.

    Index 0 is reserved as the "unknown" token, which is why known values are
    shifted by one. The result is a one-element list so that several encodings
    can be concatenated with ``+``.
    """
    position = allowable_set.index(x) + 1 if x in allowable_set else 0
    return [position]


def mol2graph(smi, MAX_LEN):
    """Build padded node-feature and adjacency matrices for a SMILES string.

    Args:
        smi: SMILES string parsed with RDKit.
        MAX_LEN: number of atom slots; molecules with more atoms are truncated.

    Returns:
        ``(X, A)`` where ``X`` is ``(MAX_LEN, 5)`` uint8 atom features and ``A``
        is ``(MAX_LEN, MAX_LEN)`` uint8 adjacency with self-loops on the rows
        that hold real atoms. Unused rows stay zero.
    """
    mol = Chem.MolFromSmiles(smi)

    node_features = np.zeros((MAX_LEN, 5), dtype=np.uint8)
    adjacency = np.zeros((MAX_LEN, MAX_LEN), dtype=np.uint8)

    bonds = Chem.rdmolops.GetAdjacencyMatrix(mol).astype(np.uint8, copy=False)
    bonds = bonds[:MAX_LEN, :MAX_LEN]
    num_atom = bonds.shape[0]
    # Adding the identity gives every kept atom a self-connection.
    adjacency[:num_atom, :num_atom] = bonds + np.eye(num_atom, dtype=np.uint8)

    last_row = num_atom - 1
    for row, atom in enumerate(mol.GetAtoms()):
        node_features[row, :] = atom_feature(atom)
        if row >= last_row:  # stop once the kept atoms are filled
            break

    return node_features, adjacency

In [None]:
# @title Custom Dataset

class CustomDataset(Dataset):
    """Dataset over a frame with an 'FPs' column and, for training, a target.

    Args:
        df: frame whose 'FPs' column holds equally sized 1-D arrays.
        target: name of the label column ('MLM' or 'HLM'); ignored for test data.
        transform: accepted for interface compatibility only; feature selection
            is currently disabled, so it is not used.
        is_test: True when the frame has no labels.
        is_sigmoid, is_bce: when both are True, labels are rescaled with
            ``(label / 100 + 0.5) / 2``.

    Items are ``(features, label)`` tensors on ``device`` (``features`` only
    for test data); the label tensor has shape ``(1,)``.
    """

    def __init__(self, df, target, transform, is_test=False, is_sigmoid = False, is_bce = False):
        self.df = df
        self.target = target  # HLM or MLM
        self.is_test = is_test  # train/valid vs. test
        self.is_sigmoid = is_sigmoid
        self.is_bce = is_bce

        self.fp = np.stack(df['FPs'])
        # Labels are materialised positionally, like the features above.
        # Looking them up through the frame's index labels breaks as soon as
        # the frame does not carry a default 0..n-1 index.
        self.labels = None if is_test else df[target].to_numpy()

    def __getitem__(self, index):
        features = torch.tensor(self.fp[index]).float().to(device)
        if self.is_test:  # no label available
            return features

        label = self.labels[index]
        if self.is_sigmoid and self.is_bce:
            label = (label / 100 + 0.5) / 2
        return features, torch.tensor(label).float().unsqueeze(dim=-1).to(device)  # feature, label

    def __len__(self):
        return len(self.df)

class gcnDataset(Dataset):
    """Graph dataset built from the "smiles" and "exp" columns of a frame.

    Every SMILES string is converted once, up front, with ``mol2graph``; items
    are ``(X, A, exp)`` triples.
    """

    def __init__(self, df, max_len=120):
        self.smiles = df["smiles"]
        self.exp = df["exp"].values

        graphs = [mol2graph(smi, max_len) for smi in self.smiles]
        self.X = np.array([features for features, _ in graphs], dtype=np.uint8)
        self.A = np.array([adjacency for _, adjacency in graphs], dtype=np.uint8)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.A[index], self.exp[index]

In [None]:
transform = VarianceThreshold(threshold=0)
# transform = False

# The MLM and HLM datasets wrap the same frame with the same flags; only the
# target column differs.
_dataset_options = dict(df=train_data, transform=transform, is_test=False,
                        is_sigmoid=CFG['SIGMOID'], is_bce=CFG['BCE'])
train_MLM = CustomDataset(target='MLM', **_dataset_options)
train_HLM = CustomDataset(target='HLM', **_dataset_options)

# Length of one stacked feature vector.
input_size = train_MLM.fp.shape[1]
input_size

In [None]:
# Keep the feature width in CFG next to the other hyperparameters.
CFG['INPUT_SIZE'] = input_size

In [None]:
# Train / validation / test split: 20 % is held out first, then the hold-out
# is cut in half, giving 80 / 10 / 10. MLM and HLM use the same random_state.
train_MLM_dataset, valid_MLM        = train_test_split(train_MLM, test_size=0.2, random_state=seed)
valid_MLM_dataset, test_MLM_dataset = train_test_split(valid_MLM, test_size=0.5, random_state=seed)
train_HLM_dataset, valid_HLM        = train_test_split(train_HLM, test_size=0.2, random_state=seed)
valid_HLM_dataset, test_HLM_dataset = train_test_split(valid_HLM, test_size=0.5, random_state=seed)

In [None]:
def _make_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader using the configured batch size."""
    return DataLoader(dataset=dataset,
                      batch_size=CFG['BATCH_SIZE'],
                      shuffle=shuffle)


# Only the training loaders shuffle; validation and test keep a fixed order.
train_MLM_loader = _make_loader(train_MLM_dataset, shuffle=True)
valid_MLM_loader = _make_loader(valid_MLM_dataset, shuffle=False)
test_MLM_loader = _make_loader(test_MLM_dataset, shuffle=False)

train_HLM_loader = _make_loader(train_HLM_dataset, shuffle=True)
valid_HLM_loader = _make_loader(valid_HLM_dataset, shuffle=False)
test_HLM_loader = _make_loader(test_HLM_dataset, shuffle=False)

In [None]:
# @title Model

class Net(nn.Module):
    """Three-layer MLP regressor: (Linear -> LayerNorm -> LeakyReLU -> Dropout) x 3 -> Linear.

    Args:
        input_size: width of the input feature vector.
        hidden_size: width of every hidden layer.
        dropout_rate: dropout probability used after each hidden layer.
        out_size: width of the output layer.
        is_sigmoid, is_bce: when both are True the output goes through a sigmoid.
    """

    def __init__(self, input_size, hidden_size, dropout_rate, out_size, is_sigmoid, is_bce):
        super().__init__()

        self.is_sigmoid = is_sigmoid
        self.is_bce = is_bce

        # Three fully connected layers plus the output layer
        self.fc1 = nn.Linear(input_size, hidden_size, device=device)
        self.fc2 = nn.Linear(hidden_size, hidden_size, device=device)
        self.fc3 = nn.Linear(hidden_size, hidden_size, device=device)
        self.fc_out = nn.Linear(hidden_size, out_size, device=device)

        # Normalisation
        self.ln1 = nn.LayerNorm(hidden_size, device=device)
        self.ln2 = nn.LayerNorm(hidden_size, device=device)
        self.ln3 = nn.LayerNorm(hidden_size, device=device)

        # Activation functions
        self.activation = nn.LeakyReLU()
        self.sigmoid = nn.Sigmoid()

        # Dropout
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        hidden = x
        stages = ((self.fc1, self.ln1), (self.fc2, self.ln2), (self.fc3, self.ln3))
        for linear, norm in stages:
            hidden = self.dropout(self.activation(norm(linear(hidden))))

        prediction = self.fc_out(hidden)
        if self.is_sigmoid and self.is_bce:
            return self.sigmoid(prediction)
        return prediction


In [None]:
class ResBlock(nn.Module):
    """Two-convolution residual block with selectable layer ordering.

    Args:
        in_filter: number of input channels.
        out_filter: number of output channels.
        stride: stride of the first convolution (and of the shortcut projection).
        use_bn: whether batch normalisation is applied.
        dp_rate: ``Dropout2d`` probability applied to the block output.
        block_type: 'a' original, 'b' BN after addition, 'c' ReLU before
            addition, 'd' ReLU-only pre-activation, 'e' full pre-activation.

    Raises:
        ValueError: from ``forward`` when ``block_type`` is not one of 'a'-'e'.
    """

    def __init__(self, in_filter, out_filter, stride, use_bn, dp_rate, block_type):
        super(ResBlock, self).__init__()
        self.use_bn = use_bn
        self.block_type = block_type
        self.conv1 = nn.Conv2d(in_filter, out_filter, kernel_size=3, stride=stride, padding=1, bias=False)
        self.conv2 = nn.Conv2d(out_filter, out_filter, kernel_size=3, stride=1, padding=1, bias=False)
        self.relu = nn.ReLU()
        # In the full pre-activation layout ('e') bn1 normalises the block
        # input, so it needs in_filter channels; every other layout applies it
        # after conv1 and needs out_filter channels.
        self.bn1 = nn.BatchNorm2d(in_filter if block_type == 'e' else out_filter)
        self.bn2 = nn.BatchNorm2d(out_filter)
        self.dropout = nn.Dropout2d(dp_rate)
        self.shortcut = nn.Sequential()
        # A projection is needed whenever the main path changes the tensor
        # shape: different channel count OR spatial down-sampling (stride != 1).
        if in_filter != out_filter or stride != 1:
            self.shortcut.add_module(
                'conv', nn.Conv2d(in_filter, out_filter,
                                  kernel_size=1, stride=stride,
                                  padding=0, bias=False)
            )

    def forward(self, _x):
        if self.block_type == 'a':  # original residual block
            x = self.relu(self.bn1(self.conv1(_x))) if self.use_bn else self.relu(self.conv1(_x))
            x = self.bn2(self.conv2(x)) if self.use_bn else self.conv2(x)
            x = x + self.shortcut(_x)
            return self.dropout(self.relu(x))

        elif self.block_type == 'b':  # BN after addition
            x = self.relu(self.bn1(self.conv1(_x))) if self.use_bn else self.relu(self.conv1(_x))
            x = self.conv2(x) + self.shortcut(_x)
            return self.dropout(self.relu(self.bn2(x)) if self.use_bn else self.relu(x))

        elif self.block_type == 'c':  # ReLU before addition
            x = self.relu(self.bn1(self.conv1(_x))) if self.use_bn else self.relu(self.conv1(_x))
            x = self.relu(self.bn2(self.conv2(x))) if self.use_bn else self.relu(self.conv2(x))
            return self.dropout(x + self.shortcut(_x))

        elif self.block_type == 'd':  # ReLU-only pre-activation
            x = self.bn1(self.conv1(self.relu(_x))) if self.use_bn else self.conv1(self.relu(_x))
            x = self.bn2(self.conv2(self.relu(x))) if self.use_bn else self.conv2(self.relu(x))
            return self.dropout(x + self.shortcut(_x))

        elif self.block_type == 'e':  # full pre-activation
            x = self.conv1(self.relu(self.bn1(_x))) if self.use_bn else self.conv1(self.relu(_x))
            x = self.conv2(self.relu(self.bn2(x))) if self.use_bn else self.conv2(self.relu(x))
            return self.dropout(x + self.shortcut(_x))

        # Falling through used to return None and fail later in an unrelated place.
        raise ValueError(f"Unknown block_type: {self.block_type!r}")


class CNNNet(nn.Module):
    """Residual CNN over one-hot embedded integer feature vectors.

    The integer input ``(batch, max_len)`` is embedded one-hot to
    ``(batch, max_len, vocab_size)``, treated as a one-channel image, passed
    through ``n_stage`` stages of ``n_layer`` residual blocks and finally
    through a three-layer MLP that outputs one value per sample.

    ``args`` must provide: vocab_size, emb_train, n_stage, n_layer,
    start_channel, stride, use_bn, dp_rate, block_type and max_len.
    """

    def __init__(self, args):
        super().__init__()

        # Create Atom Element embedding layer
        self.embedding = self.create_emb_layer(args.vocab_size, args.emb_train)

        # Create Residual Convolution layer
        list_res_blocks = []
        n_channel = 1
        for i in range(args.n_stage):
            # The first stage widens 1 -> start_channel, later stages double.
            growth = args.start_channel if i == 0 else 2
            list_res_blocks.append(ResBlock(n_channel, n_channel*growth, args.stride, args.use_bn, args.dp_rate, args.block_type))
            n_channel *= growth
            for _ in range(args.n_layer-1):
                list_res_blocks.append(ResBlock(n_channel, n_channel, 1, args.use_bn, args.dp_rate, args.block_type))
        self.res_blocks = nn.Sequential(*list_res_blocks)

        # Create MLP layers
        fc_shape = self._estimate_fc_shape((1, args.max_len, ))
        self.fc1 = nn.Linear(fc_shape[-1], 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 1)
        self.relu = nn.ReLU()

    def create_emb_layer(self, vocab_size, emb_train):
        """Return an embedding initialised to the identity (one-hot vectors)."""
        emb_layer = nn.Embedding(vocab_size, vocab_size)
        emb_layer.load_state_dict({'weight': torch.eye(vocab_size)})

        if not emb_train:
            emb_layer.weight.requires_grad = False
        return emb_layer

    def _estimate_fc_shape(self, input_shape):
        """Probe the convolution stack to find the flattened feature size."""
        dummy_input = torch.zeros(input_shape).long()
        # Probe in eval mode without gradients so the dummy sample neither
        # updates the BatchNorm running statistics nor trips BatchNorm's
        # batch-size check; the previous mode is restored afterwards.
        was_training = self.training
        self.eval()
        with torch.no_grad():
            dummy_output = self._conv_forward(dummy_input)
        self.train(was_training)
        return dummy_output.view(dummy_output.shape[0], -1).shape

    def _conv_forward(self, x):
        embeds = self.embedding(x)
        embeds = embeds.view(embeds.shape[0], 1, embeds.shape[1], embeds.shape[2])
        return self.res_blocks(embeds)

    def forward(self, x):
        x = self._conv_forward(x)
        x = x.view(x.shape[0], -1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove the batch dimension when a batch holds one sample.
        return x.squeeze(-1)

In [None]:

class BN1d(nn.Module):
    """Optional BatchNorm1d applied over the last dimension of any tensor.

    The input is flattened to ``(-1, out_dim)``, normalised, and restored to
    its original shape. With ``use_bn`` False the input is returned untouched.
    """

    def __init__(self, out_dim, use_bn):
        super().__init__()
        self.use_bn = use_bn
        self.bn = nn.BatchNorm1d(out_dim)

    def forward(self, x):
        if self.use_bn:
            shape = x.shape
            flat = x.view(-1, shape[-1])
            return self.bn(flat).view(shape)
        return x


class GConv(nn.Module):
    """One graph-convolution layer: linear projection, neighbour sum, BN, ReLU.

    ``forward`` returns the new node features together with the unchanged
    adjacency matrix so that layers can be chained.
    """

    def __init__(self, input_dim, output_dim, use_bn):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)
        self.bn = BN1d(output_dim, use_bn)
        self.relu = nn.ReLU()

    def forward(self, X, A):
        projected = self.fc(X)
        # Multiplying by A sums the projected features of connected nodes.
        aggregated = torch.matmul(A, projected)
        return self.relu(self.bn(aggregated)), A


class Readout(nn.Module):
    """Project node features linearly and average them over dimension 1."""

    def __init__(self, out_dim, molvec_dim):
        super().__init__()
        self.readout_fc = nn.Linear(out_dim, molvec_dim)
        nn.init.xavier_normal_(self.readout_fc.weight.data)

    def forward(self, output_H):
        return self.readout_fc(output_H).mean(dim=1)


class GCNNet(nn.Module):
    """Graph convolutional regressor over categorical atom features.

    Each atom-feature column is one-hot embedded, the embeddings are
    concatenated, passed through ``n_layer`` ``GConv`` layers, pooled by
    ``Readout`` and fed to a three-layer MLP with one output per graph.

    ``args`` must provide: vocab_size, degree_size, numH_size, valence_size,
    isarom_size, emb_train, n_layer, in_dim, out_dim, use_bn and molvec_dim.
    """

    def __init__(self, args):
        super().__init__()

        # Create Atom Element embedding layer
        self.embedding = self.create_emb_layer([args.vocab_size, args.degree_size,
                                                args.numH_size, args.valence_size,
                                                args.isarom_size],  args.emb_train)

        self.gcn_layers = nn.ModuleList()
        for i in range(args.n_layer):
            self.gcn_layers.append(GConv(args.in_dim if i==0 else args.out_dim, args.out_dim, args.use_bn))

        self.readout = Readout(args.out_dim, args.molvec_dim)

        self.fc1 = nn.Linear(args.molvec_dim, args.molvec_dim//2)
        self.fc2 = nn.Linear(args.molvec_dim//2, args.molvec_dim//2)
        self.fc3 = nn.Linear(args.molvec_dim//2, 1)
        self.relu = nn.ReLU()

    def create_emb_layer(self, list_vocab_size, emb_train=False):
        """Build one identity-initialised (one-hot) embedding per feature column."""
        list_emb_layer = nn.ModuleList()
        for vocab_size in list_vocab_size:
            vocab_size += 1  # one extra row for the "unknown" index 0
            emb_layer = nn.Embedding(vocab_size, vocab_size)
            emb_layer.load_state_dict({'weight': torch.eye(vocab_size)})
            emb_layer.weight.requires_grad = emb_train
            list_emb_layer.append(emb_layer)
        return list_emb_layer

    def _embed(self, x):
        # Column i of the last dimension is embedded by embedding layer i.
        list_embed = [layer(x[:, :, i]) for i, layer in enumerate(self.embedding)]
        return torch.cat(list_embed, 2)

    def forward(self, x, A):
        A = A.float()
        x = self._embed(x)

        for module in self.gcn_layers:
            x, A = module(x, A)
        x = self.readout(x)

        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove the batch dimension when a batch holds one sample.
        return x.squeeze(-1)


In [None]:
# model_MLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'],CFG['SIGMOID'],CFG['BCE']).to(device)
# model_HLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'],CFG['SIGMOID'],CFG['BCE']).to(device)

In [None]:
# if CFG['SIGMOID']:
#   criterion = nn.BCELoss()
#   print('sigmoid')
# else:
#   criterion = nn.MSELoss()
#   print('mse')
# criterion = nn.MSELoss()
# optimizer_MLM = torch.optim.Adam(model_MLM.parameters(), lr=CFG['LEARNING_RATE'])
# optimizer_HLM = torch.optim.Adam(model_HLM.parameters(), lr=CFG['LEARNING_RATE'])

In [None]:
# @title Training

# def train(train_loader, valid_loader, model, criterion, optimizer, epochs):
#     model.train()

#     for epoch in range(epochs):
#         running_loss = 0
#         for inputs, targets in train_loader:
#             optimizer.zero_grad()
#             output = model(inputs)
#             loss = criterion(output, targets)
#             loss.backward()
#             optimizer.step()

#             running_loss += loss.item()

#         if epoch % 100 == 0:
#             valid_loss = 0
#             with torch.no_grad():
#                 for inputs, targets in valid_loader:
#                     output = model(inputs)
#                     loss = criterion(output, targets)
#                     valid_loss += loss.item()

#             print(f'Epoch: {epoch}/{epochs}, Train Loss: {running_loss/len(train_loader)}, Valid Loss: {valid_loss/len(valid_HLM_loader)}')

#             model.train()

#     return model


In [None]:
def train(model, dataloader, optimizer, criterion, args, **kwargs):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: network called as ``model(X)`` with integer inputs.
        dataloader: iterable of ``(X, y)`` batches.
        optimizer: optimizer stepping the model parameters.
        criterion: loss called as ``criterion(pred_y, y)``.
        args: namespace providing ``device``.
        **kwargs: accepted for call compatibility (e.g. ``epoch``); unused.

    Returns:
        ``(model, mean_loss)`` where ``mean_loss`` is the average batch loss.
    """
    model.train()
    epoch_train_loss = 0
    cnt_iter = 0
    for batch in dataloader:
        X, y = batch[0].long(), batch[1].float()
        X, y = X.to(args.device), y.to(args.device)

        optimizer.zero_grad()

        pred_y = model(X)
        # The models here return (B,) while the datasets yield (B, 1) targets.
        # Left as is, the loss silently broadcasts the pair to (B, B) and
        # compares every prediction with every target.
        if pred_y.shape != y.shape and pred_y.numel() == y.numel():
            y = y.reshape(pred_y.shape)
        train_loss = criterion(pred_y, y)
        epoch_train_loss += train_loss.item()
        train_loss.backward()
        optimizer.step()

        cnt_iter += 1
    return model, epoch_train_loss/cnt_iter


def validate(model, dataloader, criterion, args):
    """Return the mean batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and no gradients are recorded.

    Args:
        model: network called as ``model(X)`` with integer inputs.
        dataloader: iterable of ``(X, y)`` batches.
        criterion: loss called as ``criterion(pred_y, y)``.
        args: namespace providing ``device``.
    """
    model.eval()
    epoch_val_loss = 0
    cnt_iter = 0
    with torch.no_grad():  # evaluation only: do not build autograd graphs
        for batch in dataloader:
            X, y = batch[0].long(), batch[1].float()
            X, y = X.to(args.device), y.to(args.device)

            pred_y = model(X)
            # Align (B, 1) targets with (B,) predictions; otherwise the loss
            # broadcasts the pair to a (B, B) comparison.
            if pred_y.shape != y.shape and pred_y.numel() == y.numel():
                y = y.reshape(pred_y.shape)
            val_loss = criterion(pred_y, y)
            epoch_val_loss += val_loss.item()
            cnt_iter += 1

    return epoch_val_loss/cnt_iter

def test(model, dataloader, args, **kwargs):
    """Evaluate ``model`` on ``dataloader`` and report MAE and error spread.

    Args:
        model: network called as ``model(X)`` with integer inputs.
        dataloader: iterable of ``(X, y)`` batches.
        args: namespace providing ``device``.
        **kwargs: accepted for call compatibility (e.g. ``epoch``); unused.

    Returns:
        ``(mae, std, list_y, list_pred_y)``: mean absolute error, standard
        deviation of ``y - pred_y``, and the collected targets / predictions.
    """
    model.eval()
    list_y, list_pred_y = [], []
    with torch.no_grad():  # evaluation only
        for batch in dataloader:
            X, y = batch[0].long(), batch[1].float()
            X, y = X.to(args.device), y.to(args.device)

            pred_y = model(X)
            # atleast_1d keeps a 0-d result (single-sample batch) appendable.
            list_y += np.atleast_1d(y.cpu().detach().numpy()).tolist()
            list_pred_y += np.atleast_1d(pred_y.cpu().detach().numpy()).tolist()

    # Targets arrive as (N, 1) and predictions as (N,). Flatten both before
    # subtracting, otherwise the difference broadcasts to (N, N) and the
    # standard deviation is computed over all target/prediction pairs.
    flat_y = np.asarray(list_y, dtype=float).reshape(-1)
    flat_pred_y = np.asarray(list_pred_y, dtype=float).reshape(-1)

    mae = mean_absolute_error(flat_y, flat_pred_y)
    std = np.std(flat_y - flat_pred_y)
    return mae, std, list_y, list_pred_y


def experiment(partition, args):
    """Train one model configuration and record its learning curves.

    Args:
        partition: dict with 'train', 'val' and 'test' data loaders.
        args: namespace with the hyperparameters (model, optim, lr, l2_coef,
            epoch, device, plus whatever the chosen model reads). It is
            updated in place with the results.

    Returns:
        ``(model, args)``; ``args`` gains best_epoch / best_mae / best_std /
        best_true_y / best_pred_y, the per-epoch train_losses / val_losses /
        maes / stds lists, and ``elapsed`` seconds.

    Raises:
        ValueError: if ``args.optim`` is not 'ADAM', 'RMSProp' or 'SGD'.
    """
    ts = time.time()
    # args.input_shape = (args.max_len, args.vocab_size)

    if args.model == 'CNN':
        print('CNN')
        model = CNNNet(args)
    else:
        # Net takes its sizes positionally, not an args namespace; build it
        # from CFG like the commented-out model cell does.
        # NOTE(review): train/validate/test cast inputs to long, which the
        # Linear layers of Net will presumably reject — confirm before
        # relying on this branch.
        model = Net(CFG['INPUT_SIZE'], CFG['HIDDEN_SIZE'], CFG['DROPOUT_RATE'],
                    CFG['OUTPUT_SIZE'], CFG['SIGMOID'], CFG['BCE'])

    model.to(args.device)
    criterion = nn.MSELoss()

    # Initialize Optimizer (frozen parameters, e.g. a fixed embedding, are skipped)
    optimizer_types = {'ADAM': optim.Adam, 'RMSProp': optim.RMSprop, 'SGD': optim.SGD}
    if args.optim not in optimizer_types:
        # A real exception instead of `assert False`, which is stripped under -O.
        raise ValueError(f"Undefined Optimizer Type: {args.optim!r}")
    trainable_parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optimizer_types[args.optim](trainable_parameters, lr=args.lr, weight_decay=args.l2_coef)

    # Train, Validate, Evaluate
    list_train_loss = []
    list_val_loss = []
    list_mae = []
    list_std = []

    args.best_mae = 10000
    for epoch in range(args.epoch):
        model, train_loss = train(model, partition['train'], optimizer, criterion, args, epoch=epoch)
        val_loss = validate(model, partition['val'], criterion, args)
        mae, std, true_y, pred_y = test(model, partition['test'], args, epoch=epoch)

        list_train_loss.append({'epoch':epoch, 'train_loss':train_loss})
        list_val_loss.append({'epoch':epoch, 'val_loss':val_loss})
        list_mae.append({'epoch':epoch, 'mae':mae})
        list_std.append({'epoch':epoch, 'std':std})

        if epoch % 10 == 0:
            print('Epoch: {:2}/{:2}, Train Loss: {:2.3f}, Valid Loss: {:2.3f}'.format(epoch, args.epoch, train_loss, val_loss))

        # NOTE(review): the best epoch is chosen by MAE on partition['test'],
        # i.e. the test split takes part in model selection.
        if args.best_mae > mae or epoch==0:
            args.best_epoch = epoch
            args.best_mae = mae
            args.best_std = std
            args.best_true_y = true_y
            args.best_pred_y = pred_y

    # End of experiments
    te = time.time()
    args.elapsed = te-ts
    args.train_losses = list_train_loss
    args.val_losses = list_val_loss
    args.maes = list_mae
    args.stds = list_std

    return model, args

In [None]:
CFG

In [None]:
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 requests synchronous kernel launches
# so that CUDA errors surface at the call that caused them.
# NOTE(review): presumably this only takes effect if set before CUDA is
# initialised in this process — confirm.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Hand cached, currently unused GPU memory back to the driver.
torch.cuda.empty_cache()

In [None]:
# @title Experiments

import argparse
import time
from sklearn.metrics import mean_absolute_error
from utils import *

# An empty argparse namespace is used as a plain attribute bag for the
# hyperparameters read by CNNNet / experiment().
parser = argparse.ArgumentParser()
args = parser.parse_args("")

# Embedding rows: match counts run from 0 to vocab_size, plus one spare row.
args.vocab_size = vocab_size + 2
args.max_len = input_size

args.n_layer = 1
args.n_stage = 3

args.lr = CFG['LEARNING_RATE']
args.l2_coef = 0.0001
args.optim = 'ADAM'
args.epoch = 100
args.test_batch_size= CFG['BATCH_SIZE']
args.emb_train = False
args.start_channel = 256
args.stride = 2
args.use_bn = True
args.dp_rate = 0.3
args.block_type = 'a'
args.shuffle = True
args.device = device
args.model = 'CNN'
print (device)

args.batch_size = CFG['BATCH_SIZE']
args.exp_name = 'exp1_lr_stage'


# writer = Writer(prior_keyword=['n_layer', 'n_stage','block_type', 'use_bn', 'lr', 'dp_rate', 'emb_train', 'epoch', 'batch_size'])
#writer.clear()

# Define Hyperparameter Search Space
list_lr = [0.025, 0.005, 0.001]
list_n_stage = [1,2,3,4,5]


# train_dataloader = DataLoader(cnnDataset(datasets[0], vocab, args.max_len), batch_size=args.batch_size, shuffle=True)
# val_dataloader = DataLoader(cnnDataset(datasets[1], vocab, args.max_len), batch_size=args.batch_size, shuffle=False)
# test_dataloader = DataLoader(cnnDataset(datasets[2], vocab, args.max_len), batch_size=args.batch_size, shuffle=False)
partition = {'train': train_MLM_loader, 'val': valid_MLM_loader, 'test' : test_MLM_loader}
check_list = {}  # best MAE per "<n_stage>_<lr>" configuration
cnt_exp = 0
for lr in list_lr:
    for n_stage in list_n_stage:
        args.lr = lr
        args.n_stage = n_stage

        model, result = experiment(partition, args)
        # writer.write(result)

        cnt_exp += 1
        state_n = f'{n_stage}_{lr}'
        # The key was previously read back as `stage_n`, a name that is never
        # assigned, so the first finished experiment ended in a NameError.
        check_list[state_n] = result.best_mae
        print('[Exp {:2}] got mae: {:2.3f}, std: {:2.3f} at epoch {:2}'.format(cnt_exp, result.best_mae, result.best_std, result.best_epoch))

cpu
CNN


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch:  0/100, Train Loss: 3621564.403, Valid Loss: 2676.864


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu

Epoch: 10/100, Train Loss: 1382.875, Valid Loss: 1371.530


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu

KeyboardInterrupt: ignored

In [None]:
# NOTE(review): these calls pass (train_loader, valid_loader, model, criterion,
# optimizer, epochs=...), which is the parameter order of the commented-out
# train() earlier in the notebook. The active train(model, dataloader,
# optimizer, criterion, args) takes its arguments in a different order and
# returns a (model, loss) tuple, and model_MLM / criterion / optimizer_MLM /
# optimizer_HLM are only assigned in commented-out cells — confirm which
# training path is intended before running this cell.
print("Training Start: MLM")
model_MLM = train(train_MLM_loader, valid_MLM_loader, model_MLM, criterion, optimizer_MLM, epochs=CFG['EPOCHS'])

print("Training Start: HLM")
model_HLM = train(train_HLM_loader, valid_HLM_loader, model_HLM, criterion, optimizer_HLM, epochs=CFG['EPOCHS'])

In [None]:
# @title  Inference

# Wrap the unlabeled competition frame. It is named `test_data`; the bare name
# `test` refers to the evaluation function defined above, which cannot be
# indexed like a frame.
test_MLM = CustomDataset(df=test_data, target=None, transform=transform, is_test=True)
test_HLM = CustomDataset(df=test_data, target=None, transform=transform, is_test=True)

# NOTE: from here on test_MLM_loader / test_HLM_loader refer to the unlabeled
# competition data, replacing the held-out split loaders of the same name.
test_MLM_loader = DataLoader(dataset=test_MLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

test_HLM_loader = DataLoader(dataset=test_HLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

In [None]:
def inference(test_loader, model, is_sigmoid):
    """Predict every batch of ``test_loader`` and return a flat list of floats.

    Args:
        test_loader: iterable of input batches (features only).
        model: network called as ``model(inputs)``.
        is_sigmoid: when True, outputs are mapped back with
            ``(output * 2 - 0.5) * 100``, the inverse of the label scaling
            ``(label / 100 + 0.5) / 2``.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in test_loader:
            scores = model(batch)
            if is_sigmoid:
                scores = (scores * 2 - 0.5 ) * 100
            collected += scores.cpu().numpy().flatten().tolist()

    return collected

In [None]:
# CustomDataset rescales the targets only when SIGMOID and BCE are both set,
# so the inverse rescaling inside inference() must be requested under that
# same condition. Passing SIGMOID alone would distort predictions of a model
# trained on raw targets.
_undo_target_scaling = CFG['SIGMOID'] and CFG['BCE']
predictions_MLM = inference(test_MLM_loader, model_MLM, is_sigmoid = _undo_target_scaling)
predictions_HLM = inference(test_HLM_loader, model_HLM, is_sigmoid = _undo_target_scaling)

In [None]:
# @title Submission

# Start from the provided sample file so the submission keeps its layout.
submission = pd.read_csv('data/sample_submission.csv')
submission

In [None]:
# Fill in the two target columns with the model predictions.
# Assumes the prediction lists are in the same row order as the sample file — TODO confirm.
submission['MLM'] = predictions_MLM
submission['HLM'] = predictions_HLM
submission

In [None]:
# to_csv does not create missing folders, so make sure result/ exists first.
os.makedirs('result', exist_ok=True)
submission.to_csv('result/submission.csv', index=False)

In [None]:
# Example molecule used by the exploratory cells below.
mol = Chem.MolFromSmiles('CCOc1ccc(CNC(=O)c2cc(-c3sc(C)nc3C)n[nH]2)cc1OCC')

In [None]:
# Render the example molecule as a one-cell grid image.
Draw.MolsToGridImage([mol])

In [None]:
# Keep the left-hand side of every SMIRKS rule as SMARTS *text*. The next
# cell compiles these strings with MolFromSmarts; compiling them here as well
# would hand it already-built molecule objects, which it cannot parse.
rxn_positions = [x.split('>>')[0] for x in glory_rules.SMIRKS]

In [None]:
# Compile every entry of rxn_positions into a query molecule.
# NOTE(review): MolFromSmarts parses SMARTS text, so rxn_positions must hold
# strings at this point — confirm against the cell that builds it.
rxn_positions_mols = [Chem.MolFromSmarts(x) for x in rxn_positions]

In [None]:
# Match-count vector of the example molecule: one entry per reaction rule.
vec = [len(mol.GetSubstructMatches(x)) for x in rxn_positions_mols]

In [None]:
vec