# Tokenization
- This step only needs to be run once; the trained tokenizer is saved to disk and reloaded in the next section.

In [1]:
# One-time step, intentionally left commented out: trains a BertWordPieceTokenizer
# on data/drug/molecule_total.txt. Uncomment only when the tokenizer has to be (re)trained.
# from tokenizers import BertWordPieceTokenizer

# tokenizer = BertWordPieceTokenizer(
#     strip_accents=True,
#     lowercase=False
# )

# corpus_file = ['data/drug/molecule_total.txt']
# vocab_size = 200
# min_frequency = 10

# tokenizer.train(
#     files=corpus_file,
#     vocab_size=vocab_size,
#     min_frequency=min_frequency,
#     show_progress=True
# )

In [2]:
# One-time step, intentionally left commented out: writes the trained tokenizer
# under data/drug/tokenizer_model. The tokenizer.json produced by `tokenizer.save`
# is the file the "Load Tokenizer" cell reads.
# import os

# hf_model_path='data/drug/tokenizer_model'

# if not os.path.isdir(hf_model_path):
#     os.mkdir(hf_model_path)

# tokenizer.save_model(hf_model_path)
# tokenizer.save("data/drug/tokenizer_model/tokenizer.json")

# Load Tokenizer

In [1]:
from transformers import PreTrainedTokenizerFast

# Special-token strings handed to the fast tokenizer wrapper.
_special_tokens = {
    "pad_token": "[PAD]",
    "mask_token": "[MASK]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "unk_token": "[UNK]",
}

# Rebuild the tokenizer from the serialized tokenizer.json.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="data/drug/tokenizer_model/tokenizer.json",
    **_special_tokens,
)

# Number of entries in the vocabulary mapping; also used as the model's vocab_size.
vocab_size = len(fast_tokenizer.get_vocab())

print(f"load tokenizer\nvocab size: {vocab_size}\nspecial tokens: {fast_tokenizer.all_special_tokens}")

load tokenizer
vocab size: 200
special tokens: ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


In [2]:
import os
import pickle

# Build the train/valid/test split once and cache it; later runs reload the cached split.
CORPUS_PATH = "data/drug/molecule_total.txt"
SPLIT_CACHE_PATH = "data/drug/X.pkl"

if not os.path.exists(SPLIT_CACHE_PATH):
    from sklearn.model_selection import train_test_split

    # Explicit encoding so the read does not depend on the platform default.
    # Whitespace-only lines are dropped so they do not become empty training samples;
    # kept lines are left untouched (including their trailing newline).
    with open(CORPUS_PATH, 'r', encoding="utf-8") as f:
        data = [line for line in f if line.strip()]

    print(f"load dataset ... # of data: {len(data)}")

    # 80/20 train+valid/test, then 80/20 train/valid, both with a fixed seed.
    X_train, X_test = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)
    X_train, X_valid = train_test_split(X_train, test_size=0.2, random_state=42, shuffle=True)

    with open(SPLIT_CACHE_PATH, "wb") as f:
        pickle.dump([X_train, X_valid, X_test], f)
else:
    # NOTE(security): pickle.load can execute arbitrary code. Only load a cache
    # file that this notebook itself produced.
    with open(SPLIT_CACHE_PATH, "rb") as f:
        X_train, X_valid, X_test = pickle.load(f)

print(f"load dataset\nX_train: {len(X_train)}\nX_valid: {len(X_valid)}\nX_test: {len(X_test)}")

X_train: 59950729 X_valid: 14987683 X_test: 18734604


In [10]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import DataCollatorForLanguageModeling

class MaskedLMDataset(Dataset):
    """Map-style dataset that tokenizes one raw text sample per index.

    Args:
        data: indexable collection of text samples.
        tokenizer: object exposing ``encode(text, max_length=..., truncation=...)``.
        max_length: maximum number of token ids kept per sample.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def encode(self, data):
        """Return the token ids of ``data``, truncated to ``max_length``."""
        return self.tokenizer.encode(
            data,
            truncation=True,
            max_length=self.max_length,
        )

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its ids as a 1-D ``torch.long`` tensor."""
        token_ids = self.encode(self.data[idx])
        return torch.tensor(token_ids, dtype=torch.long)
    
    
# Collator that turns a list of token-id tensors into a masked-LM batch
# (mlm=True, masking probability 0.3).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=fast_tokenizer, mlm=True, mlm_probability=0.3
)


def _make_loader(dataset, shuffle=False, batch_size=1024, num_workers=16):
    """Build a DataLoader over ``dataset`` that collates batches with ``data_collator``.

    Args:
        dataset: dataset yielding 1-D token-id tensors.
        shuffle: reshuffle the sample order every epoch (use for training only).
        batch_size: samples per batch.
        num_workers: worker processes used for loading.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=data_collator,
        num_workers=num_workers,
    )


# Training data is reshuffled each epoch; validation/test keep a fixed order.
train_dataset = MaskedLMDataset(X_train, fast_tokenizer)
train_loader = _make_loader(train_dataset, shuffle=True)

valid_dataset = MaskedLMDataset(X_valid, fast_tokenizer)
valid_loader = _make_loader(valid_dataset)

test_dataset = MaskedLMDataset(X_test, fast_tokenizer)
test_loader = _make_loader(test_dataset)

In [None]:
import numpy as np
import torchmetrics
import pytorch_lightning as pl
from transformers import BertConfig, BertForMaskedLM
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

# Hyper-parameters of the BERT encoder, gathered in one mapping before being
# handed to BertConfig.
_bert_hparams = dict(
    # vocabulary / embeddings
    vocab_size=vocab_size,
    type_vocab_size=1,
    pad_token_id=0,
    max_position_embeddings=128,
    position_embedding_type="absolute",
    # transformer stack
    hidden_size=128,
    num_hidden_layers=8,
    num_attention_heads=8,
    intermediate_size=512,
    hidden_act="gelu",
    # regularisation
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
)

config = BertConfig(**_bert_hparams)


class Bert(pl.LightningModule):
    """LightningModule that trains a ``BertForMaskedLM`` built from ``config``.

    Logs ``train_loss`` / ``train_accuracy`` during training and
    ``valid_loss`` / ``valid_accuracy`` during validation. Accuracy is measured
    only on positions that are actual prediction targets.
    """

    # Label value that DataCollatorForLanguageModeling assigns to positions that
    # are not prediction targets (per the Hugging Face documentation).
    IGNORE_INDEX = -100

    def __init__(self, config):
        super().__init__()
        self.model = BertForMaskedLM(config)
        self.train_accuracy = torchmetrics.Accuracy()
        self.valid_accuracy = torchmetrics.Accuracy()

    def forward(self, input_ids, labels, attention_mask=None):
        """Run the masked-LM model and return its output (with ``loss`` and ``logits``).

        Args:
            input_ids: batch of token ids.
            labels: batch of target ids, ``IGNORE_INDEX`` where no prediction is scored.
            attention_mask: optional mask forwarded to the model. When ``None`` the
                model is called without one, exactly as before this parameter existed.
        """
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    def _shared_step(self, batch, accuracy):
        """Forward one batch, update ``accuracy`` on the scored positions, return the loss."""
        labels = batch['labels']
        # NOTE(review): a batch collated from plain tensors carries no
        # 'attention_mask', so padding is attended to; it is passed through
        # here whenever the collator does provide one.
        output = self(batch['input_ids'], labels, attention_mask=batch.get('attention_mask'))

        preds = output.logits.argmax(dim=-1)
        # Select targets by the ignore value instead of `labels > 0`, which would
        # also drop a legitimate target whose token id is 0.
        scored = labels != self.IGNORE_INDEX
        accuracy(preds[scored], labels[scored])

        return output.loss

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss/accuracy; return the loss for backprop."""
        loss = self._shared_step(batch, self.train_accuracy)

        # Log the detached tensor rather than float(loss): float() forces a
        # device-to-host sync on every step.
        self.log('train_loss', loss.detach(), on_step=True, on_epoch=True, prog_bar=True)
        # Logging the metric object lets Lightning compute the epoch accuracy from
        # the accumulated state instead of averaging per-batch accuracies.
        self.log("train_accuracy", self.train_accuracy, on_step=False, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss/accuracy (epoch-level only)."""
        loss = self._shared_step(batch, self.valid_accuracy)

        self.log('valid_loss', loss.detach(), on_step=False, on_epoch=True, prog_bar=True)
        self.log("valid_accuracy", self.valid_accuracy, on_step=False, on_epoch=True, prog_bar=True, logger=True)

    def configure_optimizers(self):
        """AdamW with a ReduceLROnPlateau scheduler driven by ``valid_loss``."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10)

        # The scheduler needs `valid_loss` to be logged, i.e. a validation
        # dataloader must be passed to the trainer.
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "valid_loss"}

    
model = Bert(config)

# Name under which Bert.validation_step logs the validation loss. The checkpoint
# and early-stopping callbacks must monitor exactly this key; the previous
# 'val_loss' key is never logged by the module.
MONITOR_METRIC = 'valid_loss'

callbacks = [
    ModelCheckpoint(
        monitor=MONITOR_METRIC,
        dirpath='weights/molecule_bert',
        # The placeholder must also use the logged metric name.
        filename='molecule_bert-{epoch:02d}-{valid_loss:.2f}',
    ),
    EarlyStopping(MONITOR_METRIC, patience=25),
]

trainer = pl.Trainer(max_epochs=100, gpus=1, enable_progress_bar=True, callbacks=callbacks)
# Pass the validation loader as well: without it no validation loop runs, so
# `valid_loss` is never produced for the callbacks or the LR scheduler.
trainer.fit(model, train_loader, val_dataloaders=valid_loader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type            | Params
---------------------------------------------------
0 | model          | BertForMaskedLM | 1.6 M 
1 | train_accuracy | Accuracy        | 0     
2 | valid_accuracy | Accuracy        | 0     
---------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.582     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]