## 필요한거 설치하기

In [1]:
!pip install transformers torch
!pip install git+https://github.com/facebookresearch/esm.git

Collecting git+https://github.com/facebookresearch/esm.git
  Cloning https://github.com/facebookresearch/esm.git to /tmp/pip-req-build-pj2wax1y
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/esm.git /tmp/pip-req-build-pj2wax1y
  Resolved https://github.com/facebookresearch/esm.git to commit 723e85829b1c175f23d9c1195b0fb47d6b2bf5cd
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25h

## 필요한거 import하고 로딩하기

In [2]:
from transformers import BertForMaskedLM, BertConfig, BertTokenizer
from esm.data import ESMStructuralSplitDataset
import os
import numpy as np

In [3]:
esm_structural_train = ESMStructuralSplitDataset(
    split_level='superfamily', 
    cv_partition='4', 
    split='train', 
    root_path = os.path.expanduser('~/.cache/torch/data/esm'),
    download=True
)

Files already downloaded and verified


In [4]:
# pretraining을 이 데이터로만 하면 의미가 없습니다! 더 많은 데이터를 가지고 하는게 좋고, 이번에는 예시로서 보여드리기위해 이것으로 합니다.
seqs = [data['seq'] for data in esm_structural_train]

In [5]:
tokenizer = BertTokenizer.from_pretrained('mytoken')
tokenizer

PreTrainedTokenizer(name_or_path='mytoken', vocab_size=32, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [6]:
configuration = BertConfig()

In [7]:
model = BertForMaskedLM(configuration)

## 변수, 함수들의 역할 확인

In [8]:
configuration

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [9]:
def make_input_n_label(seq):
    seq = list(seq)
    seq = np.array(seq, dtype='<U6')
    mask1 = np.random.random(size=seq.shape)<0.15
    u = np.random.random(size=seq.shape)
    mask = mask1 & (u < 0.8)
    tokens = list('ABCDEFGHIKLMNOPQRSTUVWXYZ-.')
    random_another_mask = mask1 & (u>=0.8) & (u<0.9)
    random_another = np.random.choice(tokens, size = seq.shape)
    
    seqm = seq.copy()
    seqm[mask] = '[MASK]'
    seqm[random_another_mask] = random_another[random_another_mask]
    
    seqL = seq.copy()
    seqL[~mask] = '[MASK]'
    
    return ' '.join(seqm), ' '.join(seqL)

In [10]:
make_input_n_label(seqs[0])

('[MASK] [MASK] T V R Q E R L K S I V R I L E [MASK] S K E P V S G A Q L A E E L S V S R Q [MASK] I V [MASK] [MASK] I A Y L R F L G [MASK] N I V A T P R G H V L A G G',
 'M K [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] R [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] V [MASK] [MASK] Q D [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] Y [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK]')

In [11]:
example = [make_input_n_label(seq) for seq in seqs[:4]]

In [12]:
inputs = [e[0] for e in example]
labels = [e[1] for e in example]
inputs = tokenizer(inputs, return_tensors='pt', padding=True)
labels = tokenizer(labels, return_tensors='pt', padding=True)

In [13]:
inputs

{'input_ids': tensor([[ 2, 16, 14, 23,  4,  4, 20,  9, 21, 15,  4,  5, 28, 25, 21,  4, 15,  9,
         11, 22, 14,  9, 19, 25, 22, 11,  5, 20, 15,  5,  9,  9, 15,  4,  4, 22,
         21, 20, 25,  4, 25,  4,  4, 13,  5, 28, 15, 21, 22, 15,  4, 28,  4, 13,
         25,  5, 23,  4, 21,  4, 28, 25, 15,  5, 11, 11,  3,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0],
        [ 2, 14,  5, 15, 23,  5, 21, 20, 20,  9, 25,  4, 25,  4, 13, 21,  8, 12,
         13, 22, 20, 23, 11, 16, 19, 19, 23, 21,  5,  9, 13,  5, 20, 21, 15, 11,
         10, 21, 22, 19, 17,  5,  4,  9,  9,  4, 15, 14,  5, 15,  4, 21, 14, 11,
         25,  4,  4, 13, 25, 22, 11,  5, 22, 21, 11, 13,  4, 15, 15, 20,  9,  9,
          3,  0,  0,  0,  0],
        [ 2, 11, 20, 21,  4,  4, 14,  4, 21,  9,  4, 13, 16,  4, 17,  8, 13,  9,
          4, 20,  8,  9, 15,  4,  8, 21, 15, 21,  9,  5, 11,  4, 17, 25, 23, 20,
          5, 23, 25, 22, 21,  8, 13, 14,  9, 16,  4, 15, 25, 14, 25, 19,  4,  5,
          4,  4, 21, 28, 14, 28,  4

In [14]:
labels['input_ids'][labels['input_ids']<5]=-100

In [15]:
labels['input_ids']

tensor([[-100, -100, -100, -100,   25,   21, -100, -100, -100, -100,   14, -100,
         -100, -100, -100,   13, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100,   22,   25, -100,
         -100, -100, -100,   13, -100,   20,    8, -100, -100, -100, -100, -100,
         -100, -100,   11, -100,   17, -100, -100, -100, -100,   19, -100,   11,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100],
        [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,   10,
         -100,   15, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100,    5, -100, -100,   12, -100, -100,
         -100, -100,    5, -100, -100, -100, -100,   13,    9, -100, -100, -100,
         -100, -100, -100, -100, -100, -100,   21, -100, -100, -100, 

In [16]:
out = model(**inputs, labels=labels['input_ids'])
out

MaskedLMOutput(loss=tensor(10.6432, grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.0000,  1.2639,  0.6349,  ..., -0.5563, -0.4117,  0.5651],
         [ 0.0000, -0.5571,  0.7112,  ...,  0.6025,  0.4129,  0.3255],
         [ 0.0000,  0.1445,  0.4176,  ...,  0.4882, -0.2714,  0.5091],
         ...,
         [ 0.0000,  0.9921,  0.5531,  ...,  0.1027, -0.6658,  0.1879],
         [ 0.0000,  0.5340,  1.0160,  ..., -0.7268, -0.5335,  0.1841],
         [ 0.0000,  0.2934,  0.7532,  ..., -0.2197,  0.1762,  0.2255]],

        [[ 0.0000,  0.2135, -0.0099,  ..., -0.3568, -0.2432,  0.6962],
         [ 0.0000,  0.2477,  0.6386,  ..., -0.2720, -0.6797,  0.6723],
         [ 0.0000, -0.2421,  0.1027,  ...,  0.0649,  0.1477, -0.0215],
         ...,
         [ 0.0000,  0.5423,  0.5032,  ...,  0.1286, -0.2271, -0.0660],
         [ 0.0000,  0.6523,  0.6089,  ..., -0.1663, -0.1618,  0.4893],
         [ 0.0000,  0.3999,  0.3883,  ..., -0.1876, -0.0862,  0.1799]],

        [[ 0.0000,  0.1019,  0.7386,  ..., -

## 이제 시작해봅시다!

In [17]:
device = 'cpu' # gpu가 있으면 이것을 'cuda'로 변경하면 됨.
model = model.to(device)

In [18]:
def generate_batch(seqs, batch_size, device):
    n = len(seqs)
    nb = n//batch_size
    for i in range(nb):
        example = [make_input_n_label(seq) for seq in seqs[i*batch_size:(i+1)*batch_size]]
        inputs = [e[0] for e in example]
        labels = [e[1] for e in example]
        inputs = tokenizer(inputs, return_tensors='pt', padding=True)
        labels = tokenizer(labels, return_tensors='pt', padding=True)
        inputs['input_ids'].to(device)
        inputs['token_type_ids'].to(device)
        inputs['attention_mask'].to(device)
        labels['input_ids'][labels['input_ids']<5]=-100
        labels['input_ids'].to(device)
        yield inputs, labels

In [20]:
from transformers import AdamW
optim = AdamW(model.parameters(), lr = 1e-5)



In [None]:
epochs = 10
batch_size = 10
for epoch in range(epochs):
    gen = generate_batch(seqs, batch_size, device)
    for inputs, labels in gen:
        optim.zero_grad()
        out = model(**inputs, labels=labels['input_ids'])
        loss = out.loss
        loss.backward()
        optim.step()
        print('loss : ',loss.item())

In [22]:
os.makedirs('pretrained', exist_ok=True)
model.save_pretrained('./pretrained/')