In [None]:
!pip install transformers

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project path configured later in
# this notebook ('/content/drive/MyDrive/project3/') lives under this mount.
drive.mount('/content/drive')

In [None]:
from transformers import (
    EncoderDecoderModel,
    BertTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Trainer
)
from torch.utils.data import Dataset, DataLoader
import torch

## Google Drive path and device setup

In [None]:
# Project root on the mounted Drive. Other paths in this notebook are built by
# string concatenation onto this prefix, so the trailing slash is required.
drive_path = '/content/drive/MyDrive/project3/'

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Data Load & Split

In [None]:
# Build the tokenizer from the project's own vocabulary file on Drive.
# Lower-casing is switched off, so the case of the input text is preserved.
tokenizer = BertTokenizer(
    vocab_file=drive_path + 'dump/wpm-vocab-extend-30522.txt',
    do_lower_case=False,
)

In [None]:
import pickle

def _read_lines(path):
    """Return the lines of a UTF-8 text file without their trailing newline."""
    # rstrip('\n') rather than line[:-1]: the last line of a file may have no
    # newline, in which case slicing would cut off a real character.
    with open(path, 'r', encoding='utf-8') as fd:
        return [line.rstrip('\n') for line in fd]


def make_corpus(src_file_path, trg_file_path, file_type: str):
    """Load a parallel corpus as a list of ``[source, target]`` pairs.

    Args:
        src_file_path: Path of the source-side file.
        trg_file_path: Path of the target-side file.
        file_type: ``'jit'`` for plain-text files holding one sentence per
            line, or ``'pickled_ai_hub'`` for pickle files whose payload
            supports ``len()`` and integer indexing (e.g. a list of strings).

    Returns:
        A list whose element ``i`` is ``[src_i, trg_i]``.

    Raises:
        ValueError: If ``file_type`` is not one of the supported values, or if
            the two sides do not contain the same number of entries (a
            misaligned parallel corpus would silently pair wrong sentences).
    """
    if file_type == 'jit':
        src_items = _read_lines(src_file_path)
        trg_items = _read_lines(trg_file_path)

    elif file_type == 'pickled_ai_hub':
        # NOTE(security): pickle.load can execute arbitrary code while
        # deserialising; only point this at files you produced yourself.
        with open(src_file_path, 'rb') as f:
            src_items = pickle.load(f)

        with open(trg_file_path, 'rb') as f:
            trg_items = pickle.load(f)

    else:
        raise ValueError(
            f"Unsupported file_type {file_type!r}; expected 'jit' or 'pickled_ai_hub'."
        )

    if len(src_items) != len(trg_items):
        raise ValueError(
            f'Source and target sizes differ: {len(src_items)} vs {len(trg_items)} '
            f'({src_file_path} / {trg_file_path}).'
        )

    # Index-based pairing keeps the original access pattern for any indexable
    # payload, not only lists.
    return [[src_items[i], trg_items[i]] for i in range(len(src_items))]

In [None]:
# Translation direction for this run. Flip this one value instead of swapping
# commented-out code:
#   'ko2je' -> Korean (source) to Jeju (target)
#   'je2ko' -> Jeju (source) to Korean (target)
DIRECTION = 'ko2je'

# (source, target) file-name stems for each corpus and direction. The AI Hub
# "standard" file is paired with the JIT "ko" side and "trans" with "je".
CORPUS_SIDES = {
    'ko2je': {'ai_hub': ('standard', 'trans'), 'jit': ('ko', 'je')},
    'je2ko': {'ai_hub': ('trans', 'standard'), 'jit': ('je', 'ko')},
}
ai_hub_src, ai_hub_trg = CORPUS_SIDES[DIRECTION]['ai_hub']
jit_src, jit_trg = CORPUS_SIDES[DIRECTION]['jit']

# AI Hub corpus (pickled). The last 10000 pairs are held out: the first 5000
# of them form the dev split and the final 5000 the test split.
ai_hub_dir = drive_path + '전처리된_AI_HUB_data/'
ai_hub_data = make_corpus(
    ai_hub_dir + f'ai_hub_{ai_hub_src}_v2.pkl',
    ai_hub_dir + f'ai_hub_{ai_hub_trg}_v2.pkl',
    'pickled_ai_hub',
)
train_ai_hub = ai_hub_data[:-10000]
dev_ai_hub = ai_hub_data[-10000:-5000]
test_ai_hub = ai_hub_data[-5000:]

# JIT corpus (plain text, one sentence per line), already split on disk.
train_data = make_corpus(drive_path + f'jit/{jit_src}.train', drive_path + f'jit/{jit_trg}.train', 'jit')
dev_data = make_corpus(drive_path + f'jit/{jit_src}.dev', drive_path + f'jit/{jit_trg}.dev', 'jit')
test_data = make_corpus(drive_path + f'jit/{jit_src}.test', drive_path + f'jit/{jit_trg}.test', 'jit')

In [None]:
# Quick size check of the dev split as loaded so far (when the cells are run
# in order, this is before the AI Hub pairs are appended).
len(dev_data)

In [None]:
# Extend every JIT split with the matching AI Hub split.
# NOTE(review): each name is rebound to the longer list, so re-running this
# cell without reloading the data appends the AI Hub pairs a second time.
train_data = [*train_data, *train_ai_hub]
dev_data = [*dev_data, *dev_ai_hub]
test_data = [*test_data, *test_ai_hub]

# Sizes of the combined train / dev / test splits.
(len(train_data), len(dev_data), len(test_data))

## Data Tokenizing

In [None]:
def tokenized_data(tokenizer, data, max_length=128, stride=30):
    
    cnt = 0
    embeddings = []
    for src, trg in data:
        src_sample = tokenizer(src, truncation=True, max_length=max_length, stride=stride, return_token_type_ids=False, return_attention_mask=False, return_overflowing_tokens=True)
        trg_sample = tokenizer(trg, truncation=True, max_length=max_length, stride=stride, return_token_type_ids=False, return_attention_mask=False, return_overflowing_tokens=True)
        embeddings.append({'input_ids' : src_sample['input_ids'],
                           'labels': trg_sample['input_ids']})
        if src_sample['num_truncated_tokens'] > 0 and trg_sample['num_truncated_tokens'] > 0:
            src_tmp = src_sample['overflowing_tokens']
            trg_tmp = trg_sample['overflowing_tokens']
            while len(src_tmp) > 0 and len(trg_tmp) > 0:
                cnt += 1
                src_input = [tokenizer.cls_token_id]
                trg_input = [tokenizer.cls_token_id]
                src_input.extend(src_tmp[:max_length-2])
                src_input.append(tokenizer.sep_token_id)
                trg_input.extend(trg_tmp[:max_length-2])
                trg_input.append(tokenizer.sep_token_id)                
                embeddings.append({'input_ids' : src_input,
                               'labels': trg_input})
                src_tmp = src_tmp[max_length-stride-2:2*max_length-stride-2]
                trg_tmp = trg_tmp[max_length-stride-2:2*max_length-stride-2]
    print(f'Processed {cnt} amount of overflowing token set!')

    cnt = 0
    for item in embeddings:
        if len(item['input_ids']) == 2 or len(item['labels']) == 2:
            embeddings.remove(item)
            cnt += 1
    print(f'Removed {cnt} amount of empty token set!')
    return embeddings

In [None]:
# Tokenize the three splits in order (train, dev, test) with the default
# max_length / stride; each call prints its own overflow and empty-sample counts.
train_tokenized_data = tokenized_data(tokenizer=tokenizer, data=train_data)
dev_tokenized_data = tokenized_data(tokenizer=tokenizer, data=dev_data)
test_tokenized_data = tokenized_data(tokenizer=tokenizer, data=test_data)

In [None]:
class DatasetRetriever(Dataset):
    """Dataset wrapper that serves pre-tokenized features as tensors.

    ``features`` is an indexable collection of mappings, each providing an
    ``'input_ids'`` and a ``'labels'`` sequence of token ids. Items are
    converted to ``torch.long`` tensors on access; no padding is applied here.
    """

    def __init__(self, features):
        super().__init__()
        self.features = features

    def __len__(self):
        """Number of stored features."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the feature at ``index`` as a dict of ``torch.long`` tensors."""
        record = self.features[index]
        return {
            field: torch.tensor(record[field], dtype=torch.long)
            for field in ('input_ids', 'labels')
        }

In [None]:
# Wrap each tokenized split in a torch Dataset for the Trainer.
train_dataset = DatasetRetriever(features=train_tokenized_data)
dev_dataset = DatasetRetriever(features=dev_tokenized_data)
test_dataset = DatasetRetriever(features=test_tokenized_data)

## Modeling

In [None]:
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

# Seed BEFORE the model is built: the weights are randomly initialised at
# construction time, so seeding only afterwards leaves the initial weights
# different on every run.
torch.manual_seed(42)

# Encoder and decoder both use the default BERT configuration, except that the
# vocabulary size follows the custom tokenizer instead of the library default,
# so the embedding table always covers every id the tokenizer can emit.
vocab_size = len(tokenizer)
config_encoder = BertConfig(vocab_size=vocab_size)
# The decoder must be causal and own cross-attention layers. These flags only
# take effect if they are set before the model is instantiated.
config_decoder = BertConfig(vocab_size=vocab_size, is_decoder=True, add_cross_attention=True)

config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

# Randomly initialised BERT-to-BERT encoder-decoder (no pretrained weights).
model = EncoderDecoderModel(config=config)

# Special-token ids used for the decoder inputs and for padding.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Re-bind the names to the configs actually held by the model, so the
# assignments below are applied to the model's own configuration.
config_encoder = model.config.encoder
config_decoder = model.config.decoder
# Encoder: sequence boundary tokens.
config_encoder.bos_token_id = tokenizer.cls_token_id
config_encoder.eos_token_id = tokenizer.sep_token_id
config_encoder.decoder_start_token_id = tokenizer.cls_token_id
# Decoder: causal LM with cross-attention, same boundary tokens.
config_decoder.is_decoder = True
config_decoder.add_cross_attention = True
config_decoder.bos_token_id = tokenizer.cls_token_id
config_decoder.eos_token_id = tokenizer.sep_token_id
config_decoder.decoder_start_token_id = tokenizer.cls_token_id

## Loading Model on GPU

In [None]:
# Fix the RNG seeds for the steps that follow (the CUDA call is a no-op on
# machines without a GPU).
torch.manual_seed(42)
torch.cuda.manual_seed(42)
# Move the model to the device chosen in the configuration cell. model.cuda()
# raised on CPU-only runtimes even though `device` already had a CPU fallback.
model.to(device)

## Train

In [None]:
# Batch collator for the seq2seq features; it is given the model as well as
# the tokenizer.
collator = DataCollatorForSeq2Seq(tokenizer, model)

# Training configuration: evaluate and checkpoint once per epoch, keep at most
# 5 checkpoints, and reload the best checkpoint when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
arguments = Seq2SeqTrainingArguments(
    output_dir=drive_path + 'dump/models',
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,
    gradient_accumulation_steps=1,
    save_total_limit=5,
    dataloader_num_workers=1,
    # Mixed precision is only usable on CUDA; a hard-coded True makes the
    # arguments fail to build on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=arguments,
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

In [None]:
# Run training with the Trainer configured above.
trainer.train()

# Save the model to Drive; this is the same folder the Trainer uses as its
# checkpoint output_dir.
model.save_pretrained(drive_path + "dump/models")