## PRELIMINARIES

In [None]:
# Install the notebook's runtime dependencies into the kernel's environment.
# None of the packages below is version-pinned.
%pip install numpy
%pip install pandas
%pip install torch
# transformers is installed straight from its GitHub repository rather than from PyPI.
%pip install git+https://github.com/huggingface/transformers.git
%pip install tqdm
# accelerate is likewise installed straight from its GitHub repository.
%pip install git+https://github.com/huggingface/accelerate.git


In [13]:
# NOTE(review): presumably required by the tokenizer loaded later — confirm.
%pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.


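**RESTART KERNEL** — restart the kernel now so that the packages installed above are picked up before running the cells below.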

## IMPORT LIBRARIES

In [14]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import (
    GPT2LMHeadModel,
    PreTrainedTokenizerFast,
    AdamW,
    get_linear_schedule_with_warmup
)
import pandas as pd


## DATASET

In [15]:
# Device every sample tensor is moved to before it is returned.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class GPTDataset(Dataset):
    """Fixed-length (input, mask, label) samples built from `prev`/`next` text pairs.

    Each row of `data` must provide a `prev` and a `next` column. Both texts
    are tokenized and concatenated; every returned sequence is truncated or
    right-padded to exactly `max_len` entries so that samples can always be
    stacked into a batch.

    Args:
        tokenizer: object exposing `encode(text) -> list[int]` and the
            attributes `pad_token_id`, `mask_token_id` and `eos_token_id`.
        data: pandas DataFrame with `prev` and `next` columns.
        max_len: length of every returned sequence (defaults to 50, the value
            that was previously hard-coded).
    """

    def __init__(self, tokenizer, data, max_len=50):
        self.data = data
        self.length = self.data.shape[0]
        self.tokenizer = tokenizer
        self.max_len = max_len

    def _fit(self, ids, fill):
        """Return `ids` cut or right-padded with `fill` to exactly `max_len` items."""
        ids = ids[: self.max_len]
        return ids + [fill] * (self.max_len - len(ids))

    def __getitem__(self, i):
        """Return `(inputs_id, mask, outputs_id)` LongTensors of length `max_len` on `device`."""
        row = self.data.iloc[i]
        prev = self.tokenizer.encode(row['prev'])
        _next = self.tokenizer.encode(row['next'])
        pad_id = self.tokenizer.pad_token_id

        # inputs_id = prev + [self.tokenizer.bos_token_id] + _next
        inputs_id = self._fit(prev + _next, pad_id)

        # Labels are the inputs shifted left by one position: the `prev` part is
        # replaced by mask tokens and an EOS token closes the `next` part.
        outputs_id = self._fit(
            [self.tokenizer.mask_token_id] * (len(prev) - 1)
            + _next
            + [self.tokenizer.eos_token_id],
            pad_id,
        )

        # 1 on the input positions that hold `next` tokens, 0 elsewhere.
        # NOTE(review): the label for the first `next` token sits on the last
        # `prev` position, whose mask is 0 — confirm this offset is intended.
        mask = self._fit([0] * len(prev) + [1] * len(_next), 0)

        # Without the truncation in `_fit`, a pair longer than `max_len` would
        # yield over-long tensors that cannot be stacked with the other samples.
        inputs_id = torch.LongTensor(inputs_id).to(device)
        outputs_id = torch.LongTensor(outputs_id).to(device)
        mask = torch.LongTensor(mask).to(device)
        return inputs_id, mask, outputs_id

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return self.length

## DATALOADER

In [16]:
def GPTDataLoader(tokenizer, data, batch_size):
    """Build a `GPTDataset` from `data` and return a `DataLoader` over it.

    Args:
        tokenizer: tokenizer handed to `GPTDataset`.
        data: DataFrame handed to `GPTDataset`.
        batch_size: number of samples per batch.
    """
    dataset = GPTDataset(tokenizer=tokenizer, data=data)
    loader = DataLoader(dataset, batch_size=batch_size)
    return loader

## TRAIN

In [17]:
# Experiment configuration.
# -- pretrained checkpoint and data location
model_name = "skt/kogpt2-base-v2"
data_dir = "./data"

# -- optimisation settings
epochs, batch_size = 50, 32
lr, warmup_steps = 2e-5, 200

In [21]:
# Load the checkpoint's tokenizer, registering the special tokens explicitly.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    model_name,
    bos_token='<s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)
# Path of the training CSV inside the data directory.
file_path = os.path.join(data_dir, "final_train.csv")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
# make train and val dataloader
# Keep only the two text columns the dataset reads.
data = pd.read_csv(file_path)[['prev', 'next']]
# Shuffle the rows once so the split below is random.
data = data.sample(frac=1).reset_index(drop=True)
num = int(len(data) * 0.8)
# First 80% for training, remaining 20% held out for validation.
# (Both halves previously used `data[:num]`, so validation ran on training rows.)
train_data, val_data = data[:num], data[num:]

train_dataloader = GPTDataLoader(tokenizer=tokenizer,data=train_data, batch_size=batch_size)
val_dataloader = GPTDataLoader(tokenizer=tokenizer, data=val_data, batch_size=batch_size)

In [None]:
# Use the GPU when one is available and place the pretrained LM on it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = GPT2LMHeadModel.from_pretrained(model_name)
model.to(device)
# set to train mode
model.train()

In [None]:
def _masked_lm_loss(model, criterion, input_ids, mask, label):
    """Mean per-token loss over the positions where `mask` is 1.

    Args:
        model: language model whose output exposes `.logits`, indexed as
            (batch, position, vocabulary).
        criterion: loss created with `reduction="none"` so it returns one
            value per position.
        input_ids, mask, label: one batch as yielded by the dataloaders.

    Returns:
        Scalar tensor: sum of the selected per-token losses divided by the
        number of selected positions.
    """
    logits = model(input_ids).logits
    # The criterion wants the class dimension second, hence the transpose.
    token_loss = criterion(logits.transpose(2, 1), label)
    weights = mask.to(token_loss.dtype)
    # Multiplying by the mask removes the unselected positions from the sum.
    # Overwriting their logits with a large negative constant (the previous
    # approach) leaves a uniform distribution there, which still contributes
    # log(vocab_size) per position and inflates the reported loss.
    return (token_loss * weights).sum() / weights.sum().clamp(min=1)


# NOTE(review): `transformers.AdamW` is deprecated upstream in favour of
# `torch.optim.AdamW` — confirm it is still importable with the installed build.
optimizer = AdamW(model.parameters(), lr=lr)
# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch. With `num_training_steps=-1` the linear decay evaluates to 0 as
# soon as warmup ends, i.e. the learning rate stays at zero afterwards.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)
criterion = torch.nn.CrossEntropyLoss(reduction="none")
min_loss = float("inf")
for epoch in range(epochs):
    print(f"Training epoch {epoch}")
    # Validation below switches to eval mode, so re-enable training mode here.
    model.train()
    for samples in tqdm(train_dataloader):
        optimizer.zero_grad()
        input_ids, mask, label = samples
        avg_loss = _masked_lm_loss(model, criterion, input_ids, mask, label)
        avg_loss.backward()
        optimizer.step()
        scheduler.step()

    print("Validating...")
    # Disable dropout so the validation loss is deterministic.
    model.eval()
    cnt = 0
    test_loss = 0.0
    with torch.no_grad():
        for samples in tqdm(val_dataloader):
            cnt += 1
            input_ids, mask, label = samples
            # .item() keeps the running total a plain Python float.
            test_loss += _masked_lm_loss(model, criterion, input_ids, mask, label).item()
    test_loss /= cnt
    print(f"epoch: {epoch}/{epochs} valdiation loss: {test_loss:0.2f}")

    # Keep only the checkpoint with the lowest validation loss seen so far.
    if test_loss < min_loss:
        min_loss = test_loss
        model.save_pretrained("./best_model_theRealNewVersion")
print("Training Done")