In [1]:
import pandas as pd
import gdown
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AdamW
)
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
gdown.download('https://drive.google.com/uc?id=1Ebtb6ItULnfFqz0qYKdVkaror-GvuNJr')#https://drive.google.com/file/d/1Ebtb6ItULnfFqz0qYKdVkaror-GvuNJr/view?usp=sharing
gdown.download('https://drive.google.com/uc?id=1SA4i75wppxd2Qf6E3-8Qe2quIPlYMS9d')#https://drive.google.com/file/d/1SA4i75wppxd2Qf6E3-8Qe2quIPlYMS9d/view?usp=sharing

Downloading...
From: https://drive.google.com/uc?id=1Ebtb6ItULnfFqz0qYKdVkaror-GvuNJr
To: /content/vi_squad.csv
100%|██████████| 93.3M/93.3M [00:01<00:00, 74.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1SA4i75wppxd2Qf6E3-8Qe2quIPlYMS9d
To: /content/vi_squad_dev.csv
100%|██████████| 11.7M/11.7M [00:00<00:00, 49.9MB/s]


'vi_squad_dev.csv'

In [3]:
df = pd.read_csv('vi_squad.csv')
df.head()

Unnamed: 0,context,question,answer
0,"Architecturally, trường học có một nhân vật Cô...",Ai đã làm Trinh nữ Mary bị cáo buộc xuất hiện ...,Saint Bernadette Soubirous
1,"Architecturally, trường học có một nhân vật Cô...",Cái gì ở trước tòa nhà chính của Notre Dame?,một bức tượng đồng của Chúa Kitô
2,"Architecturally, trường học có một nhân vật Cô...",Thánh đường của trái tim thiêng liêng tại Notr...,Tòa nhà chính
3,"Architecturally, trường học có một nhân vật Cô...",Glimpses ở Notre Dame là gì?,một nơi Marian của cầu nguyện và phản ánh
4,"Architecturally, trường học có một nhân vật Cô...",Cái gì ngồi trên đỉnh của tòa nhà chính tại No...,một bức tượng vàng của Trinh Mary


In [4]:
print(len(df))
df.dropna()
len(df)

84816


84816

In [5]:
# token_counts = df['context'].apply(lambda x: len(tokenizer.encode(x)))

# max_token_count = token_counts.max()
# max_token_count

In [6]:
tokenizer = AutoTokenizer.from_pretrained("NlpHUST/gpt2-vietnamese")
model = AutoModelForCausalLM.from_pretrained("NlpHUST/gpt2-vietnamese")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/854k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/512k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [7]:
print(tokenizer)

GPT2TokenizerFast(name_or_path='NlpHUST/gpt2-vietnamese', vocab_size=50257, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50257: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [8]:
PAD_TOKEN = '<pad>'

In [9]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, src_max_len: int, tgt_max_len: int):
        self.data = data
        self.tokenizer = tokenizer
        self.src_max_len = src_max_len
        self.tgt_max_len = tgt_max_len
        self.tokenizer.add_special_tokens({'pad_token': '<s>'})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data_row = self.data.iloc[idx]

        input_encoding = self.tokenizer(
            data_row['context'],
            truncation=True,
            max_length=self.src_max_len,
            padding='max_length',
            return_tensors='pt'
        )

        output_text = f"{data_row['question']} {PAD_TOKEN} {data_row['answer']}"
        output_encoding = self.tokenizer(
            output_text,
            truncation=True,
            max_length=self.tgt_max_len,
            padding='max_length',
            return_tensors='pt'
        )

        labels = output_encoding['input_ids'].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': labels.squeeze()
        }

In [10]:
train_pairs, test_pairs = train_test_split(df, test_size=0.2, random_state=42)
train_pairs, dev_pairs = train_test_split(train_pairs, test_size=0.2, random_state=42)


In [11]:
src_max_len = 1024
tgt_max_len = 1024
batch_size = 8


In [12]:
train_dataset = CustomDataset(train_pairs, tokenizer, src_max_len, tgt_max_len)
dev_dataset = CustomDataset(dev_pairs, tokenizer, src_max_len, tgt_max_len)
test_dataset = CustomDataset(test_pairs, tokenizer, src_max_len, tgt_max_len)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [13]:
epochs = 3
lr = 5e-5

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)



# for name, param in model.named_parameters():
#     if 'final_layer' in name or 'decoder' in name:
#         param.requires_grad = True
#     else:
#         param.requires_grad = False

# optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
optimizer = AdamW(model.parameters(), lr=lr)



best_dev_loss = float('inf')
for epoch in range(epochs):
    model.train()
    train_loop = tqdm(train_dataloader, leave=True)
    for batch in train_loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        train_loop.set_description(f'Epoch {epoch}')
        train_loop.set_postfix(loss=loss.item())

    model.eval()
    dev_loss = 0
    with torch.no_grad():
        for batch in dev_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            dev_loss += outputs.loss.item()

    dev_loss /= len(dev_dataloader)

    if dev_loss < best_dev_loss:
        best_dev_loss = dev_loss
        model.save_pretrained("gpt_2_fine_tuning")
        tokenizer.save_pretrained("gpt_2_fine_tuning")

model = AutoModelForCausalLM.from_pretrained("gpt_2_fine_tuning")
tokenizer = AutoTokenizer.from_pretrained("gpt_2_fine_tuning")
model.to(device)
model.eval()

test_loss = 0
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        test_loss += outputs.loss.item()

test_loss /= len(test_dataloader)
print(f'Test loss: {test_loss}')


  0%|          | 0/6786 [00:02<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.53 GiB. GPU 