In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [2]:
!nvidia-smi

Thu May  5 20:11:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.15       Driver Version: 512.15       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:2B:00.0  On |                  N/A |
|  0%   55C    P8     6W / 170W |    517MiB / 12288MiB |     10%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
torch.manual_seed(50)

<torch._C.Generator at 0x21fe7cf9af0>

### Loading GPT2-Medium Model from 🤗 Model Hub 

In [4]:


tokenizer = GPT2Tokenizer.from_pretrained('indonesian-nlp/gpt2-medium-indonesian', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('indonesian-nlp/gpt2-medium-indonesian').cuda()
model.resize_token_embeddings(len(tokenizer))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 1024)

In [5]:
captions = pd.read_csv('list_caption.csv')['processed']

In [6]:
max_length = max([len(tokenizer.encode(caption, max_length=512)) for caption in captions])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
class CaptionDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [8]:
dataset = CaptionDataset(captions, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])


In [9]:
import gc
gc.collect()

13

In [10]:
torch.cuda.empty_cache()

In [11]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=3, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none', fp16=True)


In [None]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

In [43]:
#save model
model.save_pretrained('./results')

Configuration saved in ./results\config.json
Model weights saved in ./results\pytorch_model.bin


### OA Line Caption Generator

In [282]:
generated = tokenizer("<|startoftext|> Informasi Menggelitik", return_tensors="pt").input_ids.cuda()

In [283]:
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=512, top_p=1.5, temperature=0.8)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [284]:
for i, sample_output in enumerate(sample_outputs):
    print(tokenizer.decode(sample_output, skip_special_tokens=True))

 Informasi Menggelitik  - "Ya ampun, sekarang udah mau kuliah lagi aja!" -nya, salah satu anggota dari BSO Minat dan Bakat HIMAGIKA yang bertugas untuk mengoordinasi minat dan bakat di dalam dan di luar bidang gizi kesehatan FK UGM. 
Bironya anak Gizi Kesehatan, tapi yang ngatur anak-anak Gizi UTS ya kan?? 
Eh, bukan..... Bironya anak-anak yang hobi baca, kalo kalian suka baca apa? 
Kalo kalian suka baca apa nih?
-Sastra-







FIND MORE!
Email : himagika@ugm.ac.id


Website : himagika.fk.ugm.ac.id
Tiktok : @himagika.ugm
LINE@ : @clw3634c




