In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

In [2]:
!nvidia-smi

Fri May  6 19:46:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.15       Driver Version: 512.15       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:2B:00.0  On |                  N/A |
| 30%   44C    P8     5W / 170W |   1326MiB / 12288MiB |      5%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
torch.manual_seed(24)

<torch._C.Generator at 0x13502fc0970>

### Loading GPT2-Medium Model from 🤗 Model Hub 

In [3]:

#indonesian-nlp/gpt2-medium-indonesian
model = GPT2LMHeadModel.from_pretrained('indonesian-nlp/gpt2-medium-indonesian').cuda()
tokenizer = GPT2Tokenizer.from_pretrained('indonesian-nlp/gpt2-medium-indonesian', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')

model.resize_token_embeddings(len(tokenizer))


Embedding(50259, 1024)

In [4]:
captions = pd.read_csv('list_caption.csv')['processed']

In [6]:
max_length = max([len(tokenizer.encode(caption, max_length=512)) for caption in captions])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
class CaptionDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [8]:
dataset = CaptionDataset(captions, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])


In [9]:
import gc
gc.collect()

4

In [10]:
torch.cuda.empty_cache()

In [11]:
training_args = TrainingArguments(output_dir='./results', num_train_epochs=2, logging_steps=100, save_steps=10000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none', fp16=True)


In [12]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

Using amp half precision backend
***** Running training *****
  Num examples = 10428
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 20856


Step,Training Loss
100,0.3739
200,0.4219
300,0.4464
400,0.4355
500,0.3867
600,0.397
700,0.4826
800,0.4526
900,0.4098
1000,0.4395


Saving model checkpoint to ./results\checkpoint-10000
Configuration saved in ./results\checkpoint-10000\config.json
Model weights saved in ./results\checkpoint-10000\pytorch_model.bin
Saving model checkpoint to ./results\checkpoint-20000
Configuration saved in ./results\checkpoint-20000\config.json
Model weights saved in ./results\checkpoint-20000\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=20856, training_loss=0.35374151851008756, metrics={'train_runtime': 6335.7361, 'train_samples_per_second': 3.292, 'train_steps_per_second': 3.292, 'total_flos': 1.936898168507597e+16, 'train_loss': 0.35374151851008756, 'epoch': 2.0})

In [13]:
#save model
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')

Configuration saved in ./results\config.json
Model weights saved in ./results\pytorch_model.bin
tokenizer config file saved in ./results\tokenizer_config.json
Special tokens file saved in ./results\special_tokens_map.json
added tokens file saved in ./results\added_tokens.json


('./results\\tokenizer_config.json',
 './results\\special_tokens_map.json',
 './results\\vocab.json',
 './results\\merges.txt',
 './results\\added_tokens.json')

### OA Line Caption Generator

In [131]:
generated = tokenizer("<|startoftext|> Staff of the Month - Selamat kepada nama-nama berikut", return_tensors="pt").input_ids.cuda()

In [136]:
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=512, top_p=1.5, temperature=0.7,num_return_sequences=10 )


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [137]:
for i, sample_output in enumerate(sample_outputs):
    print("%d."%(i+1),tokenizer.decode(sample_output, skip_special_tokens=True))

1.  Staff of the Month - Selamat kepada nama-nama berikut yang berhasil mendapatkan predikat staff of the month:
1. Muhammad Nur Haritsah (PSI ‘16)
2. Annisa Zulfa Zahara (PSI ‘16)
3. Syafira Indah Puspita (PSI ‘16)
4. Salsabila Rizki Amelia (PSI ‘16)
5. Shavira Dewi Puspita (PSI ‘16)
6. Resha Ramadhona (PSI ‘16)
7. Elysia Hutami (PSI ‘16)
8. Fanny Octaviani (PSI ‘16)
9. Nabila Putri Salsabila (PSI ‘16)
Selamat kepada para pemenang! 
Semoga dapat menginspirasi teman-teman lainnya!

2.  Staff of the Month - Selamat kepada nama-nama berikut yang berhasil menjadi Staff of the Month:
1. Nadia Salsabila (TI '15)
2. M. Faza A (TI '16)
3. Muhammad Izza M (TI '16)
4. Muhammad Bagas S (TI '16)
5. Muhammad Rizki H (TI '16)
6. Muhammad Iqbal A (TI '16)
Semoga dapat menginspirasi seluruh warga DT tidak hanya warga DT itu sendiri namun juga warga DT yang ada di luar sana.

3.  Staff of the Month - Selamat kepada nama-nama berikut yang berhasil kami pilih menjadi Staff of the Month:
1. Nadia Fawwaz 