Try to load from: IndoNLG_finals_mBart_model_v2_checkpoint_105_640000.pt

In [1]:
import os, sys
sys.path.append('../')
import torch
from transformers import GPT2LMHeadModel, MBartForConditionalGeneration
from src.indobenchmark import IndoNLGTokenizer
from torch.utils.data import DataLoader

In [2]:
def count_param(module, trainable=False):
    """Count the scalar elements held by a module's parameters.

    Args:
        module: Object exposing ``parameters()`` whose items provide
            ``numel()`` and ``requires_grad``.
        trainable: When truthy, only parameters with ``requires_grad`` set
            contribute to the count; otherwise every parameter is counted.

    Returns:
        The summed ``numel()`` of the selected parameters (0 if there are none).
    """
    total = 0
    for param in module.parameters():
        # Frozen parameters are skipped only when the caller asked for trainable ones.
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total

# Initialize Models and Tokenizer

In [3]:
%%time
bart_model = MBartForConditionalGeneration.from_pretrained('indobenchmark/indobart')
gpt_model = GPT2LMHeadModel.from_pretrained('indobenchmark/indogpt')
tokenizer = IndoNLGTokenizer.from_pretrained('indobenchmark/indobart')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


CPU times: user 7.44 s, sys: 3.79 s, total: 11.2 s
Wall time: 11.7 s


# Test GPT Model

In [4]:
# Continue a short prompt with the GPT model using generate()'s default settings.
# NOTE(review): the recorded run warns that pad_token_id falls back to eos_token_id 50256,
# and the decoded text keeps going after `</s>` - confirm whether explicit pad/eos ids
# should be passed to generate().
prompt = 'aku adalah anak'
gpt_input = tokenizer.prepare_input_for_generation(
    prompt, model_type='indogpt', return_tensors='pt'
)
gpt_out = gpt_model.generate(**gpt_input)
# Decode the first (only) generated sequence; the bare expression is the cell output.
tokenizer.decode(gpt_out[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'<s> aku adalah anak pertama dari tiga bersaudara. </s> aku lahir di kota kecil yang sama dengan ayahku.'

In [5]:
# Second prompt-continuation check with the GPT model, again with default generate() settings.
# NOTE(review): the recorded run warns that pad_token_id falls back to eos_token_id 50256,
# and the decoded text keeps going after `</s>` - confirm whether explicit pad/eos ids
# should be passed to generate().
prompt = 'aku suka sekali makan'
gpt_input = tokenizer.prepare_input_for_generation(
    prompt, model_type='indogpt', return_tensors='pt'
)
gpt_out = gpt_model.generate(**gpt_input)
# Decode the first (only) generated sequence; the bare expression is the cell output.
tokenizer.decode(gpt_out[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'<s> aku suka sekali makan di sini. </s> aku suka sekali dengan menu yang ada di sini. aku'

In [6]:
# Third prompt-continuation check with the GPT model, again with default generate() settings.
# NOTE(review): the recorded run warns that pad_token_id falls back to eos_token_id 50256 -
# confirm whether an explicit pad_token_id should be passed to generate().
prompt = 'hai, bagaimana kabar'
gpt_input = tokenizer.prepare_input_for_generation(
    prompt, model_type='indogpt', return_tensors='pt'
)
gpt_out = gpt_model.generate(**gpt_input)
# Decode the first (only) generated sequence; the bare expression is the cell output.
tokenizer.decode(gpt_out[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'<s> hai, bagaimana kabar kalian? semoga sehat selalu ya. kali ini saya akan membahas tentang cara membuat'

# Test BART Model

In [7]:
# Mask-filling sanity check for the BART model with '[indonesian]' language tokens on
# both the encoder and decoder side.
inputs = ['aku pergi ke toko obat membeli <mask>']
bart_input = tokenizer.prepare_input_for_generation(inputs, return_tensors='pt',
                                         lang_token = '[indonesian]', decoder_lang_token='[indonesian]')

# Inference only: no_grad avoids building an autograd graph for this forward pass.
with torch.no_grad():
    bart_out = bart_model(**bart_input)

# Highest-scoring token id at each position of the first sequence. Indexing the batch
# explicitly (instead of squeeze()) keeps a 1-D result even if another dimension has size 1.
# Assumes logits are laid out as (batch, sequence, vocab) - TODO confirm for this model class.
predicted_ids = bart_out.logits.argmax(dim=-1)[0]

print(tokenizer.decode(bart_input['input_ids'][0]))
print(tokenizer.decode(predicted_ids))

<s> aku pergi ke toko obat membeli <mask> </s> [indonesian]
<s> aku pergi ke toko obat membeli obat jeung [indonesian]


In [8]:
# Mask-filling sanity check for the BART model with '[javanese]' language tokens on
# both the encoder and decoder side.
inputs = ['aku menyang pasar karo <mask>']
bart_input = tokenizer.prepare_input_for_generation(inputs, return_tensors='pt',
                                         lang_token = '[javanese]', decoder_lang_token='[javanese]')

# Inference only: no_grad avoids building an autograd graph for this forward pass.
with torch.no_grad():
    bart_out = bart_model(**bart_input)

# Highest-scoring token id at each position of the first sequence. Indexing the batch
# explicitly (instead of squeeze()) keeps a 1-D result even if another dimension has size 1.
# Assumes logits are laid out as (batch, sequence, vocab) - TODO confirm for this model class.
predicted_ids = bart_out.logits.argmax(dim=-1)[0]

print(tokenizer.decode(bart_input['input_ids'][0]))
print(tokenizer.decode(predicted_ids))

<s> aku menyang pasar karo <mask> </s> [javanese]
<s> aku menyang pasar karo tuku, [javanese]


In [9]:
# Mask-filling sanity check for the BART model with '[sundanese]' language tokens on
# both the encoder and decoder side.
inputs = ['kuring ka pasar senen meuli daging <mask>']
bart_input = tokenizer.prepare_input_for_generation(inputs, return_tensors='pt',
                                         lang_token = '[sundanese]', decoder_lang_token='[sundanese]')

# Inference only: no_grad avoids building an autograd graph for this forward pass.
with torch.no_grad():
    bart_out = bart_model(**bart_input)

# Highest-scoring token id at each position of the first sequence. Indexing the batch
# explicitly (instead of squeeze()) keeps a 1-D result even if another dimension has size 1.
# Assumes logits are laid out as (batch, sequence, vocab) - TODO confirm for this model class.
predicted_ids = bart_out.logits.argmax(dim=-1)[0]

print(tokenizer.decode(bart_input['input_ids'][0]))
print(tokenizer.decode(predicted_ids))

<s> kuring ka pasar senen meuli daging <mask> </s> [sundanese]
<s> kuring ka pasar senen meuli daging sapi, kuring


# Batch Loading

In [10]:
# Encoder-side and decoder-side texts, paired by position.
encoder_texts = ['aku adalah anak gembala', 'balonku ada lima', 'so I say']
decoder_texts = ['selalu riang serta gembira', 'see you once again my love', 'pokemon master']

# One encoding per text pair for the 'indobart' model type. padding=False is passed here;
# the DataLoader cell that follows pads each batch through its collate function.
data = [
    tokenizer.prepare_input_for_generation(
        src_text, decoder_inputs=tgt_text, model_type='indobart',
        lang_token='[sundanese]', decoder_lang_token='[javanese]', padding=False
    )
    for src_text, tgt_text in zip(encoder_texts, decoder_texts)
]

In [11]:
# Build a loader whose collate function pads every field to the longest item in the batch,
# then fetch and print only the first batch.
loader = DataLoader(
    data,
    batch_size=3,
    collate_fn=lambda features: tokenizer.pad(features, padding='longest'),
)
batch = next(iter(loader))
print(batch)

{'input_ids': [[0, 498, 410, 602, 21682, 2, 40001], [0, 12811, 726, 434, 2481, 2, 40001], [0, 693, 344, 2922, 2, 40001, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0]], 'decoder_input_ids': [[40000, 0, 1049, 25478, 768, 8734, 2, 1, 1], [40000, 0, 12599, 4692, 33974, 19665, 4463, 7176, 2], [40000, 0, 15698, 5897, 2, 1, 1, 1, 1]], 'decoder_attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 0, 0, 0]]}


In [12]:
# Encoder-side and decoder-side texts, paired by position.
encoder_texts = ['aku adalah anak gembala', 'balonku ada lima', 'so I say']
decoder_texts = ['selalu riang serta gembira', 'see you once again my love', 'pokemon master']

# Same pairs as the 'indobart' cell, now encoded with model_type='indogpt'.
# NOTE(review): the batch printed by the next cell contains only input_ids and
# attention_mask, so the decoder text and language tokens do not appear there - confirm
# whether they are meant to be ignored for this model type.
data = [
    tokenizer.prepare_input_for_generation(
        src_text, decoder_inputs=tgt_text, model_type='indogpt',
        lang_token='[sundanese]', decoder_lang_token='[javanese]', padding=False
    )
    for src_text, tgt_text in zip(encoder_texts, decoder_texts)
]

In [13]:
# Build a loader whose collate function pads every field to the longest item in the batch,
# then fetch and print only the first batch.
loader = DataLoader(
    data,
    batch_size=3,
    collate_fn=lambda features: tokenizer.pad(features, padding='longest'),
)
batch = next(iter(loader))
print(batch)

{'input_ids': [[0, 498, 410, 602, 21682], [0, 12811, 726, 434, 2481], [0, 693, 344, 2922, 1]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]}
