Try to load from: IndoNLG_finals_mBart_model_v2_checkpoint_105_640000.pt

In [1]:
import os, sys
sys.path.append('../')
import torch
from transformers import BartModel, BartForConditionalGeneration, GPT2LMHeadModel, MBartForConditionalGeneration, BartConfig

In [2]:
def prepare_input_for_generation(self, inputs, lang_token = '[indonesia]', decoder_lang_token = '[indonesia]', decoder_inputs=None, return_tensors='pt'):

    # Process encoder input
    if lang_token not in self.special_tokens_to_ids:
        raise ValueError(f"Unknown lang_token `{lang_token}`, lang_token must be either `[javanese]`, `[sundanese]`, or `[indonesian]`")  
    elif type(inputs) == list:
        if len(inputs) == 0 or type(inputs[0]) != str:
            raise ValueError(IndoNLGTokenizer.input_error_message)
    elif type(inputs) != str:
        raise ValueError(IndoNLGTokenizer.input_error_message)

    lang_id = self.special_tokens_to_ids[lang_token]
    input_batch = self(inputs, return_attention_mask=False)
    input_batch['input_ids'][0] = input_batch['input_ids'][0][1:-1]
    print(input_batch)
    if type(inputs) == str:
        input_batch['input_ids'] = [self.bos_token_id] + input_batch['input_ids'] + [self.eos_token_id, lang_id]
    else:
        input_batch['input_ids'] = list(map(
            lambda input_ids: [self.bos_token_id] + input_ids + [self.eos_token_id, lang_id], 
            input_batch['input_ids']))

    if decoder_inputs is None:
        # Return encoder input
        return self.pad(input_batch, return_tensors=return_tensors)
    else:
        # Process decoder input
        if decoder_lang_token not in self.special_tokens_to_ids:
            raise ValueError(f"Unknown decoder_lang_token `{decoder_lang_token}`, decoder_lang_token must be either `[javanese]`, `[sundanese]`, or `[indonesian]`")  
        elif type(decoder_inputs) == list:
            if len(decoder_inputs) == 0:
                raise ValueError(IndoNLGTokenizer.input_error_message)
            elif type(decoder_inputs[0]) != str:
                raise ValueError(IndoNLGTokenizer.input_error_message)
        elif type(decoder_inputs) != str:
            raise ValueError(IndoNLGTokenizer.input_error_message)

        decoder_lang_id = self.special_tokens_to_ids[decoder_lang_token]
        decoder_input_batch = self(decoder_inputs, return_attention_mask=False)
        decoder_input_batch['input_ids'][0] = decoder_input_batch['input_ids'][0][1:-1]

        if type(decoder_inputs) == str:
            decoder_input_batch['input_ids'] = [lang_id, self.bos_token_id] + decoder_input_batch['input_ids']  + [self.eos_token_id]
        else:
            decoder_input_batch['input_ids'] = list(map(lambda input_ids: [lang_id, self.bos_token_id] + input_ids + [self.eos_token_id], decoder_input_batch['input_ids']))

        # Padding
        input_batch = self.pad(input_batch, return_tensors=return_tensors)
        decoder_input_batch = self.pad(decoder_input_batch, return_tensors=return_tensors)

        # Store into a single dict
        input_batch['decoder_input_ids'] = decoder_input_batch['input_ids']
        input_batch['decoder_attention_mask'] = decoder_input_batch['attention_mask']

        return input_batch

In [46]:
model_checkpoint = '/home/samuel/indonlg/checkpoints/IndoNLG_finals_mBart_model_v2_checkpoint_105_640000.pt'
vocab_path = 'IndoNLG_finals_vocab_model_indo4b_plus_spm_bpe_9995_wolangid_bos_pad_eos_unk.model'

# source_lang = "id_ID"
# target_lang = "su_SU"

config = BartConfig.from_pretrained('facebook/bart-base')
config.vocab_size = 40004
model = MBartForConditionalGeneration(config=config)

bart = BartModel(config=config)
bart.load_state_dict(torch.load(model_checkpoint)['model'], strict=False)
# bart.shared.weight = bart.encoder.embed_tokens.weight
model.model = bart

bart_model = model

In [47]:
from tokenization_indonlg import IndoNLGTokenizer

tokenizer = IndoNLGTokenizer(vocab_file=vocab_path)

inputs = ['aku adalah <mask>']
bart_input = prepare_input_for_generation(tokenizer, inputs, decoder_inputs=['aku adalah <mask>'], return_tensors='pt')

# bart_out = bart_model(**bart_input)
# print(bart_input)
# tokenizer.decode(bart_out.logits.topk(1).indices[:,:,:].squeeze())

{'input_ids': [[498, 410, 1859, 1035, 39942, 39995]]}


In [51]:
bart_out = bart(**bart_input)
print(bart_input)
tokenizer.decode(bart_out.last_hidden_state.topk(1).indices[:,:,:].squeeze())

{'input_ids': tensor([[    0,   498,   410,  1859,  1035, 39942, 39995,     2, 40002]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'decoder_input_ids': tensor([[40002,     0,   498,   410,  1859,  1035, 39942, 39995,     2]]), 'decoder_attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


'para sudah saat? amp<0x4C> 19 paraentu'

In [49]:
for i in range(20):
    bart_out = bart_model(**bart_input)
    print(bart_out.logits.topk(1).indices, tokenizer.decode(bart_out.logits.topk(1).indices[:,:,:].squeeze()), bart_model.model.encoder.embed_tokens.weight.mean())

tensor([[[ 2270],
         [11944],
         [25506],
         [39969],
         [ 5130],
         [19261],
         [20773],
         [ 5130],
         [11944]]]) bermanirat mengasuh5 dénebur penandatanganan dénirat tensor(6.4212e-05, grad_fn=<MeanBackward0>)
tensor([[[30675],
         [ 7838],
         [16819],
         [12175],
         [ 2270],
         [ 8933],
         [20773],
         [ 6739],
         [30675]]]) urban pdf dulur ingatlah berman bebentukan penandatanganan riwayaturban tensor(6.4212e-05, grad_fn=<MeanBackward0>)
tensor([[[27608],
         [33025],
         [ 4403],
         [ 5130],
         [27970],
         [ 5130],
         [20773],
         [22296],
         [33273]]]) memanaskanotype yuk dén kolon dén penandatanganan iyo zhou tensor(6.4212e-05, grad_fn=<MeanBackward0>)
tensor([[[25486],
         [ 2270],
         [23155],
         [23155],
         [ 5130],
         [ 5682],
         [ 7088],
         [21319],
         [21255]]]) buton berman berair berair d

In [None]:
bart_out.logits

In [None]:
bart_input

In [None]:
tokenizer.decode(bart_input['input_ids'][0])

In [None]:
tokenizer.decode(bart_input['decoder_input_ids'][0])

In [None]:
# from src.indobenchmark import IndoNLGTokenizer

# tokenizer = IndoNLGTokenizer.from_pretrained('indobenchmark/indobart')
# bart_input = tokenizer.prepare_input_for_generation(['aku adalah <mask>'], model_type='indobart', return_tensors='pt')
# bart_input

In [None]:
bart_input = tokenizer.prepare_input_for_generation(['aku adalah <mask>'], model_type='indobart', return_tensors='pt')

In [None]:
bart_input = tokenizer.prepare_input_for_generation(['abdi teh ayeuna','abdi teh ayeuna'], lang_token='[indonesian]',
    decoder_inputs=['abdi teh ayeuna','abdi teh ayeuna'], decoder_lang_token='[indonesian]', model_type='indobart', return_tensors='pt')
bart_out = bart_model(**bart_input)
tokenizer.decode(bart_out.logits.topk(1).indices[0,:,:].squeeze()), tokenizer.decode(bart_out.logits.topk(1).indices[1,:,:].squeeze())

In [None]:
def count_param(module, trainable=False):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())

# Init Model

In [None]:
bart_model = BartForConditionalGeneration.from_pretrained('indobenchmark/indobart')
# gpt_model = GPT2LMHeadModel.from_pretrained('indobenchmark/indogpt')
tokenizer = IndoNLGTokenizer.from_pretrained('indobenchmark/indobart')

# Test GPT Model

In [None]:
gpt_input = tokenizer.prepare_input_for_generation('aku adalah anak', model_type='indogpt', return_tensors='pt')
gpt_out = gpt_model.generate(**gpt_input)
tokenizer.decode(gpt_out[0])

In [None]:
gpt_input = tokenizer.prepare_input_for_generation('aku suka sekali makan', model_type='indogpt', return_tensors='pt')
gpt_out = gpt_model.generate(**gpt_input)
tokenizer.decode(gpt_out[0])

In [None]:
gpt_input = tokenizer.prepare_input_for_generation('hai, bagaimana kabar', model_type='indogpt', return_tensors='pt')
gpt_out = gpt_model.generate(**gpt_input)
tokenizer.decode(gpt_out[0])

# Test BART Model

In [None]:
bart_input = tokenizer.prepare_input_for_generation(['aku adalah <mask>'], model_type='indobart', return_tensors='pt')
bart_out = bart_model(**bart_input)
tokenizer.decode(bart_out.logits.topk(1).indices[:,:,:].squeeze())

In [None]:
bart_input = tokenizer.prepare_input_for_generation(['abdi teh ayeuna','abdi teh ayeuna'], lang_token='[indonesian]',
    decoder_inputs=['abdi teh ayeuna','abdi teh ayeuna'], decoder_lang_token='[indonesian]', model_type='indobart', return_tensors='pt')
bart_out = bart_model(**bart_input)
tokenizer.decode(bart_out.logits.topk(1).indices[0,:,:].squeeze()), tokenizer.decode(bart_out.logits.topk(1).indices[1,:,:].squeeze())