In [None]:
%cd ..

In [None]:
%load_ext autoreload
%autoreload 2

In [3]:
from idrecibrew2.data import Seq2SeqDataFactory, Seq2SeqDataFactoryArgs
from idrecibrew2.data.indonlg_tokenizer.tokenizer import IndoNLGTokenizer
from idrecibrew2.model import LitSeq2SeqTransformers, LitSeq2SeqTransformersArgs

In [4]:
import pandas as pd

In [5]:
from transformers import AutoTokenizer

In [6]:
from idrecibrew2.eval.training_eval import Seq2SeqTrainingEval

In [7]:
from tqdm import tqdm

# IndoBert

In [7]:
tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")

In [8]:
args_df = Seq2SeqDataFactoryArgs(tokenizer=tokenizer)

In [9]:
df = Seq2SeqDataFactory(args_df)

In [13]:
test_dl = df.produce_dataloader_from_csv("data/processed/test.csv", batch_size=64, n_workers=4, shuffle=False)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [11]:
device = "cuda:1"

In [12]:
lit_model = LitSeq2SeqTransformers.load_from_checkpoint("outputs/indobert-v2/model-epoch=24-val_loss=1.731-val_bleu=26.443.ckpt")

In [14]:
lit_model = lit_model.to(device)

In [15]:
outputs = []
for batch in tqdm(test_dl):
    argmax = lit_model.model.generate(
        input_ids=batch["input_ids"].to(device),
        max_length=300,
        num_return_sequences=1,
        num_beams=1,
        num_beam_groups=1,
    )
    outputs.append({"preds": argmax, "tgts": batch.labels})

100%|██████████| 20/20 [00:36<00:00,  1.83s/it]


In [16]:
evaluator = Seq2SeqTrainingEval(tokenizer)

In [17]:
bleu_test = evaluator.compute_eval(outputs)

In [18]:
bleu_test

27.03383393952166

## T5

In [8]:
tokenizer = AutoTokenizer.from_pretrained("Wikidepia/IndoT5-base")

In [9]:
args_df = Seq2SeqDataFactoryArgs(tokenizer=tokenizer)

In [10]:
df = Seq2SeqDataFactory(args_df)

In [11]:
import pandas as pd

In [12]:
pd.set_option('max_colwidth', 0)

In [13]:
df_data = pd.read_csv("data/processed/test_t5.csv")

In [14]:
df_data.sample(2)

Unnamed: 0,no,src,tgt,basic_ingredient
150,2139,ikan patin asam manis,"resep: 1 ekor ikan patin , potong dadu , buang tulangnya @@ 1 / 4 kg tepung terigu @@ garam @@ lada bubuk @@ bumbu saus : @@ 5 sachet saos cabai @@ 10 sachet saos tomat @@ 1 butir bawang bombai @@ 1 / 4 gelas air @@",ikan
973,3532,tim ikan blanak,resep: 5 ekor ikan blanak @@ jeruk nipis @@ 2 sihung bawang putih @@ 1 ruas jahe memarkan @@ 1 ruas kayu manis @@ 2 btg bawang prei @@ garam @@ 500 ml air @@,ikan


In [15]:
test_dl = df.produce_dataloader_from_csv("data/processed/test_t5.csv", batch_size=128, n_workers=4, shuffle=False)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [16]:
device = "cpu"

In [17]:
lit_model = LitSeq2SeqTransformers.load_from_checkpoint("outputs/indo-t5-2/model-epoch=19-val_loss=1.182.ckpt")

In [18]:
lit_model = lit_model.to(device)

In [19]:
lit_model = lit_model.eval()

In [20]:
import torch

In [21]:
with torch.no_grad():
    outputs = []
    for batch in tqdm(test_dl):
        argmax = lit_model.model.generate(
            input_ids=batch["input_ids"].to(device),
            max_length=500,
            num_return_sequences=1,
            num_beams=1,
            num_beam_groups=1,
        )
        outputs.append({"preds": argmax, "tgts": batch.labels})

100%|██████████| 10/10 [09:48<00:00, 58.90s/it]


In [30]:
sabana = tokenizer("tempe pedas teri", return_tensors="pt")['input_ids']

In [31]:
argmax = lit_model.model.generate(
            input_ids=sabana,
            max_length=300,
            num_return_sequences=1,
            num_beams=1,
            num_beam_groups=1,
        )

In [34]:
from typing import List

In [35]:
def decode_remove_resep(x):
    decoded = tokenizer.decode(x, skip_special_tokens=True)
    decoded = decoded.replace("resep: ", "")
    return decoded

In [36]:
evaluator = Seq2SeqTrainingEval(tokenizer)
bleu_test = evaluator.compute_eval(outputs, special_func=decode_remove_resep)

In [37]:
bleu_test

18.72685033429128

## GPT

Prepare data.
Append ">>>"

In [8]:
df_test = pd.read_csv("data/processed/test.csv")

In [9]:
df_test.src = df_test.src + " >>> "

In [10]:
df_test.to_csv("data/processed/test_gpt.csv", index=False)

Predict it!

GPT input: "Food title >>>"


In [11]:
tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indogpt")

In [12]:
args_df = Seq2SeqDataFactoryArgs(tokenizer=tokenizer)

In [13]:
df = Seq2SeqDataFactory(args_df)

In [14]:
test_dl = df.produce_dataloader_from_csv("data/processed/test_gpt.csv", batch_size=64, n_workers=4, shuffle=False)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [15]:
device = "cuda:1"

In [39]:
lit_model = LitSeq2SeqTransformers.load_from_checkpoint("outputs/indogpt/model-epoch=06-val_loss=1.861-val_bleu=0.000.ckpt")

In [40]:
lit_model = lit_model.to(device)

In [41]:
import re

In [43]:
outputs = []
for batch in tqdm(test_dl):
    argmax = lit_model.model.generate(
        input_ids=batch["input_ids"].to(device),
        max_length=300,
        num_return_sequences=1,
        num_beams=1,
        num_beam_groups=1,
    )
    outputs.append({"preds": argmax, "tgts": batch.labels})

  0%|          | 0/20 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  5%|▌         | 1/20 [00:03<01:13,  3.88s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 10%|█         | 2/20 [00:07<01:07,  3.74s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 15%|█▌        | 3/20 [00:11<01:02,  3.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 20%|██        | 4/20 [00:14<00:58,  3.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 5/20 [00:18<00:54,  3.64s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 30%|███       | 6/20 [00:22<00:50,  3.64s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 35%|███▌      | 7/20 [00:25<00:47,  3.64s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 40%|████      | 8/20 [00:29<00:43,  3.59s/it]Setting `pad_token

In [50]:
df_test

Unnamed: 0,no,src,tgt,basic_ingredient
0,12913,sambel tempe kering >>>,1 papan tempe yg ukuran 20 cm || 8 buah cabe ...,tempe
1,13472,oily oseng/tumis buncis tempe kecap (🇮🇩) >>>,"bahan : || 15 - 20 batang buncis , iris sero...",tempe
2,12167,asam manis tempe sosis telur puyuh >>>,1 papan tempe || sosis ( merk apa aja ) || 1...,tempe
3,13076,oreg tempe >>>,1 papan tempe || secukupnya buncis || 3 biji c...,tempe
4,3224,ikan bandeng presto salted egg (telur asin) >>>,2 buah bandeng presto || 3 butir telur asin ||...,ikan
...,...,...,...,...
1275,5827,lapis daging sapi >>>,750 daging sapi ( khas dalam ) || 3 lembar d...,sapi
1276,2448,pesmol ikan nila >>>,2 ekor ikan nila || 1 buah jeruk nipis || secu...,ikan
1277,11007,cloud tuna & egg #ketofriendly #ketopad_cp_sav...,"3 butir telur ayam , pisahkan putih dan kunin...",telur
1278,8064,pepes tahu >>>,3 biji tahu putih || 2 batang daun bawang + sl...,tahu


In [51]:
tokenizer.decode(outputs[0]['preds'][16], skip_special_tokens=True)

'lele taworcis(tahu,wortel,buncis) kuah segar >>> || 1 / 2 kg lele || 1 / 2 kg tahu || 1 / 2 kg kacang tanah || bumbu halus : || 5 siung bawang putih || 5 siung bawang merah || 1 / 2 sdt ketumbar || 1 / 2 sdt merica || 1 / 2 sdt garam || 1 / 2 sdt gula || 1 / 2 sdt penyedap rasa || 1 / 2 sdt kunyit bubuk || 1 / 2 sdt kunyit bubuk || 1 / 2 sdt jahe bubuk || 1 / 2 sdt kunyit bubuk || 1 / 2 sdt garam || 1 / 2 sdtutkan ||'

In [52]:
def decode_gpt(x):
    decoded = tokenizer.decode(x, skip_special_tokens=True)
    decoded_split = decoded.split(" >>> ")
    if len(decoded_split) == 1:
        return decoded
    else:  # for non-preds
        return ' '.join(decoded_split[1:])
    return decoded

In [53]:
decode_gpt(outputs[0]['preds'][16])

'|| 1 / 2 kg lele || 1 / 2 kg tahu || 1 / 2 kg kacang tanah || bumbu halus : || 5 siung bawang putih || 5 siung bawang merah || 1 / 2 sdt ketumbar || 1 / 2 sdt merica || 1 / 2 sdt garam || 1 / 2 sdt gula || 1 / 2 sdt penyedap rasa || 1 / 2 sdt kunyit bubuk || 1 / 2 sdt kunyit bubuk || 1 / 2 sdt jahe bubuk || 1 / 2 sdt kunyit bubuk || 1 / 2 sdt garam || 1 / 2 sdtutkan ||'

In [54]:
evaluator = Seq2SeqTrainingEval(tokenizer)
bleu_test = evaluator.compute_eval(outputs, special_func=decode_gpt)

In [55]:
bleu_test

9.981578042780926