In [1]:
# TODO: Add evaluation loss while training

In [2]:
# !pip install googletrans==4.0.0-rc1

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]='7'

In [4]:
import re
import json
from transformers import AutoTokenizer, AutoModelWithLMHead, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

path_to_recipe = "./dataset/recipes.json"
train_path = "./dataset/train.txt"
test_path = "./dataset/test.txt"
output_dir="./output/"
model="dbmdz/german-gpt2"

## Making Dataset

In [5]:
with open(path_to_recipe) as f:
    data = json.load(f)

def build_text_files(data_json, dest_path):
    f = open(dest_path, 'w')
    data = ''
    for texts in data_json:
        summary = str(texts['Instructions']).strip()
        summary = re.sub(r"\s", " ", summary)
        data += summary + "  "
    f.write(data)

train, test = train_test_split(data,test_size=0.15) 

build_text_files(train,train_path)
build_text_files(test,test_path)

print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

Train dataset length: 10361
Test dataset length: 1829


## Loading Dataset

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelWithLMHead.from_pretrained(model)

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



In [7]:
print(test_dataset[0])

tensor([ 7879,   272,   465,  7831,   328, 23706,   812, 17609,    16,   369,
         2896,    74,   793,  1616,    16,  1273,   590,   369,  4545,   292,
        25180,    16,   369, 47186, 16093,   292,   375, 39853,   365,  2518,
        12556,    18,   369,  8882,   268, 41866,   554, 25951, 16113,   714,
          729,   425,   913,   480,  1578,  8636,  1273,   590,   292,   309,
         1410,  6609,   364, 46418,    18,  3146, 28764,  2233,   280, 40369,
           16,   355, 26877,  5933,   292,   355, 14746, 14855,   292, 28764,
         2233,   280,  2209,  2254,  3000,  9358,  1762,  2249,    16,   472,
         2081,   364,    70,   533, 11691,    16,   904,   369,  3479,   292,
        38831,   567,  8061,   623,   292,   369, 31533,   440,   593,  6256,
           39,   382, 27130,  4238,  3971,  5752,    18,  8319,   917,    18,
         3854,  5840,   286, 14895, 46313,   309, 23682, 24889,   292,   369,
          346,   309, 21582, 24167,   262,  6417,  2123,  1990])

## Load Trainer

In [10]:
training_args = TrainingArguments(
    output_dir=output_dir, #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    eval_strategy = 'epoch',
    per_device_train_batch_size=2, # batch size for training
    per_device_eval_batch_size=2, # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved 
    warmup_steps=500, # number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


## Training

In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.9944,1.980978
2,1.6226,1.762404
3,1.3615,1.707143


TrainOutput(global_step=23556, training_loss=1.7873864500201564, metrics={'train_runtime': 1268.8719, 'train_samples_per_second': 37.129, 'train_steps_per_second': 18.565, 'total_flos': 3077497552896000.0, 'train_loss': 1.7873864500201564, 'epoch': 3.0})

In [13]:
model.save_pretrained('./german_recipe_gpt2/')
tokenizer.save_pretrained('./german_recipe_gpt2/')

('./german_recipe_gpt2/tokenizer_config.json',
 './german_recipe_gpt2/special_tokens_map.json',
 './german_recipe_gpt2/vocab.json',
 './german_recipe_gpt2/merges.txt',
 './german_recipe_gpt2/added_tokens.json',
 './german_recipe_gpt2/tokenizer.json')

## Evaluation

In [14]:
from transformers import pipeline

model = AutoModelWithLMHead.from_pretrained("./german_recipe_gpt2")
tokenizer = AutoTokenizer.from_pretrained("./german_recipe_gpt2")
tokenizer.pad_token_id = tokenizer.eos_token_id

chef = pipeline('text-generation', model=model, tokenizer=tokenizer, config={'max_length':800})

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [18]:
from googletrans import Translator
def check_output(input_text):
    # Instantiate translator
    translator= Translator()
    
    # Generate text with fine-tuned model
    text = chef(input_text)[0]['generated_text']
    
    # Translate input and generated text
    translate_in = translator.translate(input_text, src='de', dest='ko')
    translate_gen = translator.translate(text, src='de', dest='ko')
    
    # Print the result
    print(f"German input: {input_text}")
    print(f"Translated input: {translate_in.text}\n")
    print(f"German output: {text}")
    print(f"Translated output: {translate_gen.text}")

In [19]:
check_output("Die Nudeln Kochen, Fleisch anbraten")

German input: Die Nudeln Kochen, Fleisch anbraten
Translated input: 파스타를 요리하고 고기를 볶습니다

German output: Die Nudeln Kochen, Fleisch anbraten. Dann Kartoffeln und Tomaten und zum Schluss Sud über die Nudeln geben und weitere 1,5 – 2 Stunden leise simmern lassen. Zum Schluss Gemüse im Wok 10 Minuten mitbraten.Anrichten:Z
Translated output: 파스타를 요리하고 고기를 볶습니다.그런 다음 감자와 토마토를 넣고 마침내 파스타를 파스타 위에주고 1.5-2 시간 더 조용히 끓입니다.마지막으로, 냄비에 야채를 10 분 동안 볶습니다.


In [20]:
check_output("Zuerst Hähnchen")

German input: Zuerst Hähnchen
Translated input: 첫 번째 치킨

German output: Zuerst Hähnchenkeulen mit einem Kartoffelpüree mischen. Dazu gibt es jedes Mal etwas Salat und eine leckere Gemüsebrühe!  Hackfleisch mit der Zwiebel in Butter anbraten, Knoblauch zufügen und bei geringer Wärmezufuhr kräftig
Translated output: 먼저 감자 퓨레와 닭 다리를 섞습니다.매번 샐러드와 맛있는 야채 국물이 있습니다!다진 고기를 버터에 양파로 튀기고 마늘을 첨가하고 저온 공급으로 활발하게 첨가하십시오.


In [21]:
check_output("Der beste Weg, um einen Schokoladenkuchen zuzubereiten, ist")

German input: Der beste Weg, um einen Schokoladenkuchen zuzubereiten, ist
Translated input: 초콜릿 케이크를 준비하는 가장 좋은 방법은입니다

German output: Der beste Weg, um einen Schokoladenkuchen zuzubereiten, ist durch den Kontakt der Teig mit dem heißen Wasser... wie bei einem Eiskranz. Ein paar Löffel von der Pflaumensauce mit einem Löffelchen in der Auflaufform verteilt
Translated output: 초콜릿 케이크를 준비하는 가장 좋은 방법은 아이스크림 화환처럼 뜨거운 물로 반죽에 접촉하는 것입니다.베이킹 접시에 숟가락을 곁들인 매실 소스 숟가락
