In [None]:
import os
# Expose only GPU index 7 to this process; set here, before the ML libraries
# are imported in the next cell.
os.environ.update(CUDA_VISIBLE_DEVICES="7")

In [None]:
import re
import json
from transformers import AutoTokenizer, AutoModelWithLMHead, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# --- Notebook configuration: input/output locations and base checkpoint ---
path_to_recipe = "./dataset/recipes.json"  # raw recipe records (JSON)
train_path = "./dataset/train.txt"  # plain-text training split written by build_text_files
test_path = "./dataset/test.txt"  # plain-text evaluation split written by build_text_files
output_dir = "./output/"  # handed to TrainingArguments as its output_dir
# Checkpoint name passed to from_pretrained(). NOTE: the name `model` is later
# rebound to the loaded model object in the "Loading Dataset" cell.
model = "dbmdz/german-gpt2"


<h3>Making Dataset</h3>

In [None]:
# Load the raw recipe records. The encoding is stated explicitly so that
# non-ASCII characters in the recipes are decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(path_to_recipe, encoding="utf-8") as f:
    data = json.load(f)

def build_text_files(data_json, dest_path):
    """Write the 'Instructions' text of every record into one plain-text file.

    For each record the 'Instructions' value is converted to ``str`` and
    stripped, every whitespace character (newlines, tabs, ...) is replaced by
    a space, and the result is written followed by two spaces as a separator.

    Args:
        data_json: iterable of mappings that each contain an 'Instructions' key.
        dest_path: path of the text file to create (overwritten if it exists).

    Raises:
        KeyError: if a record has no 'Instructions' key.
        OSError: if ``dest_path`` cannot be opened for writing.
    """
    pieces = []
    for texts in data_json:
        summary = str(texts['Instructions']).strip()
        summary = re.sub(r"\s", " ", summary)
        pieces.append(summary + "  ")

    # The context manager guarantees the file is flushed and closed before
    # anything else reads it; the explicit encoding keeps non-ASCII text
    # independent of the platform's default encoding.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write("".join(pieces))

# Hold out 15% of the recipes for evaluation. A fixed random_state makes the
# split (and therefore the generated train/test text files) reproducible
# across kernel restarts.
train, test = train_test_split(data, test_size=0.15, random_state=42)

# Materialise each split as a plain-text file.
build_text_files(train, train_path)
build_text_files(test, test_path)

print("Train dataset length: " + str(len(train)))
print("Test dataset length: " + str(len(test)))


<h3>Loading Dataset</h3>

In [None]:
# `model` arrives here as the checkpoint name (a str) from the config cell and
# is rebound to the loaded network below. Keep the name under its own variable
# so that re-running this cell does not hand a model object to
# from_pretrained().
if isinstance(model, str):
    model_name = model

tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): AutoModelWithLMHead appears to be deprecated in newer
# transformers releases - confirm against the installed version.
model = AutoModelWithLMHead.from_pretrained(model_name)

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/eval datasets and the collator used by the Trainer.

    Args:
        train_path: path of the plain-text training file.
        test_path: path of the plain-text evaluation file.
        tokenizer: tokenizer handed to both ``TextDataset`` instances and to
            the data collator.
        block_size: ``block_size`` passed to ``TextDataset`` for both splits
            (defaults to 128, the value previously hard-coded).

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple, where the
        collator is a ``DataCollatorForLanguageModeling`` created with
        ``mlm=False``.
    """
    def _text_dataset(file_path):
        # Both splits are built with identical settings; only the file differs.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
        )

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

# Build both datasets and the data collator from the text files on disk.
train_dataset, test_dataset, data_collator = load_dataset(
    train_path=train_path,
    test_path=test_path,
    tokenizer=tokenizer,
)

In [None]:
# Sanity check: look at the first item of the evaluation dataset.
first_test_example = test_dataset[0]
print(first_test_example)

In [None]:


training_args = TrainingArguments(
    # Where results go
    output_dir=output_dir,
    overwrite_output_dir=True,  # replace whatever is already in output_dir
    # Training length and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Learning-rate warmup
    warmup_steps=500,
    # Checkpointing / evaluation cadence, in update steps
    save_steps=800,
    # NOTE(review): eval_steps presumably only has an effect when a step-based
    # evaluation strategy is enabled - confirm for the installed transformers
    # version.
    eval_steps=400,
    prediction_loss_only=True,
)


# Wire the model, the hyper-parameters and both datasets into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

<h3>Training</h3>

In [None]:
# Start training with the Trainer built in the previous cell; the call's
# return value is displayed as this cell's output.
trainer.train()

In [None]:
# Persist the model and its tokenizer side by side in one directory.
save_dir = './german_recipe_gpt2/'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

<h3>Evaluation</h3>

In [None]:
from transformers import pipeline

# Reload the saved model and tokenizer from disk.
finetuned_dir = "./german_recipe_gpt2"
model = AutoModelWithLMHead.from_pretrained(finetuned_dir)
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)
# Use the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

# NOTE(review): `config` is given a plain dict here; it is unclear whether the
# pipeline applies max_length from it when `model` is already an instantiated
# object - confirm that generations can actually reach 800 tokens.
chef = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    config={'max_length':800},
)

In [None]:
# Generate a continuation for a pasta-and-meat prompt and show its text.
prompt = 'Die Nudeln Kochen, Fleisch anbraten'
chef(prompt)[0]['generated_text']

In [None]:
# Generate a continuation for a chicken prompt and show its text.
prompt = 'Zuerst Hähnchen'
chef(prompt)[0]['generated_text']

In [None]:
# Generate a continuation for a chocolate-cake prompt and show its text.
prompt = 'Der beste Weg, um einen Schokoladenkuchen zuzubereiten, ist'
chef(prompt)[0]['generated_text']

In [None]:
# Same 'Zuerst Hähnchen' prompt as an earlier cell, run once more.
prompt = 'Zuerst Hähnchen'
chef(prompt)[0]['generated_text']