# Fine-tune a non-English GPT-2 Model with Huggingface

https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb#scrollTo=bH1W6YrdjNSp

# Download data

https://www.kaggle.com/datasets/sterby/german-recipes-dataset?resource=download

# Load json data
First, we split the recipes in `recipes.json` into a train and a test set. We then extract the `Instructions` field of every recipe and write the texts to `train_dataset.txt` and `test_dataset.txt`.

In [1]:
# Print the version of the Python interpreter this notebook is executed with.
import sys
print(sys.version)

3.11.5 (main, Sep 11 2023, 08:31:25) [Clang 14.0.6 ]


In [2]:
import re
import json
from sklearn.model_selection import train_test_split

# Load the recipe collection. The file is decoded as UTF-8 explicitly instead
# of relying on the platform's default encoding, so non-ASCII characters
# (e.g. German umlauts) are read the same way on every system.
with open('recipes.json', encoding='utf-8') as f:
    data = json.load(f)

def build_text_files(data_json, dest_path):
    """Write the instructions of all recipes into one plain-text file.

    Each entry's ``'Instructions'`` value is converted to ``str`` and stripped
    of leading/trailing whitespace; every remaining whitespace character
    (newline, tab, ...) is replaced by a space, so the output contains no line
    breaks. Each recipe is followed by two spaces. The result is written to
    ``dest_path``, replacing any existing file.

    Args:
        data_json: Iterable of mappings that each provide an
            ``'Instructions'`` key.
        dest_path: Path of the text file to create.

    Raises:
        KeyError: If an entry has no ``'Instructions'`` key.
        OSError: If ``dest_path`` cannot be opened for writing.
    """
    # Collect the pieces and join once instead of growing a string with `+=`.
    chunks = [
        re.sub(r"\s", " ", str(entry['Instructions']).strip()) + "  "
        for entry in data_json
    ]
    # The context manager guarantees the file is flushed and closed. UTF-8 is
    # requested explicitly so non-ASCII text is written identically on every
    # platform.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write("".join(chunks))

# Hold out 15% of the recipes as the test set. No random_state is passed to
# train_test_split here.
train, test = train_test_split(data, test_size=0.15)

# One text file per split; the tokenizer cell points at these two file names.
build_text_files(data_json=train, dest_path='train_dataset.txt')
build_text_files(data_json=test, dest_path='test_dataset.txt')

# Report how many recipes ended up in each split.
print(f"Train dataset length: {len(train)}")
print(f"Test dataset length: {len(test)}")

Train dataset length: 10361
Test dataset length: 1829


# Tokenizer
Next, we download the tokenizer. We use the tokenizer of the `german-gpt2` model from the Hugging Face Hub.

In [1]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the German GPT-2 checkpoint; the same
# checkpoint name is used again when the model is loaded.
tokenizer = AutoTokenizer.from_pretrained("anonymous-german-nlp/german-gpt2")

# Text files written by build_text_files in the data-preparation cell.
train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer):
    """Create the training set, the test set and the batch collator.

    Both text files are wrapped in a ``TextDataset`` with a block size of 128,
    and a ``DataCollatorForLanguageModeling`` is created with ``mlm=False``.

    Args:
        train_path: Path of the training text file.
        test_path: Path of the test text file.
        tokenizer: Tokenizer handed to both datasets and to the collator.

    Returns:
        Tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    block_size = 128
    # The training file is processed first, then the test file.
    train_dataset, test_dataset = (
        TextDataset(tokenizer=tokenizer, file_path=path, block_size=block_size)
        for path in (train_path, test_path)
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return train_dataset, test_dataset, data_collator

# Build both datasets and the collator from the paths and tokenizer defined above.
train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



# Initialize Trainer with TrainingArguments and GPT-2 model

In [3]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

import inspect

# Start from the pre-trained German GPT-2 checkpoint (same name as the tokenizer).
# NOTE(review): recent transformers releases report AutoModelWithLMHead as
# deprecated in favour of AutoModelForCausalLM — confirm before upgrading.
model = AutoModelWithLMHead.from_pretrained("anonymous-german-nlp/german-gpt2")

# `eval_steps` only takes effect when the evaluation strategy is "steps"; the
# default strategy is "no", so without it the eval_dataset passed to the
# Trainer would never be evaluated during training. The option is named
# `eval_strategy` in newer transformers releases and `evaluation_strategy` in
# older ones, so use whichever this installation accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./gpt2-gerchef",  # the output directory
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=3,  # number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps=400,  # number of update steps between two evaluations
    save_steps=800,  # a checkpoint is saved every this many steps
    warmup_steps=500,  # number of warmup steps for the learning rate scheduler
    prediction_loss_only=True,  # evaluation/prediction returns only the loss
    **{_eval_strategy_key: "steps"},  # actually run evaluation every eval_steps
)

# Wire model, arguments, collator and both datasets into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



# Train and save the model

To train the model, we simply call `trainer.train()`.

In [None]:
# Start fine-tuning. In the recorded run the kernel died during this call
# (see the output below).
# NOTE(review): possibly memory exhaustion at batch size 32 — TODO confirm.
trainer.train()

Step,Training Loss


Kernel Restarting
The kernel for appears to have died. It will restart automatically.

In [None]:
# Persist the fine-tuned model.
# NOTE(review): presumably written to the Trainer's output_dir
# ("./gpt2-gerchef") since no path is passed — confirm.
trainer.save_model()