# Fine-tuning a Hugging Face model for causal language modeling
[![Open in Layer](https://development.layer.co/assets/badge.svg)](https://development.layer.co/layer/causal-language-modeling) [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/layerai/examples/blob/main/causal-language-modeling/causal-language-modeling.ipynb) [![Layer Examples Github](https://badgen.net/badge/icon/github?icon=github&label)](https://github.com/layerai/examples/tree/main/causal-language-modeling)

In this project, we fine-tune a Hugging Face GPT-2 model for text generation on the WikiText-2 dataset.

In [None]:
!pip install layer --upgrade -qqq

In [3]:
import layer
# NOTE(review): presumably authenticates this notebook session with Layer so the
# later layer.* calls are authorized — confirm against the Layer SDK docs.
layer.login()

In [4]:
# The project name matches the slug used in the badge URLs at the top of the notebook.
# NOTE(review): presumably creates or selects the Layer project of that name — confirm.
layer.init("causal-language-modeling")

In [5]:
from layer.decorators import model,pip_requirements,fabric

In [7]:
@pip_requirements(packages=["transformers", "sentencepiece"])
@fabric("f-medium")
@model(name="tokenizer")
def download_tokenizer():
    """Load the pretrained "gpt2" tokenizer and return it.

    The function is wrapped by ``@model(name="tokenizer")``; the inference
    cell at the end of the notebook fetches an object back through
    ``layer.get_model('tokenizer')`` and uses it as a tokenizer.

    Returns:
        The ``GPT2Tokenizer`` instance created by ``from_pretrained("gpt2")``.
    """
    # Imported inside the function, like every other decorated function here.
    from transformers import GPT2Tokenizer as tokenizer_cls

    return tokenizer_cls.from_pretrained("gpt2")

In [8]:
# Invoke the decorated function directly from the notebook (train, by contrast,
# is submitted through layer.run further down).
download_tokenizer()

In [9]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the "gpt2" tokenizer.

    ``train`` passes this function to ``datasets.map(..., batched=True)``.
    A new ``GPT2Tokenizer`` is loaded on every call.

    Args:
        examples: Mapping with a ``"text"`` key holding the text to tokenize.

    Returns:
        Whatever the tokenizer call returns for ``examples["text"]``.
    """
    from transformers import GPT2Tokenizer as tokenizer_cls

    gpt2_tokenizer = tokenizer_cls.from_pretrained("gpt2")
    texts = examples["text"]
    return gpt2_tokenizer(texts)

In [10]:
def group_texts(examples, block_size=128):
    """Concatenate tokenized sequences and re-split them into fixed-size blocks.

    Every column of ``examples`` is flattened into one long list and then cut
    into consecutive chunks of ``block_size`` items. The trailing remainder
    that does not fill a whole block is dropped (padding could be used instead
    if the model supported it). A ``"labels"`` column is added as a copy of
    the chunked ``"input_ids"`` column.

    Args:
        examples: Mapping from column name to a list of sequences (one list
            per example). Must contain an ``"input_ids"`` column.
        block_size: Length of every output block. Defaults to 128; a caller
            may pass e.g. ``tokenizer.model_max_length`` instead.

    Returns:
        A dict with the same columns as ``examples`` plus ``"labels"``, where
        each column is a list of blocks of exactly ``block_size`` items.

    Raises:
        ValueError: If ``block_size`` is not positive.
        KeyError: If ``examples`` has no ``"input_ids"`` column.
    """
    # Local import keeps the function self-contained, matching the file's style.
    from itertools import chain

    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size!r}")

    # Flatten each column in linear time; sum(list_of_lists, []) re-copies the
    # accumulated list on every addition and is quadratic.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }

    # All columns are assumed to have the same flattened length, so the first
    # one is used to determine how many whole blocks fit.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size

    # Split every column into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels are the inputs themselves (outer list copied, blocks shared).
    result["labels"] = result["input_ids"].copy()
    return result

In [11]:
# "datasets" is imported inside train(), so it is declared here alongside the
# other runtime requirements.
@pip_requirements(packages=["transformers", "sentencepiece", "datasets"])
@fabric("f-gpu-small")
@model("gpt2-clm")
def train():
    """Fine-tune the pretrained "gpt2" checkpoint on WikiText-2 and return it.

    Steps:
      1. Load the ``wikitext`` / ``wikitext-2-raw-v1`` dataset.
      2. Tokenize it with ``tokenize_function`` and regroup the tokens into
         fixed-size blocks with ``group_texts`` (both defined in this file).
      3. Load the pretrained causal LM, compile it with ``AdamWeightDecay``
         and fit for 2 epochs on the train split, validating on the
         validation split.
      4. Evaluate on the validation split and print the perplexity
         (``exp`` of the evaluation loss).

    Returns:
        The fitted ``TFAutoModelForCausalLM`` model.
    """
    import math

    from datasets import load_dataset
    from transformers import (
        AdamWeightDecay,
        DefaultDataCollator,
        TFAutoModelForCausalLM,
    )

    model_checkpoint = "gpt2"
    learning_rate = 2e-5
    weight_decay = 0.01
    batch_size = 16
    feature_columns = ["attention_mask", "input_ids", "labels"]

    # --- Data -------------------------------------------------------------
    raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=4,
        remove_columns=["text"],
    )
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=4,
    )

    # --- Model ------------------------------------------------------------
    # Start from the pretrained weights: building the model from the config
    # alone yields randomly initialised weights, which is training from
    # scratch rather than the fine-tuning this notebook describes.
    model = TFAutoModelForCausalLM.from_pretrained(model_checkpoint)

    optimizer = AdamWeightDecay(
        learning_rate=learning_rate,
        weight_decay_rate=weight_decay,
    )
    # NOTE(review): no loss is passed to compile(); presumably the model's
    # built-in language-modeling loss over the "labels" column is used —
    # confirm against the transformers documentation.
    model.compile(optimizer=optimizer)

    data_collator = DefaultDataCollator(return_tensors="tf")

    train_set = lm_datasets["train"].to_tf_dataset(
        columns=feature_columns,
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    validation_set = lm_datasets["validation"].to_tf_dataset(
        columns=feature_columns,
        shuffle=False,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    # --- Train and evaluate -----------------------------------------------
    model.fit(train_set, validation_data=validation_set, epochs=2)
    eval_loss = model.evaluate(validation_set)
    print(f"Perplexity: {math.exp(eval_loss):.2f}")
    return model

In [None]:
# Submit train through layer.run instead of calling it directly.
# NOTE(review): presumably this executes it on the "f-gpu-small" fabric
# requested by its decorator — confirm against the Layer SDK docs.
layer.run([train])

In [None]:
# Fetch the objects stored under the "gpt2-clm" and "tokenizer" model names.
gpt2 = layer.get_model("gpt2-clm").get_train()
tokenizer = layer.get_model("tokenizer").get_train()

# Encode the prompt that the generation is conditioned on.
input_sequence = "I love reading books"
input_ids = tokenizer.encode(input_sequence, return_tensors="tf")

# Generate a continuation and print it under a 100-character dashed rule.
output = gpt2.generate(input_ids)
separator = "-" * 100
print("Output:\n" + separator)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)