[![Open in Layer](https://development.layer.co/assets/badge.svg)](https://development.layer.co/layer/derrick-bert) [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/layerai/examples/blob/main/text-classification/text-classification-fine-tuning-hf.ipynb) [![Layer Examples Github](https://badgen.net/badge/icon/github?icon=github&label)](https://github.com/layerai/examples/tree/main/text-classification)

# Fine-tuning a Hugging Face BERT model for text classification
In this notebook we fine-tune a [BERT model](https://huggingface.co/bert-base-uncased) for text classification on the IMDB movie-review dataset.

In [None]:
# Transformers installation
! pip install transformers datasets -qqq
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

In [None]:
!pip install layer-sdk --upgrade -qqq

In [2]:
import layer
from layer.decorators import model,pip_requirements,fabric
# Log in to the Layer development instance and select the "derrick-bert"
# project; the models defined below are presumably registered under it.
layer.login("https://development.layer.co")
layer.init("derrick-bert")

In [35]:
@pip_requirements(packages=["transformers","sentencepiece"])
@fabric("f-medium")
@model(name="bert-tokenizer")
def download_tokenizer():
    """Load the pretrained ``bert-base-uncased`` tokenizer and return it.

    The ``@model`` decorator names the result "bert-tokenizer"; a later cell
    retrieves it with ``layer.get_model('bert-tokenizer')``.

    Returns:
        The ``BertTokenizer`` instance for the "bert-base-uncased" checkpoint.
    """
    # Imported inside the function body, as in the other Layer-decorated
    # functions of this notebook.
    from transformers import BertTokenizer

    return BertTokenizer.from_pretrained("bert-base-uncased")

In [36]:
# Call the decorated function directly from this notebook.
# To run the project on Layer Infra instead, use the commented line below:
# layer.run([download_tokenizer])
download_tokenizer()

In [41]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the BERT tokenizer.

    The "bert-base-uncased" tokenizer is loaded on every call. Inputs are
    padded with ``padding="max_length"`` and truncated.

    Args:
        examples: Mapping with a ``"text"`` entry holding the text to tokenize.

    Returns:
        The tokenizer output for ``examples["text"]``.
    """
    from transformers import BertTokenizer

    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    texts = examples["text"]
    return bert_tokenizer(texts, padding="max_length", truncation=True)

In [42]:
# NOTE(review): train() also imports `datasets` and `tensorflow`, which are not
# listed in pip_requirements — confirm the Layer fabric image provides them.
@pip_requirements(packages=["transformers","sentencepiece"])
@fabric("f-medium")
@model("bert")
def train():
    """Fine-tune ``bert-base-uncased`` for 2-class classification on IMDB.

    Loads the "imdb" dataset, tokenizes it with ``tokenize_function``, and
    takes 1,000 shuffled examples (seed 42) from each of the "train" and
    "test" splits. A ``TFAutoModelForSequenceClassification`` with two labels
    is then trained for 3 epochs (Adam, learning rate 5e-5, batch size 8).

    Returns:
        The fitted Keras sequence-classification model.
    """
    import tensorflow as tf
    from transformers import TFAutoModelForSequenceClassification
    from transformers import DefaultDataCollator
    from datasets import load_dataset

    dataset = load_dataset("imdb")
    # Tokenization is delegated to tokenize_function, which loads its own
    # tokenizer; no tokenizer needs to be created here.
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Small fixed-seed subsets keep the run short and repeatable.
    small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
    small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

    data_collator = DefaultDataCollator(return_tensors="tf")

    def _as_tf_dataset(split, shuffle):
        # Both splits share every setting except whether batches are shuffled.
        return split.to_tf_dataset(
            columns=["attention_mask", "input_ids", "token_type_ids"],
            label_cols=["labels"],
            shuffle=shuffle,
            collate_fn=data_collator,
            batch_size=8,
        )

    tf_train_dataset = _as_tf_dataset(small_train_dataset, shuffle=True)
    tf_validation_dataset = _as_tf_dataset(small_eval_dataset, shuffle=False)

    # Named `classifier` rather than `model` so the local does not shadow the
    # `model` decorator imported from layer.decorators.
    classifier = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2
    )
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        # from_logits=True: the loss is computed on the model's raw logits.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
    )
    classifier.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)
    return classifier

In [43]:
# Call the decorated training function directly from this notebook.
# To run it through Layer instead, use the commented line below:
# layer.run([train])
train()

In [44]:
# Fetch the model stored under the name 'bert' in Layer and take its trained
# object (presumably the Keras model returned by train() — confirm).
bert = layer.get_model('bert').get_train()
# Bare expression: shows the object as the cell output.
bert

In [45]:
# Fetch the tokenizer stored under the name 'bert-tokenizer' in Layer.
tokenizer = layer.get_model('bert-tokenizer').get_train()
input_sequence = "I really loved that movie, the script was on point"
# Encode the review text into a TensorFlow tensor of token ids for the classifier
input_ids = tokenizer.encode(input_sequence, return_tensors='tf')
# Run the model on the encoded text and keep the raw class logits
output = bert(input_ids)
logits = output.logits

In [46]:
# Bare expression: shows the raw logits as the cell output.
logits

In [47]:
# Prediction pattern from the Hugging Face model docs:
# https://huggingface.co/docs/transformers/main/en/model_doc/bert
import tensorflow as tf
# Index of the highest-scoring class for the first (only) input sequence
predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
# NOTE(review): train() passes no id2label mapping, so this presumably yields
# a generic name such as LABEL_0 / LABEL_1 — confirm.
bert.config.id2label[predicted_class_id]