<a href="https://colab.research.google.com/github/ilsilfverskiold/transformers-nlp-docs/blob/main/cook/fine-tune/fine_tune_encoder_classification_custom_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tune an Encoder Model (BERT, RoBERTa) for text classification with a custom dataset
This cookbook is primarily for text classification. Be aware that it will need to be adapted if you're training a BERT model for a different task (like question-answering).

Text classification typically requires 'text' and a 'label'. This script will perform multi-class classification, where the model learns to predict the category of each keyword. Here each keyword (i.e. 'text') has a corresponding label (i.e. 'category').

An encoder model has only an encoder part (e.g., BERT, RoBERTa). It is suitable for classification, entity recognition, etc. Only the input text is tokenized; the output is often a single label or a set of labels (not tokenized).

Remember that when working with classification you need an even distribution of examples across the different labels, or the model will favor the labels with more examples. The more niche your labels are, the more difficult it will be to train your model.

Make sure you set your runtime to a T4 GPU or better before running the script, and look out for overfitting.

In [None]:
# install dependencies
# The leading '!' runs each line as a shell command; -U upgrades the package
# to its latest available release
!pip install -U datasets
!pip install -U accelerate
!pip install -U transformers
!pip install -U huggingface_hub

In [None]:
# connect to drive
# Mounts Google Drive at /content/drive so the CSV file can be read from it
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# import file - make sure you set the correct path in your Google Drive (this would be the file I'm importing)
# NOTE: the rest of the notebook reads the 'text' and 'label' columns, so the CSV must contain both
file_path = '/content/drive/My Drive/keywords_categories_even_distribution.csv'
df = pd.read_csv(file_path)
# preview the first rows to check the file loaded as expected
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Remove rows with a missing 'text' or 'label' - both cause issues later:
# a missing label cannot be used as a training target, and a missing text is
# read by pandas as NaN (a float), which the tokenizer cannot process
df = df.dropna(subset=['text', 'label'])

# Split dataset into training (85%) and temp (15%) - temp is used for validation and testing
# random_state is fixed so the split is the same on every run
train_df, temp_df = train_test_split(df, test_size=0.15, random_state=42)

# Split temp into validation (70%) and testing (30%) - change if need be
val_df, test_df = train_test_split(temp_df, test_size=0.3, random_state=42)

In [None]:
from datasets import Dataset, DatasetDict

# Convert each pandas split into a Hugging Face Dataset.
# preserve_index=False keeps the leftover pandas index (non-contiguous after
# filtering and shuffling) from being stored as an extra
# '__index_level_0__' column in every split.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Bundle the train, validation and test sets into a single DatasetDict so
# they can be processed together in the following cells
dataset_dict = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)

# display the dict (shows each split)
dataset_dict

In [None]:
# (Optional) map out some examples from the dataset
def show_samples(dataset, num_samples=10, seed=42):
    """Print a random selection of examples from the 'train' split.

    Args:
        dataset: Mapping of split names to datasets; only ``dataset["train"]``
            is read. Each example must provide 'text' and 'label'.
        num_samples: Maximum number of examples to print. Capped at the size
            of the train split so small datasets do not raise an error.
        seed: Seed passed to ``shuffle`` so the same examples are shown on
            every run.
    """
    train_split = dataset["train"]
    # Never request more rows than exist: select() fails on out-of-range indices
    count = min(num_samples, len(train_split))
    sample = train_split.shuffle(seed=seed).select(range(count))
    for example in sample:
        print(f"\n'>> Text: {example['text']}'")
        print(f"'>> Label: {example['label']}'")


# Print examples from the train split using the default sample size and seed
show_samples(dataset_dict)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the category-name -> number mapping from every label in the DataFrame
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])


def encode_labels(example):
    """Return the numeric label of a single example as 'encoded_label'."""
    # transform() works on sequences, so wrap the one label and unwrap the result
    encoded = label_encoder.transform([example['label']])
    return {'encoded_label': encoded[0]}


# Add the 'encoded_label' column to each split, one example at a time
for split_name, split_data in dataset_dict.items():
    dataset_dict[split_name] = split_data.map(encode_labels)

# Number of distinct categories - passed to the model as num_labels
num_labels = len(label_encoder.classes_)
num_labels

In [None]:
import joblib

# Save the label encoder to a file
# It is loaded again at inference time to turn predicted label numbers back
# into the original category names
joblib.dump(label_encoder, 'label_encoder.joblib')

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = 'bert-base-uncased'  # or any other suitable encoder model
# Load the tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the checkpoint for sequence classification; num_labels is the number of
# categories found by the label encoder
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

In [None]:
def encode_data(dataset):
    """Tokenize a batch of examples and attach their numeric labels.

    Args:
        dataset: A batch of examples as passed by ``map(..., batched=True)``;
            it must provide the 'text' and 'encoded_label' columns.

    Returns:
        The tokenizer output with an added 'labels' entry.
    """
    # No padding here: DataCollatorWithPadding pads every training batch to the
    # length of its own longest example, so padding at this stage would only
    # add unnecessary pad tokens. Texts longer than 512 tokens are truncated.
    tokenized_inputs = tokenizer(dataset["text"], truncation=True, max_length=512)
    tokenized_inputs["labels"] = dataset["encoded_label"]
    return tokenized_inputs

# Apply this function to your dataset dictionary
dataset_encoded = dataset_dict.map(encode_data, batched=True)
dataset_encoded

In [None]:
# Return PyTorch tensors and expose only the listed columns when examples are read
dataset_encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
from transformers import DataCollatorWithPadding

# Collator that uses the tokenizer to pad the examples of a batch; it is handed
# to the Trainer in the training cell
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
from transformers import TrainingArguments, Trainer

# NOTE: the dependencies are installed with `pip install -U`, so this cell uses
# the current argument names: `eval_strategy` (formerly `evaluation_strategy`)
# and `processing_class` (formerly `tokenizer`).
training_args = TrainingArguments(
    output_dir='bert_classifier',
    num_train_epochs=4,  # As needed
    warmup_steps=100,  # As needed
    per_device_train_batch_size=8,  # Adjust if necessary
    per_device_eval_batch_size=16,  # Can be larger if no memory issues during eval
    weight_decay=0.01,  # Prevent overfitting
    logging_steps=10,
    eval_strategy='steps',  # If you want more frequent feedback
    eval_steps=100,  # Evaluate every 100 steps, adjust as needed
    learning_rate=3e-5,  # Standard for BERT
    save_steps=500,  # Adjust as preferred
    gradient_accumulation_steps=4,  # Increase if reducing batch size (effective batch size here: 8 * 4 = 32 per device)
)

# Passing the tokenizer as processing_class means it is saved together with
# the model, so the saved directory can be loaded directly by a pipeline.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_encoded['train'],
    eval_dataset=dataset_encoded['validation'],
)

trainer.train()

In [None]:
# save the model
# Written to the local 'bert_classifier_model' directory, which the inference
# cell loads the pipeline from
trainer.save_model('bert_classifier_model')

In [None]:
from transformers import pipeline
import joblib

# Load the label encoder
label_encoder = joblib.load('label_encoder.joblib')

# Load the pipeline for text classification with your model
pipe = pipeline('text-classification', model='bert_classifier_model')

# Loop through the first 50 examples of the test set (or all of them if the
# test set is smaller, so a small dataset does not raise an IndexError)
test_split = dataset_dict['test']
for i in range(min(50, len(test_split))):
    example = test_split[i]
    test_text = example['text']
    original_label = example['label']

    # Get the model's prediction
    predicted_output = pipe(test_text)

    # Extract the label number from the model's prediction
    # (the label string ends in '_<number>', e.g. 'LABEL_3' -> 3)
    predicted_label_num = int(predicted_output[0]['label'].split('_')[-1])

    # Use the LabelEncoder to get the original category name
    predicted_label_name = label_encoder.inverse_transform([predicted_label_num])[0]

    print(f"text: {test_text}")
    print(f"generated label: {predicted_label_name}")
    print(f"original label: {original_label}")
    print("-" * 50)

In [None]:
# If you're satisfied we can push it to Hugging Face
# You'll need a token from your Hugging Face account to log in
# (the leading '!' runs the Hugging Face CLI login as a shell command)
!huggingface-cli login

In [None]:
# You would replace your own username here
# You do not need to create a repository beforehand
repo_id = "huggingface_username/bert_classifier_model"

# Push the model and the tokenizer straight to the named repository.
# Trainer.push_to_hub() is not used here because its first argument is the
# commit message, not the repository name - it would upload to a repository
# named after output_dir instead.
trainer.model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

In [None]:
from google.colab import files

# Download the encoder too - push it or upload it manually to your model in the Hugging Face repository
# (it is required at inference time to map predicted label numbers back to category names)
files.download('/content/label_encoder.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>