# Finetuning of BERT

In [41]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import torch
import numpy as np
import pandas as pd
import random


from sklearn.model_selection import train_test_split
from datasets import load_dataset, Features, Value, DatasetDict

# Seed Python's, NumPy's and PyTorch's RNGs with the same value; `seed` is
# also passed to the train/validation/test splits further down.
seed = 6
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x13a7081d0>

In [3]:
# Load the recipes CSV; the result is a DatasetDict whose only split is 'train'
# (see the type and keys printed in the next cells).
df = load_dataset('csv', data_files='dataset/recipes_df_r.csv')

In [4]:
# see datatype of df
print(type(df))

<class 'datasets.dataset_dict.DatasetDict'>


In [5]:
df.keys()

dict_keys(['train'])

In [6]:
df['train'].features

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Vegetarian&Desserts': Value(dtype='int64', id=None),
 'Others&D': Value(dtype='int64', id=None),
 'Vegetarian': Value(dtype='int64', id=None),
 'Others': Value(dtype='int64', id=None),
 'Dairy Free': Value(dtype='int64', id=None),
 'Gluten Free': Value(dtype='int64', id=None),
 'Low Carb': Value(dtype='int64', id=None),
 'Low Fat': Value(dtype='int64', id=None),
 'Low Sodium': Value(dtype='int64', id=None)}

In [7]:
# first 5 instances of dataset
df['train'][:5]

{'cooking_method': ["['Remove the small side muscle from the scallops, rinse with cold water and thoroughly pat dry.', 'Add the butter and oil to a 12 to 14-inch saute pan on high heat. Salt and pepper the scallops. Once the fat begins to smoke, gently add the scallops, making sure they are not touching each other. Sear the scallops for 1 1/2 minutes on each side. The scallops should have a 1/4-inch golden crust on each side while still being translucent in the center. Serve immediately.']",
  '[\'With a sharp knife, slit the sausage skins lengthways and pop all the meat out. Using wet hands, roll little balls of sausage meat about the size of large marbles and set aside.\', "Heat a large frying pan and add a good splash of olive oil. Gently fry the sausage balls until golden brown all over, then add the pancetta and continue cooking for a couple of minutes, until it\'s golden. While this is cooking, bring a pan of salted water to the boil, add the linguine, and cook according to the p

## Data Casting
As the feature listing a few cells above shows, the first four columns already have the desired data type, `string`. Let's cast the remaining label columns to `bool` (after a few other adjustments: dropping and renaming some columns).

In [8]:
# REMOVE columns Vegetarian, Others
# NOTE(review): `df` is reassigned here, so re-running this cell presumably
# fails because the two columns are already gone — confirm.
df = df.remove_columns(['Vegetarian', 'Others'])

# Display the remaining columns and their dtypes.
df['train'].features

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Vegetarian&Desserts': Value(dtype='int64', id=None),
 'Others&D': Value(dtype='int64', id=None),
 'Dairy Free': Value(dtype='int64', id=None),
 'Gluten Free': Value(dtype='int64', id=None),
 'Low Carb': Value(dtype='int64', id=None),
 'Low Fat': Value(dtype='int64', id=None),
 'Low Sodium': Value(dtype='int64', id=None)}

In [9]:
# RENAME columns 'Vegetarian&Desserts' to 'Veg' and 'Others&D' to 'NonVeg'

# Define a function to rename columns
def rename_columns(example):
    """Move the two combined diet flags to shorter keys, editing `example` in place.

    'Vegetarian&Desserts' becomes 'Veg' and 'Others&D' becomes 'NonVeg'.
    A key that is not present is skipped. The same mapping that was passed in
    is returned.
    """
    renames = (
        ('Vegetarian&Desserts', 'Veg'),
        ('Others&D', 'NonVeg'),
    )
    for old_key, new_key in renames:
        if old_key not in example:
            continue
        # pop first, then store under the new key
        example[new_key] = example.pop(old_key)
    return example

# Apply the rename_columns function to each example in the dataset
# (every split is rewritten; as the listing below shows, the renamed
# columns end up last).
for split in df.keys():
    df[split] = df[split].map(rename_columns)

df['train'].features

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Dairy Free': Value(dtype='int64', id=None),
 'Gluten Free': Value(dtype='int64', id=None),
 'Low Carb': Value(dtype='int64', id=None),
 'Low Fat': Value(dtype='int64', id=None),
 'Low Sodium': Value(dtype='int64', id=None),
 'Veg': Value(dtype='int64', id=None),
 'NonVeg': Value(dtype='int64', id=None)}

In [10]:
# CAST: the four text columns stay strings, the seven diet flags become booleans
string_columns = ['cooking_method', 'ingredients', 'recipe_name', 'tags']
bool_columns = ['Dairy Free', 'Gluten Free', 'Low Carb', 'Low Fat', 'Low Sodium', 'Veg', 'NonVeg']

target_features = Features({
    **{name: Value('string') for name in string_columns},
    **{name: Value('bool') for name in bool_columns},
})
df['train'] = df['train'].cast(target_features)

df['train'].features

Casting the dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Dairy Free': Value(dtype='bool', id=None),
 'Gluten Free': Value(dtype='bool', id=None),
 'Low Carb': Value(dtype='bool', id=None),
 'Low Fat': Value(dtype='bool', id=None),
 'Low Sodium': Value(dtype='bool', id=None),
 'Veg': Value(dtype='bool', id=None),
 'NonVeg': Value(dtype='bool', id=None)}

## Split

In [40]:
# TODO: decide whether the train/validation/test split should be stratified on the label

In [11]:
# Count how many recipes have a 'cooking_method' longer than 512
# whitespace-separated words. This is a word count, not a BERT token count:
# the tokenizer can split one word into several tokens.
# The column is read once up front; indexing df['train']['cooking_method']
# inside the loop may materialize the whole column on every iteration.
cooking_methods = df['train']['cooking_method']
count = sum(1 for text in cooking_methods if len(text.split()) > 512)

print(count)

173


In [12]:
# drop these instances
# (same whitespace-word criterion as the count above: keep rows whose
# 'cooking_method' has at most 512 words)
df['train'] = df['train'].filter(lambda x: len(x['cooking_method'].split()) <= 512)

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [13]:
# Sanity check after filtering: no recipe should have a 'cooking_method'
# longer than 512 whitespace-separated words (a word count, not a BERT
# token count).
# The column is read once up front; indexing df['train']['cooking_method']
# inside the loop may materialize the whole column on every iteration.
cooking_methods = df['train']['cooking_method']
count = sum(1 for text in cooking_methods if len(text.split()) > 512)

print(count)

0


In [16]:
# First hold out 30% of the rows as the test split, then carve 20% of the
# remaining rows out as the validation split. Both splits use the same seed.
# NOTE(review): `df` is overwritten with the train/test pair, so re-running
# this cell presumably splits an already reduced train set — confirm.
df = df['train'].train_test_split(test_size=0.3, seed=seed)
train_validation = df['train'].train_test_split(test_size=0.2, seed=seed)

dataset = DatasetDict({
    'test': df['test'],
    'train': train_validation['train'],
    'validation': train_validation['test'],
})
dataset.shape

{'test': (2949, 11), 'train': (5502, 11), 'validation': (1376, 11)}

**Note:** from this point on, switching to the multi-label case is straightforward.

In [17]:
dataset['train'].features.keys()

dict_keys(['cooking_method', 'ingredients', 'recipe_name', 'tags', 'Dairy Free', 'Gluten Free', 'Low Carb', 'Low Fat', 'Low Sodium', 'Veg', 'NonVeg'])

In [18]:
# Every column listed here is excluded, which currently leaves only 'Veg' as
# the target label.
excluded_columns = {
    'cooking_method', 'ingredients', 'recipe_name', 'tags',
    'Dairy Free', 'Gluten Free', 'Low Carb', 'Low Fat', 'Low Sodium', 'NonVeg',
}
labels = [column for column in dataset['train'].features if column not in excluded_columns]
labels

['Veg']

In [20]:
# Two-way lookup between label names and their column index in the label matrix.
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

id2label, label2id

({0: 'Veg'}, {'Veg': 0})

In [21]:
# Tokenizer for the cased BERT base checkpoint; the same checkpoint name is
# used when the model is loaded later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [22]:
def preprocess_data(examples):
    """Tokenize a batch of recipes and attach their label matrix.

    `examples` is a batch (column name -> list of values). The
    'cooking_method' texts are tokenized, padded and truncated to 512 tokens.
    The result gains a "labels" entry: one row per text and one float column
    per name in the notebook-level `labels` list.

    Relies on the notebook-level `tokenizer` and `labels`.
    """
    texts = examples["cooking_method"]
    encoding = tokenizer(texts, padding="max_length", truncation=True, max_length=512)

    # (batch_size, num_labels) matrix, filled one label column at a time
    targets = np.zeros((len(texts), len(labels)))
    for column, name in enumerate(labels):
        targets[:, column] = examples[name]

    encoding["labels"] = targets.tolist()
    return encoding

In [23]:
# Run preprocess_data over every split in batches and drop all original
# columns, so only the tokenizer outputs and 'labels' remain.
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/2949 [00:00<?, ? examples/s]

Map:   0%|          | 0/5502 [00:00<?, ? examples/s]

Map:   0%|          | 0/1376 [00:00<?, ? examples/s]

In [26]:
encoded_dataset['train'].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}

In [30]:
i = encoded_dataset['train'][0]['input_ids']
len(i)

512

In [31]:
tokenizer.decode(i)

"[CLS] ['Preheat the oven to 350 degrees F. Grease six 12 - ounce ramekins with 2 tablespoons of the butter and set aside on a rimmed baking sheet. ','Spread the brioche on a second rimmed baking sheet and bake, tossing halfway through, until lightly browned, about 15 minutes. ','Melt the remaining 2 tablespoons butter in a large skillet over medium - high heat. Add the sausage and squash and cook, breaking up the meat with the back of a spoon, until the sausage is browned and the squash is tender, about 8 minutes. Stir in the leeks, thyme, 1 teaspoon salt and 1 / 2 teaspoon pepper and continue to cook, stirring often, until the leeks are wilted and softened, about 5 minutes. ','Whisk together the chicken broth, cream, parsley, chives and eggs in a medium bowl. Add half the brioche to the ramekins. Divide the sausage mixture among the ramekins ; top with 1 cup of the Gruyere and the remaining brioche. Pour the egg mixture over and sprinkle with the remaining 1 cup Gruyere. ','Set aside

In [35]:
encoded_dataset['train'][0]['labels']

[0.0]

In [34]:
# !!! useful for multi-labels case: see which labels it has
[id2label[idx] for idx, label in enumerate(encoded_dataset['train'][0]['labels']) if label == 1.0]

[]

In [36]:
encoded_dataset.set_format("torch")

In [37]:
type(encoded_dataset)

datasets.dataset_dict.DatasetDict

In [39]:
# Each example carries one float target per entry in `labels` (currently the
# single 'Veg' column). "multi_label_classification" is the problem type that
# trains on such float targets with a per-label sigmoid / BCE-with-logits
# loss; with one label this is ordinary binary classification, and it keeps
# working unchanged when more label columns are added.
# "binary_classification" is not a problem type that transformers recognizes:
# the model's forward pass would match none of its loss branches and return
# no loss, so training could not run.
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id
                                                           )

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train the model!

In [44]:
args = TrainingArguments(
    output_dir="google-bert/bert-base-cased",
    # evaluate and checkpoint on the same schedule: once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # optimisation settings
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # keep the checkpoint with the best 'f1'
    # NOTE(review): 'f1' presumably has to be reported by a compute_metrics
    # function passed to the Trainer; none is visible here — confirm.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    #push_to_hub=True,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`