# Finetuning of BERT: Single Label (Veg)

In [56]:
import random

import numpy as np
import pandas as pd
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from datasets import ClassLabel, load_dataset, Features, Value, DatasetDict
from tqdm import tqdm

# Fix the seed of every RNG used directly in this notebook (Python, NumPy, PyTorch).
seed = 6
random.seed(seed)
np.random.seed(seed)
# torch.manual_seed returns the Generator object, which is what this cell displays.
torch.manual_seed(seed)

<torch._C.Generator at 0x2ab7d1d4ff0>

In [57]:
# Load the cleaned recipes CSV. Despite its name, `df` is a DatasetDict holding a single 'train' split (see the outputs below).
df = load_dataset('csv', data_files='dataset/recipes_80k_cleaned.csv')

In [58]:
# see datatype of df
print(type(df))

<class 'datasets.dataset_dict.DatasetDict'>


In [59]:
df.keys()

dict_keys(['train'])

In [60]:
df['train'].features

{'Unnamed: 0': Value(dtype='int64', id=None),
 'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Vegetarian&Desserts': Value(dtype='int64', id=None),
 'Vegetarian': Value(dtype='int64', id=None),
 'Dairy Free': Value(dtype='int64', id=None),
 'Gluten Free': Value(dtype='int64', id=None),
 'Low Carb': Value(dtype='int64', id=None),
 'Low Fat': Value(dtype='int64', id=None),
 'Low Sodium': Value(dtype='int64', id=None)}

In [61]:
# drop the first column ('Unnamed: 0' — presumably a leftover row index written when the CSV was saved)
df['train'] = df['train'].remove_columns('Unnamed: 0')

In [62]:
# first 5 instances of dataset
df['train'][:5]

{'cooking_method': ["['Set the racks in the middle and upper thirds of the oven and preheat the oven to 425 F', 'In a large skillet over medium heat, heat the olive oil until shimmering. Add the onion, garlic and red pepper flakes and cook until golden, stirring occasionally, about 5 minutes.', 'Add the fennel and cook until the vegetables are soft and translucent, an additional 3 to 5 minutes.', 'Reduce the heat to medium and add the tomatoes with their juices. Using the back of a wooden spoon, smash the tomatoes and cook for 5 minutes.', 'Add the basil, wine, olives, 1 teaspoon salt, and 1/8 teaspoon black pepper.', 'Reduce to low and simmer for 15 minutes, or until the sauce is slightly thickened, while you prepare the fish.', 'Pat the fillets dry, lightly spray them with cooking spray, and season with salt and pepper.', 'In a heavy ovenproof skillet over high heat, heat the olive oil until shimmering. Add the fillets, rounded-side down, and cook for 2 minutes.', 'Carefully flip the

## Data Casting
As the feature listing a few cells above shows, the first four columns already have the desired data type (`string`). Let's cast the remaining label columns to `bool` (and make a few other adjustments, such as dropping and renaming columns).

In [63]:
# REMOVE the 'Vegetarian' column: 'Vegetarian&Desserts' is the label kept from here on (it is renamed to 'Veg' in the next cell)
df = df.remove_columns(['Vegetarian'])

df['train'].features

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Vegetarian&Desserts': Value(dtype='int64', id=None),
 'Dairy Free': Value(dtype='int64', id=None),
 'Gluten Free': Value(dtype='int64', id=None),
 'Low Carb': Value(dtype='int64', id=None),
 'Low Fat': Value(dtype='int64', id=None),
 'Low Sodium': Value(dtype='int64', id=None)}

In [64]:
# RENAME columns: 'Vegetarian&Desserts' -> 'Veg' (and 'Others&D' -> 'NonVeg', if that column is present)

# Define a function to rename columns
def rename_columns(example):
    """Rename the label keys of a single example in place and return it.

    'Vegetarian&Desserts' becomes 'Veg' and 'Others&D' becomes 'NonVeg'.
    Keys that are absent are skipped; renamed keys end up last in the dict.
    """
    renames = (
        ('Vegetarian&Desserts', 'Veg'),
        ('Others&D', 'NonVeg'),
    )
    for old_key, new_key in renames:
        if old_key in example:
            example[new_key] = example.pop(old_key)
    return example

# Apply the rename_columns function to each example in the dataset
# (map rewrites the rows one split at a time; only the 'train' split exists at this point)
for split in df.keys():
    df[split] = df[split].map(rename_columns)

df['train'].features

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Dairy Free': Value(dtype='int64', id=None),
 'Gluten Free': Value(dtype='int64', id=None),
 'Low Carb': Value(dtype='int64', id=None),
 'Low Fat': Value(dtype='int64', id=None),
 'Low Sodium': Value(dtype='int64', id=None),
 'Veg': Value(dtype='int64', id=None)}

In [65]:
# CAST the label columns from int64 to bool; the four text columns stay strings
text_columns = ['cooking_method', 'ingredients', 'recipe_name', 'tags']
flag_columns = ['Dairy Free', 'Gluten Free', 'Low Carb', 'Low Fat', 'Low Sodium', 'Veg']

bool_schema = Features({
    **{name: Value('string') for name in text_columns},
    **{name: Value('bool') for name in flag_columns},
})
df['train'] = df['train'].cast(bool_schema)

df['train'].features

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Dairy Free': Value(dtype='bool', id=None),
 'Gluten Free': Value(dtype='bool', id=None),
 'Low Carb': Value(dtype='bool', id=None),
 'Low Fat': Value(dtype='bool', id=None),
 'Low Sodium': Value(dtype='bool', id=None),
 'Veg': Value(dtype='bool', id=None)}

## Split

In [66]:
# Percentage of Veg recipes
sum(df['train']['Veg'])/len(df['train'])*100

33.50841116864455

In [67]:
# count how many recipes have a 'cooking_method' longer than 512 whitespace-separated words (a rough proxy for BERT's 512-token limit)

"""count = 0
for i in tqdm(range(len(df['train']))):
    if len(df['train']['cooking_method'][i].split()) > 512:
        count += 1

print(count)"""

"count = 0\nfor i in tqdm(range(len(df['train']))):\n    if len(df['train']['cooking_method'][i].split()) > 512:\n        count += 1\n\nprint(count)"

In [68]:
# drop these instances: keep only recipes whose 'cooking_method' has at most 512
# whitespace-separated words (this is a word count, not a BERT token count)
df['train'] = df['train'].filter(lambda x: len(x['cooking_method'].split()) <= 512)

In [69]:

len(df['train'])

79375

In [70]:
sum(df['train']['Veg'])/len(df['train'])*100

33.1867716535433

In [74]:
# order the dataset based on 'Veg' label: reverse=True puts the Veg == True rows first,
# which the class-balancing selection a few cells below relies on
df['train'] = df['train'].sort('Veg', reverse=True)





In [75]:
#count the number of instances with 'Veg' label as True
sum(df['train']['Veg'])

26342

In [76]:
# Balance the classes: keep every Veg recipe plus an equal number of non-Veg ones.
# Relies on the dataset being sorted with Veg == True first (see the sort cell above),
# so the first 2 * n_veg rows are all the positives followed by n_veg negatives.
# The count is computed instead of hard-coded so the cell survives changes to the data.
n_veg = sum(df['train']['Veg'])
n_keep = min(2 * n_veg, len(df['train']))  # guard: there may be fewer negatives than positives
df['train'] = df['train'].select(range(n_keep))

In [78]:
len(df['train'])

52684

In [79]:
sum(df['train']['Veg'])

26342

In [83]:
# shuffle the dataset so the two classes are mixed again after the sort (seeded, hence repeatable)
df['train'] = df['train'].shuffle(seed=seed)

In [84]:
# column we want to stratify with respect to
stratify_column_name = "Veg"

# encode the bool column as a ClassLabel feature (names 'False'/'True', see the features printed below)
# NOTE(review): presumably needed because train_test_split(stratify_by_column=...) only accepts ClassLabel columns — confirm
df['train'] = df['train'].class_encode_column(stratify_column_name)

Stringifying the column:   0%|          | 0/52684 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/52684 [00:00<?, ? examples/s]

In [85]:
df['train'].features

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Dairy Free': Value(dtype='bool', id=None),
 'Gluten Free': Value(dtype='bool', id=None),
 'Low Carb': Value(dtype='bool', id=None),
 'Low Fat': Value(dtype='bool', id=None),
 'Low Sodium': Value(dtype='bool', id=None),
 'Veg': ClassLabel(names=['False', 'True'], id=None)}

In [86]:
# see percentage of Veg == True 
sum(df['train']['Veg'])/len(df['train'])*100

50.0

In [87]:
# split the dataset into train, validation and test stratifying with respect to Veg:
# 30% goes to test, then the remaining 70% is split 80/20 into train and validation.
# The intermediate split gets its own name instead of overwriting `df`, so re-running
# this cell always starts from the full balanced dataset rather than re-splitting a subset.
train_test = df['train'].train_test_split(test_size=0.3, seed=seed, stratify_by_column='Veg')
train_validation = train_test['train'].train_test_split(test_size=0.2, seed=seed, stratify_by_column='Veg')

dataset = DatasetDict()
dataset['test'] = train_test['test']
dataset['train'] = train_validation['train']
dataset['validation'] = train_validation['test']

# every row must land in exactly one split
assert sum(dataset.num_rows.values()) == len(df['train'])
dataset.shape

{'test': (15806, 10), 'train': (29502, 10), 'validation': (7376, 10)}

In [88]:
# CAST 'Veg' (a ClassLabel since the stratification step) back to boolean in every split;
# all other columns keep the types they already have
string_cols = ('cooking_method', 'ingredients', 'recipe_name', 'tags')
bool_cols = ('Dairy Free', 'Gluten Free', 'Low Carb', 'Low Fat', 'Low Sodium', 'Veg')

column_types = {
    **dict.fromkeys(string_cols, 'string'),
    **dict.fromkeys(bool_cols, 'bool'),
}
dataset = dataset.cast(Features({col: Value(kind) for col, kind in column_types.items()}))

Casting the dataset:   0%|          | 0/15806 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/29502 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7376 [00:00<?, ? examples/s]

In [89]:
dataset['train'].features

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Dairy Free': Value(dtype='bool', id=None),
 'Gluten Free': Value(dtype='bool', id=None),
 'Low Carb': Value(dtype='bool', id=None),
 'Low Fat': Value(dtype='bool', id=None),
 'Low Sodium': Value(dtype='bool', id=None),
 'Veg': Value(dtype='bool', id=None)}

In [90]:
# print the distribution of each label: Veg, Dairy Free, Gluten Free, Low Carb, Low Fat, Low Sodium in the train, test and val set

def class_distribution(dataset, column):
    """Return a one-line summary of the 1/0 percentages of a binary column.

    Args:
        dataset: object supporting ``dataset[column]`` (iterable of 0/1 or bool
            values) and ``len(dataset)`` (number of rows).
        column: name of the column to summarise; 'Veg' is displayed as 'Vegetarian'.

    Returns:
        A tab-separated string with both percentages rounded to one decimal.
    """
    total = len(dataset)
    positives = sum(dataset[column])
    negatives = total - positives
    share_pos = positives / total * 100
    share_neg = negatives / total * 100
    display_name = 'Vegetarian' if column == 'Veg' else column
    return f"{display_name} 1: \t {round(share_pos, 1)}% \t {display_name} 0:\t {round(share_neg, 1)}%"


# Print, for every split, the 1/0 percentage of each label column.
for split in dataset.keys():
    print(split)
    for column, feature in dataset[split].features.items():
        # A ClassLabel feature reports dtype 'int64', so comparing its dtype with the
        # string 'ClassLabel' can never match; check the feature type itself instead.
        if isinstance(feature, ClassLabel) or feature.dtype == 'bool':
            print(class_distribution(dataset[split], column))
    print("\n")

test
Dairy Free 1: 	 0.9% 	 Dairy Free 0:	 99.1%
Gluten Free 1: 	 44.2% 	 Gluten Free 0:	 55.8%
Low Carb 1: 	 4.9% 	 Low Carb 0:	 95.1%
Low Fat 1: 	 12.3% 	 Low Fat 0:	 87.7%
Low Sodium 1: 	 23.5% 	 Low Sodium 0:	 76.5%
Vegetarian 1: 	 50.0% 	 Vegetarian 0:	 50.0%


train
Dairy Free 1: 	 0.8% 	 Dairy Free 0:	 99.2%
Gluten Free 1: 	 43.9% 	 Gluten Free 0:	 56.1%
Low Carb 1: 	 4.8% 	 Low Carb 0:	 95.2%
Low Fat 1: 	 12.6% 	 Low Fat 0:	 87.4%
Low Sodium 1: 	 23.8% 	 Low Sodium 0:	 76.2%
Vegetarian 1: 	 50.0% 	 Vegetarian 0:	 50.0%


validation
Dairy Free 1: 	 0.6% 	 Dairy Free 0:	 99.4%
Gluten Free 1: 	 43.7% 	 Gluten Free 0:	 56.3%
Low Carb 1: 	 4.9% 	 Low Carb 0:	 95.1%
Low Fat 1: 	 12.3% 	 Low Fat 0:	 87.7%
Low Sodium 1: 	 23.3% 	 Low Sodium 0:	 76.7%
Vegetarian 1: 	 50.0% 	 Vegetarian 0:	 50.0%




**Note:** from this point on the code is written in a multi-label style, so switching to the multi-label case is straightforward.

In [23]:
dataset['train'].features.keys()

dict_keys(['cooking_method', 'ingredients', 'recipe_name', 'tags', 'Dairy Free', 'Gluten Free', 'Low Carb', 'Low Fat', 'Low Sodium', 'Veg'])

In [24]:
# every column that is neither a text field nor one of the excluded diet flags is a target label
non_label_columns = {
    'cooking_method', 'ingredients', 'recipe_name', 'tags',
    'Dairy Free', 'Gluten Free', 'Low Carb', 'Low Fat', 'Low Sodium', 'NonVeg',
}
labels = [name for name in dataset['train'].features if name not in non_label_columns]
labels

['Veg']

In [25]:
# index <-> label-name lookup tables, passed to the model config further down
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

id2label, label2id

({0: 'Veg'}, {'Veg': 0})

In [26]:
# tokenizer of the same checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")



In [27]:
def preprocess_data(examples):
  """Tokenize a batch of recipes and attach a float label matrix.

  Reads the module-level `tokenizer` and `labels`. The 'cooking_method' texts are
  padded/truncated to 512 tokens; the returned encoding gains a 'labels' entry
  with one row per example and one float column per name in `labels`.
  """
  texts = examples["cooking_method"]
  batch = tokenizer(texts, padding="max_length", truncation=True, max_length=512)

  # (batch_size, num_labels) matrix, filled one label column at a time
  target = np.zeros((len(texts), len(labels)))
  for col, name in enumerate(labels):
    target[:, col] = examples[name]

  batch["labels"] = target.tolist()
  return batch

In [28]:
# tokenize every split in batches; the raw columns are removed, so only the tokenizer outputs and 'labels' remain
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/23813 [00:00<?, ? examples/s]

Map:   0%|          | 0/44449 [00:00<?, ? examples/s]

Map:   0%|          | 0/11113 [00:00<?, ? examples/s]

In [29]:
encoded_dataset['train'].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}

In [29]:
i = encoded_dataset['train'][0]['input_ids']
len(i)

512

In [37]:
encoded_dataset['train']['labels'][0]

[0.0]

In [30]:
tokenizer.decode(i)

"[CLS] ['Preheat a grill or grill pan for cooking at medium - high heat. ','Insert a 6 - inch caramel apple skewer a couple of inches into the base of each ear of corn. Grill the corn, turning occasionally, until tender and charred in spots, about 10 minutes. ','Meanwhile, mix together the mayonnaise, smoked paprika, lemon zest and juice and 1 / 4 teaspoon salt. ','Slather the seasoned mayonnaise all over the corn and sprinkle with the Parmesan. Dust lightly with chipotle powder and serve.'] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [31]:
encoded_dataset['train'][0]['labels']

[0.0]

In [32]:
# !!! useful for multi-labels case: see which labels it has
[id2label[idx] for idx, label in enumerate(encoded_dataset['train'][0]['labels']) if label == 1.0]

[]

In [33]:
# make the dataset return PyTorch tensors when indexed
encoded_dataset.set_format("torch")

In [34]:
type(encoded_dataset)

datasets.dataset_dict.DatasetDict

In [35]:
# Pretrained BERT with a freshly initialised classification head.
# problem_type must be set explicitly: with num_labels == 1 and no problem_type,
# Transformers falls back to regression (MSE loss on the raw logit), which does not
# match the float 0/1 targets and the sigmoid + threshold evaluation used in this
# notebook. "multi_label_classification" selects BCEWithLogitsLoss instead.
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id
                                                           )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train the model!

In [None]:
args = TrainingArguments(
    # Local checkpoint directory. Do not reuse the hub id "google-bert/bert-base-cased" here:
    # training would create a local folder with that name, and on the next run
    # from_pretrained("google-bert/bert-base-cased") would resolve to that folder
    # instead of the hub checkpoint.
    output_dir="./models/bert-base-cased-veg-checkpoints",
    # evaluate and checkpoint on the same schedule (needed for load_best_model_at_end)
    evaluation_strategy = "steps",
    save_strategy = "steps",
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    # keep the checkpoint with the best validation F1 ('f1' key of compute_metrics)
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    # the Trainer re-seeds with args.seed (default 42); pass the notebook seed to stay consistent
    seed=seed,
    #push_to_hub=True,
)

In [None]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-F1, micro ROC AUC and accuracy from raw logits.

    Args:
        predictions: raw logits, array-like of shape (batch_size, num_labels).
        labels: 0/1 ground truth with the same shape as `predictions`.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    # next, use threshold to turn them into 0/1 predictions
    y_pred = (probs >= threshold).astype(float)
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric: score it on the probabilities, not on the
    # thresholded predictions (which would reduce the curve to a single point)
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    """Metric callback for the Trainer: forward logits and label ids to multi_label_metrics.

    When `p.predictions` is a tuple only its first element is used.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [None]:
# Wire model, training arguments, data and metrics together.
# Evaluation during training runs on the validation split.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

# evaluate

In [None]:
# called without arguments, so this scores the eval_dataset given to the Trainer (the validation split)
# NOTE(review): the held-out 'test' split is not evaluated in the visible cells — consider trainer.evaluate(encoded_dataset["test"])
trainer.evaluate()

In [None]:
# save the model currently held by the trainer (load_best_model_at_end=True is set in the training arguments) to a local folder
trainer.save_model("./models/bert-finetuned-group15")

In [None]:
# Quick sanity check on a hand-written example.
text = "meat meat meat meat"

# truncate as during training so longer inputs cannot exceed the 512-token limit
encoding = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

# inference only: switch off dropout and skip building the autograd graph
trainer.model.eval()
with torch.no_grad():
    outputs = trainer.model(**encoding)
logits = outputs.logits

In [None]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
# squeeze only the batch axis so probs keeps shape (num_labels,) even with a single
# label (a bare squeeze() would give a 0-d tensor that cannot be enumerated below);
# detach so the result is a plain tensor even if logits still carry a graph
probs = sigmoid(logits.squeeze(0).detach().cpu())
predictions = np.zeros(probs.shape)
# uncomment the lines below to get the names of the predicted labels
# predictions[np.where(probs >= 0.7)] = 1
# turn predicted id's into actual label names
#predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
probs