# Finetuning of BERT

In [8]:
import transformers
from transformers import AutoTokenizer

import torch
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from datasets import load_dataset, Features, Value

seed = 6
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x112b53f50>

In [2]:
df = load_dataset('csv', data_files='dataset/recipes_df_r.csv')

In [3]:
# see datatype of df
print(type(df))

<class 'datasets.dataset_dict.DatasetDict'>


In [4]:
df.keys()

dict_keys(['train'])

In [5]:
df['train'].features

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Vegetarian&Desserts': Value(dtype='int64', id=None),
 'Others&D': Value(dtype='int64', id=None),
 'Vegetarian': Value(dtype='int64', id=None),
 'Others': Value(dtype='int64', id=None),
 'Dairy Free': Value(dtype='int64', id=None),
 'Gluten Free': Value(dtype='int64', id=None),
 'Low Carb': Value(dtype='int64', id=None),
 'Low Fat': Value(dtype='int64', id=None),
 'Low Sodium': Value(dtype='int64', id=None)}

In [9]:
# first 5 instances of dataset
df['train'][:5]

{'cooking_method': ["['Remove the small side muscle from the scallops, rinse with cold water and thoroughly pat dry.', 'Add the butter and oil to a 12 to 14-inch saute pan on high heat. Salt and pepper the scallops. Once the fat begins to smoke, gently add the scallops, making sure they are not touching each other. Sear the scallops for 1 1/2 minutes on each side. The scallops should have a 1/4-inch golden crust on each side while still being translucent in the center. Serve immediately.']",
  '[\'With a sharp knife, slit the sausage skins lengthways and pop all the meat out. Using wet hands, roll little balls of sausage meat about the size of large marbles and set aside.\', "Heat a large frying pan and add a good splash of olive oil. Gently fry the sausage balls until golden brown all over, then add the pancetta and continue cooking for a couple of minutes, until it\'s golden. While this is cooking, bring a pan of salted water to the boil, add the linguine, and cook according to the p

## Data Casting
As we can see a few cells above, the first 4 features are already in the wanted data type: `string`. Let's cast the others into `bool` (... and make other adjustments). 

In [14]:
# REMOVE columns Vegetarian, Others
df = df.remove_columns(['Vegetarian', 'Others'])

df['train'].features

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Vegetarian&Desserts': Value(dtype='int64', id=None),
 'Others&D': Value(dtype='int64', id=None),
 'Dairy Free': Value(dtype='int64', id=None),
 'Gluten Free': Value(dtype='int64', id=None),
 'Low Carb': Value(dtype='int64', id=None),
 'Low Fat': Value(dtype='int64', id=None),
 'Low Sodium': Value(dtype='int64', id=None)}

In [16]:
# RENAME columns Vegetarian&Dessert to Veg, Others&Dessert to NonVeg

# Define a function to rename columns
def rename_columns(example):
    # Rename 'Vegetarian&Dessert' to 'Veg'
    if 'Vegetarian&Desserts' in example:
        example['Veg'] = example.pop('Vegetarian&Desserts')
    # Rename 'Others&Dessert' to 'NonVeg'
    if 'Others&D' in example:
        example['NonVeg'] = example.pop('Others&D')
    return example

# Apply the rename_columns function to each example in the dataset
for split in df.keys():
    df[split] = df[split].map(rename_columns)

df['train'].features

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Dairy Free': Value(dtype='int64', id=None),
 'Gluten Free': Value(dtype='int64', id=None),
 'Low Carb': Value(dtype='int64', id=None),
 'Low Fat': Value(dtype='int64', id=None),
 'Low Sodium': Value(dtype='int64', id=None),
 'Veg': Value(dtype='int64', id=None),
 'NonVeg': Value(dtype='int64', id=None)}

In [21]:
# CAST variables to boolean
df['train'] = df['train'].cast(Features({
    'cooking_method' : Value('string'),
    'ingredients' : Value('string'),
    'recipe_name' : Value('string'),
    'tags' : Value('string'),
    'Dairy Free': Value('bool'),
    'Gluten Free': Value('bool'),
    'Low Carb': Value('bool'),
    'Low Fat': Value('bool'),
    'Low Sodium': Value('bool'),
    'Veg': Value('bool'),
    'NonVeg': Value('bool')
}))

df['train'].features

Casting the dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

{'cooking_method': Value(dtype='string', id=None),
 'ingredients': Value(dtype='string', id=None),
 'recipe_name': Value(dtype='string', id=None),
 'tags': Value(dtype='string', id=None),
 'Dairy Free': Value(dtype='bool', id=None),
 'Gluten Free': Value(dtype='bool', id=None),
 'Low Carb': Value(dtype='bool', id=None),
 'Low Fat': Value(dtype='bool', id=None),
 'Low Sodium': Value(dtype='bool', id=None),
 'Veg': Value(dtype='bool', id=None),
 'NonVeg': Value(dtype='bool', id=None)}

## Split

In [23]:
# count how many recipes have 'cooking_method' of more than 512 tokens
count = 0
for i in range(len(df['train'])):
    if len(df['train']['cooking_method'][i].split()) > 512:
        count += 1

print(count)

173


In [24]:
# TODO: continuare da qui in poi

In [24]:
# count how many are above 512
df['cooking_method'].apply(lambda x: len(x.split())).apply(lambda x: x > 512).sum()

173

In [25]:
# drop these instances
df = df[df['cooking_method'].apply(lambda x: len(x.split())).apply(lambda x: x <= 512)]

In [26]:
df['cooking_method'].apply(lambda x: len(x.split())).apply(lambda x: x > 512).sum()

0

In [27]:
X = df['cooking_method']
y = df['Veg']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)

In [28]:
len(X_train), len(X_val), len(X_test)

(5502, 1376, 2949)

In [29]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [34]:
def preprocess_data(data):
    # take a batch of texts
    text = data["cooking_method"]
    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
    
    label = data["Veg"]
    
    # create numpy array of shape (batch_size, 1) for single label
    labels_matrix = np.array(label).reshape(-1, 1)
    
    encoding["labels"] = labels_matrix.tolist()
    
    return encoding

In [35]:
encoded_dataset = df.map(preprocess_data, batched=True)

TypeError: preprocess_data() got an unexpected keyword argument 'batched'