In [None]:
# The code in this notebook was originally run as a Python script on an external GPU.
# It therefore contains code to run on a GPU, and it saves plots to disk instead of showing them.

In [None]:
# Importing the needed packages
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, auc, roc_curve, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm import tqdm
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from transformers import BertTokenizerFast as BertTokenizer, AdamW
import seaborn as sns

In [None]:
# link to the dataset used for this thesis
# https://www.kaggle.com/datasets/dkapitan/dutch-restaurant-reviews

In [None]:
# Read dataset and transfer it to a dataframe
# `file_path` has to point at the parquet file before this cell runs. The former
# placeholder `file_path = file_path` failed with a bare NameError on a fresh kernel;
# the guard keeps an already-defined path untouched and otherwise says what to do.
if "file_path" not in globals():
    raise NameError("Set `file_path` to the location of the parquet file before running this cell")
df_raw_data = pd.read_parquet(file_path) # load the data

In [None]:
# Quick look at the raw dataset: the first five rows, then the total number of rows
raw_preview = df_raw_data.head(5)
raw_row_count = len(df_raw_data)
print(raw_preview)
print(raw_row_count)

In [None]:
# Necessary preprocessing steps for thesis
df_pp_data = df_raw_data.copy()

# Remove the mis-encoded currency symbol (presumably the euro sign) from the avgPrice column.
# regex = False treats both patterns as literal text on every pandas version.
for broken_symbol in ("â\u0082¬", "\u0080"):
    df_pp_data["avgPrice"] = df_pp_data["avgPrice"].str.replace(broken_symbol, "", regex = False)

# Convert the necessary numeric columns to numeric; values that cannot be parsed become NaN
columns_to_convert = ["scoreTotal", "avgPrice", "reviewerNumReviews", "reviewScoreOverall", "reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance"]
df_pp_data[columns_to_convert] = df_pp_data[columns_to_convert].apply(pd.to_numeric, errors = "coerce")

# Remove reviews that are not positive and not negative: a 5 or 6 on any aspect drops the row.
# The explicit copy keeps the column assignments below from writing into a slice.
columns_to_change = ["reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance"]
has_neutral_score = df_pp_data[columns_to_change].isin([5, 6]).any(axis = 1)
df_pp_data = df_pp_data[~has_neutral_score].copy()

# change the personal review scores to sentiments
# pd.cut builds right-closed intervals, i.e. (1, 6] and (6, 10]. Without include_lowest
# a score of exactly 1 falls outside the first interval and becomes NaN, so the most
# negative reviews would be lost in the later dropna. include_lowest makes it [1, 6].
bins = [1,6,10]
labels = ["Negative", "Positive"]
for column in columns_to_change:
    df_pp_data[column] = pd.cut(df_pp_data[column], bins = bins, labels = labels, include_lowest = True)

In [None]:
# Extract the city name from the address
def extract_second_to_last_element(address): # Function to extract the second to last element from the address
    """Return the second-to-last whitespace-separated token of ``address``.

    The notebook stores this token as the city; that presumes every address
    ends in "<city> <one more token>" -- TODO confirm against the data.

    Args:
        address: the address text. Anything that is not a string (for example
            None or NaN for a missing address) is treated as "no address".

    Returns:
        The second-to-last token, or None when the address is missing or has
        fewer than two tokens.
    """
    # A missing address arrives as None/NaN, which has no .split()
    if not isinstance(address, str):
        return None
    address_components = address.split()
    return address_components[-2] if len(address_components) >= 2 else None

# New feature: the token pulled out of every address by extract_second_to_last_element
df_pp_data["City"] = df_pp_data["address"].map(extract_second_to_last_element)

# Number of reviews per distinct City value, most frequent first
unique_values_city = df_pp_data["City"].value_counts()

In [None]:
# Keep only the rows that are complete for every variable the analysis depends on
important_features = [
    "restoId", "avgPrice", "reviewScoreOverall", "reviewScoreFood",
    "reviewScoreService", "reviewScoreAmbiance", "reviewText", "City",
]
df_nomissing = df_pp_data.copy().dropna(subset = important_features)

# Character length of every review, used below to discard extreme lengths
df_nomissing["reviewLength"] = df_nomissing["reviewText"].map(len)

descriptives_length = df_nomissing["reviewLength"].describe()

# Keep reviews of 20 up to and including 2000 characters; shorter and longer ones are removed
length_in_range = df_nomissing["reviewLength"].between(20, 2000)
df_lessreviews = df_nomissing.copy()[length_in_range]

# Thin out the reviews whose three aspect sentiments are all positive. This reduces the
# class imbalance (the data stays imbalanced, only less so) and shrinks the dataset to a
# size that is computationally feasible.

sentiment_columns = ["reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance"]
df_sentiment = df_lessreviews.loc[:, sentiment_columns]

# True for the reviews in which every aspect is labelled "Positive"
positive_rows = (df_sentiment == "Positive").all(axis = 1)

# Fixed-size, seeded random draw from the all-positive reviews; exactly these rows are dropped
sampled_positive_rows = df_lessreviews.loc[positive_rows].sample(n = 232830, random_state = 68)

df_lessreviews = df_lessreviews.drop(index = sampled_positive_rows.index)

# Count the remaining positive and negative labels per aspect in the reduced dataframe
counts_food, counts_service, counts_ambiance = (
    df_lessreviews[aspect_column].value_counts()
    for aspect_column in ("reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance")
)

for aspect_counts in (counts_food, counts_service, counts_ambiance):
    print(aspect_counts)

In [None]:
# Encode the sentiments into labels (negative = 0, positive = 1)
class_mapping = {"Negative": 0, "Positive": 1}

# Properly fitted encoder, kept under its old name. It used to be an unfitted
# LabelEncoder with a monkeypatched transform; LabelEncoder sorts its classes, which
# yields the same Negative = 0 / Positive = 1 order as class_mapping.
label_encoder = LabelEncoder().fit(list(class_mapping))

# Map each sentiment column onto an integer label column. The final astype raises when a
# value is missing or not in class_mapping instead of silently storing NaN.
for sentiment_column, label_column in (("reviewScoreFood", "labelFood"),
                                       ("reviewScoreService", "labelService"),
                                       ("reviewScoreAmbiance", "labelAmbiance")):
    df_lessreviews[label_column] = df_lessreviews[sentiment_column].astype(object).map(class_mapping).astype("int64")

# Normalize the price variable
# NOTE(review): the scaler is fitted on the complete dataset, before the
# train/validation/test split, so the price range of the held-out rows leaks into it.
scaler = MinMaxScaler()
df_lessreviews["avgPrice"] = scaler.fit_transform(df_lessreviews[["avgPrice"]])

# Embed the city variable
label_encoder_city = LabelEncoder()
df_lessreviews["city_encoded"] = label_encoder_city.fit_transform(df_lessreviews["City"])
amount_cities = len(label_encoder_city.classes_) # extract the number of unique locations
embedding_dim_city = int(amount_cities**0.5) # use square root rule to define the number of embedding dimensions
embedding_city = nn.Embedding(amount_cities, embedding_dim_city)

# The embedding table is only used as a lookup and is not passed to any optimizer in this
# notebook. Draw its N(0, 1) weights from a private seeded generator so the city vectors
# are identical on every run, and freeze them so lookups do not record autograd history.
embedding_generator = torch.Generator().manual_seed(68)
with torch.no_grad():
    embedding_city.weight.copy_(torch.randn(embedding_city.weight.shape, generator = embedding_generator))
embedding_city.weight.requires_grad_(False)

In [None]:
# Split the data 80/10/10 into a train, validation, and test set
df_final = df_lessreviews

# Step 1: hold out 20% of the rows; step 2: cut that hold-out in half.
# Both steps use the same fixed seed so the split is repeatable.
df_train, df_temp = train_test_split(df_final, test_size= 0.2, random_state=68)
df_validation, df_test = train_test_split(df_temp, test_size= 0.5, random_state=68)

# Report the size of every split
for split_caption, split_frame in (("Training set samples:", df_train),
                                   ("Validation set samples:", df_validation),
                                   ("Test set samples:", df_test)):
    print(split_caption, len(split_frame))

# Optimize and run BERTje without expectations

In [None]:
# Tokenize the data for BERTje
tokenizer_BERTje = BertTokenizer.from_pretrained("wietsedv/bert-base-dutch-cased") # load the tokenizer for BERTje

# The following code for tokenizing is based on the function wrap_examples(examples, tokenizer) available here: https://github.com/wietsedv/bertje/blob/master/finetuning/v1/run_110kDBRD.py
# I used parts of this function to tokenize the reviews and return them as a dataset for my train, validation, and test samples separately.

def encode_split(df_split, tokenizer, max_length = 512):
    """Turn one data split into the tensors BERTje is trained on.

    All reviews of the split go through a single batched tokenizer call. The
    settings are the ones used before for every single review: truncate to
    `max_length`, pad to `max_length`, add special tokens, no token type ids.

    Args:
        df_split: dataframe holding "reviewText" and the three label columns.
        tokenizer: the loaded BERTje tokenizer.
        max_length: number of tokens every review is truncated/padded to.

    Returns:
        (input_ids, attention_mask, labels): two long tensors with one row of
        `max_length` entries per review and a float32 tensor with the columns
        labelFood, labelService, labelAmbiance.
    """
    encoded = tokenizer(df_split["reviewText"].tolist(), max_length = max_length, truncation = True,
                        add_special_tokens = True, padding = "max_length",
                        return_token_type_ids = False, return_tensors = "pt")
    split_labels = torch.tensor(df_split[["labelFood", "labelService", "labelAmbiance"]].values, dtype = torch.float32)
    return encoded["input_ids"].long(), encoded["attention_mask"].long(), split_labels

# Tokenize the reviews in the training, validation, and test set
train_input_ids, train_input_masks, train_labels = encode_split(df_train, tokenizer_BERTje)
validation_input_ids, validation_input_masks, validation_labels = encode_split(df_validation, tokenizer_BERTje)
test_input_ids, test_input_masks, test_labels = encode_split(df_test, tokenizer_BERTje)

# Wrap the tensors in datasets and loaders: training batches are drawn in random order,
# validation and test batches keep the row order
batch_size = 16
train_data = TensorDataset(train_input_ids, train_input_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

validation_data = TensorDataset(validation_input_ids, validation_input_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler = validation_sampler, batch_size = batch_size)

test_data = TensorDataset(test_input_ids, test_input_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size)

In [None]:
# Define a function to optimize the parameters for the BERTje model without expectations

# This function is based on the function train_model(model, device, train_dataloader, dev_dataloader, output_path, num_epochs=4, lr=2e-5, eps=1e-8, max_grad_norm=1.0, seed=4327)
# Retrieved from: https://github.com/wietsedv/bertje/blob/master/finetuning/v1/run_110kDBRD.py
# The function from de Vries et al. (2019) trains with a single set of hyperparameters; my function is run for every combination of a hyperparameter grid and also has an early stopping criterion

def train_and_evaluate(lr, batch_size, epochs, train_dataset, val_dataset, patience = 1):
    """Fine-tune BERTje for one hyperparameter setting, with early stopping.

    A fresh "wietsedv/bert-base-dutch-cased" sequence classifier with three
    multi-label outputs is trained with AdamW and a BCE-with-logits loss.
    After every epoch the mean validation loss is computed; training stops as
    soon as it has not improved for `patience` consecutive epochs.

    Args:
        lr: learning rate handed to AdamW.
        batch_size: batch size of the training and the validation loader.
        epochs: maximum number of epochs.
        train_dataset: dataset yielding (input_ids, attention_mask, labels).
        val_dataset: dataset with the same layout, used for early stopping.
        patience: number of epochs without improvement that ends training.

    Returns:
        (best_val_loss, model, train_losses, val_losses). `model` holds the
        weights of the epoch with the lowest validation loss; the two lists
        contain the mean loss of every epoch that was run.

    Note:
        Reads the module-level `device`, which must exist before the call.
    """
    # Initialize DataLoader with the given batch size
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

    # Initialize model
    model = BertForSequenceClassification.from_pretrained("wietsedv/bert-base-dutch-cased", problem_type="multi_label_classification", num_labels=3)
    model.to(device)

    # Define optimizer and loss. The loss is created once, up front, so the validation
    # loop never depends on the training loop having executed.
    optimizer = AdamW(model.parameters(), lr=lr)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    best_val_loss = float('inf')
    best_model_state = None
    epochs_no_improve = 0

    # Make lists to store train and validation losses
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        model.train()
        train_loss = 0 # reset training loss

        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")

        for batch in progress_bar:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            optimizer.zero_grad()

            # Labels are not passed to the model: the loss is computed right below, so
            # the model's own loss computation would only be thrown away.
            logits = model(input_ids, attention_mask=attention_mask).logits
            loss = loss_fn(logits, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_dataloader)
        train_losses.append(train_loss)

        model.eval()
        val_loss = 0 # reset validation loss

        with torch.no_grad():
            for val_batch in val_dataloader:
                val_input_ids, val_attention_mask, val_labels = (tensor.to(device) for tensor in val_batch)

                val_logits = model(val_input_ids, attention_mask=val_attention_mask).logits

                val_loss += loss_fn(val_logits, val_labels).item()

        val_loss /= len(val_dataloader)
        val_losses.append(val_loss)

        # Insert early stopping to prevent overfitting
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands out references to the live parameters, which the next
            # optimizer steps overwrite. Store an independent CPU copy so that this
            # really remains the best epoch.
            best_model_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Put the weights of the best epoch back before handing the model to the caller
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return best_val_loss, model, train_losses, val_losses

In [None]:
# Hyperparameter grid: every learning rate is combined with every batch size
learning_rates = [1e-5, 2e-5, 3e-5, 4e-5, 5e-5]
batch_sizes = [8, 16, 32]
epochs = 10

# Best configuration found so far
best_val_loss, best_hyperparams, best_model = float('inf'), None, None

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Loss curves of every configuration, stored under the key "lr_<lr>_bs_<batch size>"
all_train_losses, all_val_losses = {}, {}

# Train and validate the BERTje model once for every grid point
for lr in learning_rates:
    for batch_size in batch_sizes:
        print(f"Training with lr={lr}, batch_size={batch_size}")
        val_loss, model, train_losses, val_losses = train_and_evaluate(lr, batch_size, epochs, train_data, validation_data)

        key = f"lr_{lr}_bs_{batch_size}"
        all_train_losses[key], all_val_losses[key] = train_losses, val_losses

        # Only a strictly lower validation loss replaces the current best
        if not (val_loss < best_val_loss):
            continue
        best_val_loss, best_hyperparams, best_model = val_loss, (lr, batch_size), model


# Directory for the checkpoint of the best model; created when it does not exist yet
checkpoint_path = './bert_checkpoints1/'
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

# Write the weights of the best model to disk
final_model_path = os.path.join(checkpoint_path, 'bert_best_model1.pt')
torch.save(best_model.state_dict(), final_model_path)


print(f"Best hyperparameters: Learning rate = {best_hyperparams[0]}, Batch size = {best_hyperparams[1]}")
print(f"Best model saved at: {final_model_path}")
print("Training complete!")

In [None]:
# Plot training and validation loss for the best model
# Look up the loss curves recorded for the winning hyperparameters
best_lr, best_batch_size = best_hyperparams
best_key = f"lr_{best_lr}_bs_{best_batch_size}"
best_train_losses, best_val_losses = all_train_losses[best_key], all_val_losses[best_key]

# Draw both curves over the epochs that are available for both of them
min_length = min(len(best_train_losses), len(best_val_losses))
epochs_range = range(1, min_length + 1)
fig, ax = plt.subplots(figsize=(12, 6))
for loss_curve, curve_label in ((best_train_losses, 'Train Loss'), (best_val_losses, 'Val Loss')):
    ax.plot(epochs_range, loss_curve[:min_length], label=curve_label, marker='o')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title(f'Training and Validation Losses for BERTje')
ax.legend()
ax.grid(True)

# Write the plot to disk and release the figure
fig.savefig(f'training_validation_losses_BERTje.png')
plt.close(fig)

In [None]:
# Run the optimal model and retrieve evaluation metrics and plots

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load the final model for evaluation. map_location remaps the stored tensors onto the
# device that is available now, so a checkpoint saved on a GPU also loads on a CPU-only machine.
model = BertForSequenceClassification.from_pretrained("wietsedv/bert-base-dutch-cased",problem_type="multi_label_classification",  num_labels=3)
model.load_state_dict(torch.load(final_model_path, map_location=device))
model.to(device)
model.eval()

label_names = ["labelFood", "labelService", "labelAmbiance"]
class_labels = ["Negative", "Positive"]

# Probability above which a label is predicted as positive
threshold = 0.5

# Define lists to store the true and predicted labels
true_labels = []
pred_labels = []

# Make predictions with the best model
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        # Only the model inputs have to be on the device; the labels are used on the CPU
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        logits = model(input_ids, attention_mask=attention_mask).logits.cpu()

        # Sigmoid turns the logits into one independent probability per label,
        # the threshold turns the probabilities into binary predictions
        probabilities = torch.sigmoid(logits)
        predicted_classes = (probabilities > threshold).float()

        true_labels.extend(labels.numpy())
        pred_labels.extend(predicted_classes.numpy())

# convert the true and predicted labels to numpy arrays with one column per label
true_labels_array = np.array(true_labels)
pred_labels_array = np.array(pred_labels)

# Compute overall accuracy. For multi-label input this is the subset accuracy: a review
# counts as correct only when all three labels are predicted correctly.
accuracy = accuracy_score(true_labels_array, pred_labels_array)

# Compute precision, recall, and F1-score for each label (one entry per label column)
precision, recall, fscore, support = precision_recall_fscore_support(true_labels_array, pred_labels_array, average=None)

# Print overall accuracy
print(f"Accuracy: {accuracy}")

# Print detailed metrics for each label
for i, label in enumerate(label_names):
    print(f"{label}:")
    print(f"  Precision: {precision[i]}")
    print(f"  Recall: {recall[i]}")
    print(f"  F1-score: {fscore[i]}")
    print(f"  Support: {support[i]}")


# print the classification report per label and make confusion matrices per label
for label_index, label_name in enumerate(label_names):
    # Select the column of the current label
    binary_true_labels = true_labels_array[:, label_index]
    binary_pred_labels = pred_labels_array[:, label_index]
    label_accuracy = accuracy_score(binary_true_labels, binary_pred_labels)

    # Print the classification report and accuracy for the current label
    print(f"Classification Report for {label_name}:")
    print(classification_report(binary_true_labels, binary_pred_labels, target_names=class_labels, labels = [0, 1]))
    print(f"Accuracy: {label_accuracy}\n")

    # labels=[0, 1] keeps the matrix 2x2 even when one class never occurs, so it
    # always matches the two tick labels of the heatmap
    cm = confusion_matrix(binary_true_labels, binary_pred_labels, labels=[0, 1])

    # Plot the confusion matrix for the current label
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(f'Confusion Matrix - BERTje {label_name}')
    plt.savefig(f'confusion_matrix_{label_name}.png') # Save the plot
    plt.close()

# Optimize and run BERTje with expectations

In [None]:
# Tokenize the data for BERTje with expectations
tokenizer_BERTje = BertTokenizer.from_pretrained("wietsedv/bert-base-dutch-cased") # load the tokenizer for BERTje

# The following code for tokenizing is based on the function wrap_examples(examples, tokenizer) available here: https://github.com/wietsedv/bertje/blob/master/finetuning/v1/run_110kDBRD.py
# I used parts of this function to tokenize the reviews and return them as a dataset for my train, validation, and test samples separately.

def encode_split_with_expectations(df_split, tokenizer, city_embedding, max_length = 512):
    """Turn one data split into tensors: tokens, labels, city vectors and prices.

    The whole split is processed at once instead of row by row: one batched
    tokenizer call (truncate and pad to `max_length`, special tokens, no token
    type ids) and one lookup in the city embedding. The lookup runs under
    torch.no_grad(), so the city vectors are plain data without autograd history.

    Args:
        df_split: dataframe holding "reviewText", the three label columns,
            "city_encoded" and "avgPrice".
        tokenizer: the loaded BERTje tokenizer.
        city_embedding: embedding module that maps a city code to a vector.
        max_length: number of tokens every review is truncated/padded to.

    Returns:
        (input_ids, attention_mask, labels, cities, prices). `cities` always
        has one row per review and one column per embedding dimension (the
        former per-row squeeze collapsed a one-dimensional embedding to a
        flat vector); `labels` and `prices` are float32.
    """
    encoded = tokenizer(df_split["reviewText"].tolist(), max_length = max_length, truncation = True,
                        add_special_tokens = True, padding = "max_length",
                        return_token_type_ids = False, return_tensors = "pt")
    split_labels = torch.tensor(df_split[["labelFood", "labelService", "labelAmbiance"]].values, dtype = torch.float32)
    city_codes = torch.tensor(df_split["city_encoded"].values, dtype = torch.long)
    with torch.no_grad():
        split_cities = city_embedding(city_codes)
    split_prices = torch.tensor(df_split["avgPrice"].values, dtype = torch.float32)
    return encoded["input_ids"].long(), encoded["attention_mask"].long(), split_labels, split_cities, split_prices

# Tokenize the reviews and convert the city encodings and prices to PyTorch tensors for every split
train_input_ids, train_input_masks, train_labels, train_cities, train_prices = encode_split_with_expectations(df_train, tokenizer_BERTje, embedding_city)
validation_input_ids, validation_input_masks, validation_labels, validation_cities, validation_prices = encode_split_with_expectations(df_validation, tokenizer_BERTje, embedding_city)
test_input_ids, test_input_masks, test_labels, test_cities, test_prices = encode_split_with_expectations(df_test, tokenizer_BERTje, embedding_city)

# Wrap the tensors in datasets and loaders: training batches are drawn in random order,
# validation and test batches keep the row order
batch_size = 16
train_data = TensorDataset(train_input_ids, train_input_masks, train_labels, train_cities, train_prices)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

validation_data = TensorDataset(validation_input_ids, validation_input_masks, validation_labels, validation_cities, validation_prices)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler = validation_sampler, batch_size = batch_size)

test_data = TensorDataset(test_input_ids, test_input_masks, test_labels, test_cities, test_prices)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size)

In [None]:
# Define a function to optimize the parameters for the BERTje model with expectations

# This function is based on the function train_model(model, device, train_dataloader, dev_dataloader, output_path, num_epochs=4, lr=2e-5, eps=1e-8, max_grad_norm=1.0, seed=4327)
# Retrieved from: https://github.com/wietsedv/bertje/blob/master/finetuning/v1/run_110kDBRD.py
# The function from de Vries et al. (2019) trains with a single set of hyperparameters; my function is run for every combination of a hyperparameter grid and also adds an early stopping criterion

def train_and_evaluate(lr, batch_size, epochs, train_dataset, val_dataset, patience = 1):
    """Fine-tune BERTje on the five-tensor datasets, with early stopping.

    A fresh "wietsedv/bert-base-dutch-cased" sequence classifier with three
    multi-label outputs is trained with AdamW and a BCE-with-logits loss.
    After every epoch the mean validation loss is computed; training stops as
    soon as it has not improved for `patience` consecutive epochs.

    NOTE(review): every batch also contains the city vectors and the prices,
    but they are never passed to the model. The network trained here sees
    exactly the same inputs as the text-only BERTje model -- confirm whether
    the "expectations" features were meant to reach the classifier.

    Args:
        lr: learning rate handed to AdamW.
        batch_size: batch size of the training and the validation loader.
        epochs: maximum number of epochs.
        train_dataset: dataset yielding
            (input_ids, attention_mask, labels, cities, prices).
        val_dataset: dataset with the same layout, used for early stopping.
        patience: number of epochs without improvement that ends training.

    Returns:
        (best_val_loss, model, train_losses, val_losses). `model` holds the
        weights of the epoch with the lowest validation loss; the two lists
        contain the mean loss of every epoch that was run.

    Note:
        Reads the module-level `device`, which must exist before the call.
    """
    # Initialize DataLoader with the given batch size
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

    # Initialize model
    model = BertForSequenceClassification.from_pretrained("wietsedv/bert-base-dutch-cased", problem_type="multi_label_classification", num_labels=3)
    model.to(device)

    # Define optimizer and loss. The loss is created once, up front, so the validation
    # loop never depends on the training loop having executed.
    optimizer = AdamW(model.parameters(), lr=lr)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    best_val_loss = float('inf')
    best_model_state = None
    epochs_no_improve = 0

    # Make lists to store train and validation losses
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        model.train()
        train_loss = 0 # reset training loss

        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")

        for batch in progress_bar:
            # cities and prices are unpacked but unused, so they stay on the CPU
            input_ids, attention_mask, labels, cities, prices = batch
            optimizer.zero_grad()

            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            # Labels are not passed to the model: the loss is computed right below, so
            # the model's own loss computation would only be thrown away.
            logits = model(input_ids, attention_mask=attention_mask).logits
            loss = loss_fn(logits, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_dataloader)
        train_losses.append(train_loss)

        model.eval()
        val_loss = 0 # reset validation loss

        with torch.no_grad():
            for val_batch in val_dataloader:
                val_input_ids, val_attention_mask, val_labels, val_cities, val_prices = val_batch
                val_input_ids, val_attention_mask, val_labels = val_input_ids.to(device), val_attention_mask.to(device), val_labels.to(device)

                val_logits = model(val_input_ids, attention_mask=val_attention_mask).logits

                val_loss += loss_fn(val_logits, val_labels).item()

        val_loss /= len(val_dataloader)
        val_losses.append(val_loss)

        # Insert early stopping to prevent overfitting
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands out references to the live parameters, which the next
            # optimizer steps overwrite. Store an independent CPU copy so that this
            # really remains the best epoch.
            best_model_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Put the weights of the best epoch back before handing the model to the caller
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return best_val_loss, model, train_losses, val_losses


In [None]:
# Hyperparameter grid: every learning rate is combined with every batch size
learning_rates = [1e-5, 2e-5, 3e-5, 4e-5, 5e-5]
batch_sizes = [8, 16, 32]
epochs = 10

# Best configuration found so far
best_val_loss, best_hyperparams, best_model = float('inf'), None, None

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Loss curves of every configuration, stored under the key "lr_<lr>_bs_<batch size>"
all_train_losses, all_val_losses = {}, {}

# Train and validate the BERTje model once for every grid point
for lr in learning_rates:
    for batch_size in batch_sizes:
        print(f"Training with lr={lr}, batch_size={batch_size}")
        val_loss, model, train_losses, val_losses = train_and_evaluate(lr, batch_size, epochs, train_data, validation_data)

        key = f"lr_{lr}_bs_{batch_size}"
        all_train_losses[key], all_val_losses[key] = train_losses, val_losses

        # Only a strictly lower validation loss replaces the current best
        if not (val_loss < best_val_loss):
            continue
        best_val_loss, best_hyperparams, best_model = val_loss, (lr, batch_size), model


# Directory for the checkpoint of the best model; created when it does not exist yet
checkpoint_path = './bert_checkpoints2/'
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)

# Write the weights of the best model to disk
final_model_path = os.path.join(checkpoint_path, 'bert_best_model2.pt')
torch.save(best_model.state_dict(), final_model_path)


print(f"Best hyperparameters: Learning rate = {best_hyperparams[0]}, Batch size = {best_hyperparams[1]}")
print(f"Best model saved at: {final_model_path}")
print("Training complete!")

In [None]:
# Plot training and validation loss for the best model
# Look up the loss curves recorded for the winning hyperparameters
best_lr, best_batch_size = best_hyperparams
best_key = f"lr_{best_lr}_bs_{best_batch_size}"
best_train_losses, best_val_losses = all_train_losses[best_key], all_val_losses[best_key]

# Draw both curves over the epochs that are available for both of them
min_length = min(len(best_train_losses), len(best_val_losses))
epochs_range = range(1, min_length + 1)
fig, ax = plt.subplots(figsize=(12, 6))
for loss_curve, curve_label in ((best_train_losses, 'Train Loss'), (best_val_losses, 'Val Loss')):
    ax.plot(epochs_range, loss_curve[:min_length], label=curve_label, marker='o')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title(f'Training and Validation Losses for BERTje with expectations')
ax.legend()
ax.grid(True)

# Write the plot to disk and release the figure
fig.savefig(f'training_validation_losses_BERTje_expectations.png')
plt.close(fig)


In [None]:
# Run the optimal model and retrieve evaluation metrics and plots

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load the final model for evaluation. map_location remaps the stored tensors onto the
# device that is available now, so a checkpoint saved on a GPU also loads on a CPU-only machine.
model = BertForSequenceClassification.from_pretrained("wietsedv/bert-base-dutch-cased",problem_type="multi_label_classification",  num_labels=3)
model.load_state_dict(torch.load(final_model_path, map_location=device))
model.to(device)
model.eval()

label_names = ["labelFood", "labelService", "labelAmbiance"]
class_labels = ["Negative", "Positive"]

# Probability above which a label is predicted as positive
threshold = 0.5

# Define lists to store the true and predicted labels
true_labels = []
pred_labels = []

# Make predictions with the best model
with torch.no_grad():
    for batch in test_dataloader:
        # cities and prices are part of every batch but are not inputs of this model
        input_ids, attention_mask, labels, cities, prices = batch
        # Only the model inputs have to be on the device; the labels are used on the CPU
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        logits = model(input_ids, attention_mask=attention_mask).logits.cpu()

        # Sigmoid turns the logits into one independent probability per label,
        # the threshold turns the probabilities into binary predictions
        probabilities = torch.sigmoid(logits)
        predicted_classes = (probabilities > threshold).float()

        true_labels.extend(labels.numpy())
        pred_labels.extend(predicted_classes.numpy())

# convert the true and predicted labels to numpy arrays with one column per label
true_labels_array = np.array(true_labels)
pred_labels_array = np.array(pred_labels)

# Compute overall accuracy. For multi-label input this is the subset accuracy: a review
# counts as correct only when all three labels are predicted correctly.
accuracy = accuracy_score(true_labels_array, pred_labels_array)

# Compute precision, recall, and F1-score for each label (one entry per label column)
precision, recall, fscore, support = precision_recall_fscore_support(true_labels_array, pred_labels_array, average=None)

# Print overall accuracy
print(f"Accuracy: {accuracy}")

# Print detailed metrics for each label
for i, label in enumerate(label_names):
    print(f"{label}:")
    print(f"  Precision: {precision[i]}")
    print(f"  Recall: {recall[i]}")
    print(f"  F1-score: {fscore[i]}")
    print(f"  Support: {support[i]}")


# print the classification report per label and make confusion matrices per label
for label_index, label_name in enumerate(label_names):
    # Select the column of the current label
    binary_true_labels = true_labels_array[:, label_index]
    binary_pred_labels = pred_labels_array[:, label_index]
    label_accuracy = accuracy_score(binary_true_labels, binary_pred_labels)

    # Print the classification report and accuracy for the current label
    print(f"Classification Report for {label_name} with expectations:")
    print(classification_report(binary_true_labels, binary_pred_labels, target_names=class_labels, labels = [0, 1]))
    print(f"Accuracy: {label_accuracy}\n")

    # labels=[0, 1] keeps the matrix 2x2 even when one class never occurs, so it
    # always matches the two tick labels of the heatmap
    cm = confusion_matrix(binary_true_labels, binary_pred_labels, labels=[0, 1])

    # Plot the confusion matrix for the current label
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(f'Confusion Matrix - BERTje with expectations {label_name}')
    plt.savefig(f'confusion_matrix_{label_name}_expectations.png') # Save the plot
    plt.close()