In [None]:
# The code in this notebook was run on an external GPU in a Python file. 

In [None]:
# Importing the needed packages
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import torch
from transformers import BertTokenizerFast as BertTokenizer

In [None]:
# link to the dataset used for this thesis
# https://www.kaggle.com/datasets/dkapitan/dutch-restaurant-reviews

In [None]:
# Read dataset and transfer it to a dataframe
# Resolve the location of the parquet file in this order:
#   1. a `file_path` variable that already exists in the session (e.g. set by a calling script),
#   2. the REVIEWS_PARQUET_PATH environment variable,
#   3. the placeholder below (edit it to point at the downloaded Kaggle file).
# A plain `file_path = file_path` fails with a NameError on a fresh kernel.
default_reviews_path = "dutch_restaurant_reviews.parquet" # placeholder file name, adjust to the local copy
file_path = globals().get("file_path") or os.environ.get("REVIEWS_PARQUET_PATH", default_reviews_path)
df_raw_data = pd.read_parquet(file_path) # load the data

In [None]:
# Quick overview of the raw dataset
first_rows = df_raw_data.head(5) # the first five rows of the data
row_count = len(df_raw_data) # the number of rows of the dataframe
print(first_rows)
print(row_count)

In [None]:
# Necessary preprocessing steps for thesis
df_pp_data = df_raw_data.copy() # work on a copy so the raw dataframe stays untouched

# Remove the unknown characters in the avgPrice column (presumably a mis-encoded currency symbol).
# regex = False states explicitly that these are literal substrings, independent of the
# default that the installed pandas version uses for str.replace.
df_pp_data["avgPrice"] = df_pp_data["avgPrice"].str.replace("â\u0082¬","", regex = False)
df_pp_data["avgPrice"] = df_pp_data["avgPrice"].str.replace("\u0080","", regex = False)

# Convert the necessary numeric columns to numeric (values that cannot be parsed become NaN)
columns_to_convert = ["scoreTotal", "avgPrice", "reviewerNumReviews", "reviewScoreOverall", "reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance"]
df_pp_data[columns_to_convert] = df_pp_data[columns_to_convert].apply(pd.to_numeric, errors = "coerce")

# Remove reviews that are not positive and not negative:
# a review is dropped when any of its three aspect scores equals 5 or 6
columns_to_change = ["reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance"]
has_neutral_score = df_pp_data[columns_to_change].isin([5, 6]).any(axis = 1)
df_pp_data = df_pp_data[~has_neutral_score]

# change the personal review scores to sentiments
bins = [1,6,10]
labels = ["Negative", "Positive"]
for column in columns_to_change:
    # pd.cut builds right-closed intervals, so without include_lowest the first bin is (1, 6]
    # and a score of exactly 1 would turn into NaN (and be dropped later on).
    # include_lowest = True makes the first bin [1, 6], so the lowest score counts as "Negative".
    df_pp_data[column] = pd.cut(df_pp_data[column], bins = bins, labels = labels, include_lowest = True)

In [None]:
# change the personal review scores to sentiments
# This step may already have been applied to these columns. Binning the resulting
# "Negative"/"Positive" labels a second time is not meaningful (the bin edges are numbers),
# so only columns that still hold numeric scores are converted here. This keeps the cell
# safe to run after the preprocessing cell and safe to re-run.
columns_to_change = ["reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance"]
bins = [1,6,10]
labels = ["Negative", "Positive"]
for column in columns_to_change:
    if pd.api.types.is_numeric_dtype(df_pp_data[column]):
        # include_lowest = True keeps a score of exactly 1 inside the first ("Negative") bin
        df_pp_data[column] = pd.cut(df_pp_data[column], bins = bins, labels = labels, include_lowest = True)

# Extract the city name from the address
def extract_second_to_last_element(address): # Function to extract the second to last element from the address
    """Return the second-to-last whitespace-separated token of ``address``.

    Parameters
    ----------
    address : str or any
        The address text. Values that are not strings (for example None or a
        NaN from a missing cell) are treated as "no address".

    Returns
    -------
    str or None
        The second-to-last token, or None when ``address`` is not a string or
        contains fewer than two tokens.
    """
    # Missing cells reach this function as None/NaN when used with Series.apply;
    # they have no .split(), so report them as "no result" instead of raising.
    if not isinstance(address, str):
        return None

    address_components = address.split()
    if len(address_components) < 2:
        return None
    return address_components[-2]

# Build the City feature by applying the extraction function to every address
city_per_review = df_pp_data["address"].map(extract_second_to_last_element)
df_pp_data["City"] = city_per_review

# Number of reviews per extracted city value
unique_values_city = df_pp_data["City"].value_counts()

In [None]:
# Drop every row that has a missing value in one of the important variables
important_features = ["restoId", "avgPrice", "reviewScoreOverall", "reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance", "reviewText", "City"]
df_nomissing = df_pp_data.dropna(subset = important_features)

# Add the number of characters of each review as a new feature
df_nomissing["reviewLength"] = df_nomissing["reviewText"].map(len)

descriptives_length = df_nomissing["reviewLength"].describe()

# Keep only reviews of 20 up to and including 2000 characters (shorter and longer reviews are removed)
length_ok = df_nomissing["reviewLength"].between(20, 2000)
df_lessreviews = df_nomissing[length_ok]

# Randomly remove a fixed number of reviews whose three sentiments are all positive.
# This makes the dataset less imbalanced (it stays imbalanced) and reduces the
# amount of data, which would otherwise be too computationally expensive.
sentiment_columns = ["reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance"]
df_sentiment = df_lessreviews[sentiment_columns]

positive_rows = (df_sentiment == "Positive").all(axis = 1) # True for reviews with only positive sentiments

all_positive_reviews = df_lessreviews[positive_rows]
sampled_positive_rows = all_positive_reviews.sample(n = 232830, random_state = 68)

df_lessreviews = df_lessreviews.drop(sampled_positive_rows.index)

# Count the positive and negative reviews that are left per aspect in the reduced dataframe
counts_food = df_lessreviews["reviewScoreFood"].value_counts()
counts_service = df_lessreviews["reviewScoreService"].value_counts()
counts_ambiance = df_lessreviews["reviewScoreAmbiance"].value_counts()

for aspect_counts in (counts_food, counts_service, counts_ambiance):
    print(aspect_counts)



In [None]:
# Encode the sentiments into labels (negative = 0, positive = 1)
class_mapping = {"Negative": 0, "Positive": 1}

# Map each sentiment column onto its numeric label column.
# A plain dictionary lookup with Series.map does the whole job, so there is no need to
# overwrite the transform method of a scikit-learn LabelEncoder with a custom function.
sentiment_to_label_column = {
    "reviewScoreFood": "labelFood",
    "reviewScoreService": "labelService",
    "reviewScoreAmbiance": "labelAmbiance",
}
for sentiment_column, label_column in sentiment_to_label_column.items():
    # The cast to int64 stores the labels as plain integers even when the sentiment column
    # is categorical, and it raises if a sentiment is missing or not in class_mapping.
    df_lessreviews[label_column] = df_lessreviews[sentiment_column].map(class_mapping).astype("int64")
df_lessreviews.head()

In [None]:
# Divide the data into a train (80%), validation (10%), and test (10%) set
df_final = df_lessreviews
split_seed = 68 # same seed for both splits

# Step 1: hold out 20% of the data as a temporary set, the rest is the training set
df_train, df_temp = train_test_split(df_final, test_size = 0.2, random_state = split_seed)

# Step 2: cut the temporary set in half to obtain the validation and the test set
df_validation, df_test = train_test_split(df_temp, test_size = 0.5, random_state = split_seed)

# Report the size of each set
n_train, n_validation, n_test = len(df_train), len(df_validation), len(df_test)
print("Training set samples:", n_train)
print("Validation set samples:", n_validation)
print("Test set samples:", n_test)

# Use the first GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Tokenize the data for BERTje
tokenizer_BERTje = BertTokenizer.from_pretrained("wietsedv/bert-base-dutch-cased") # load the tokenizer for BERTje

# The following code for tokenizing is based on the function wrap_examples(examples, tokenizer) available here: https://github.com/wietsedv/bertje/blob/master/finetuning/v1/run_110kDBRD.py
# I used parts of this function to tokenize the reviews and return them as a dataset for my train, validation, and test samples separately.

def tokenize_reviews(df_split, tokenizer, max_length = 512):
    """Tokenize the reviews of one data split and collect its labels.

    Parameters
    ----------
    df_split : pandas.DataFrame
        Dataframe with a "reviewText" column and the label columns
        "labelFood", "labelService" and "labelAmbiance".
    tokenizer : callable
        The tokenizer to apply to the review texts.
    max_length : int, default 512
        Every review is truncated or padded to exactly this number of tokens.

    Returns
    -------
    tuple of torch.Tensor
        (input_ids, attention_masks, labels): the first two as torch.long tensors,
        the labels as a torch.float32 tensor with one column per label.
    """
    # One batched tokenizer call replaces a Python loop with one call per review
    encoded = tokenizer(df_split["reviewText"].tolist(), max_length = max_length, truncation = True,
                        add_special_tokens = True, padding = "max_length",
                        return_token_type_ids = False, return_tensors = "pt")
    input_ids = encoded["input_ids"].to(torch.long)
    input_masks = encoded["attention_mask"].to(torch.long)
    labels = torch.tensor(df_split[["labelFood", "labelService", "labelAmbiance"]].values, dtype = torch.float32)
    return input_ids, input_masks, labels

# Tokenize the reviews in the training, validation, and test set
train_input_ids, train_input_masks, train_labels = tokenize_reviews(df_train, tokenizer_BERTje)
validation_input_ids, validation_input_masks, validation_labels = tokenize_reviews(df_validation, tokenizer_BERTje)
test_input_ids, test_input_masks, test_labels = tokenize_reviews(df_test, tokenizer_BERTje)

batch_size = 16

# The training batches are drawn in random order
train_data = TensorDataset(train_input_ids, train_input_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

# The validation and test batches keep the row order of their dataframes
validation_data = TensorDataset(validation_input_ids, validation_input_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler = validation_sampler, batch_size = batch_size)

test_data = TensorDataset(test_input_ids, test_input_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size)

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Retrieve the model (In this case the BERTje model without expectations):
# build the pretrained architecture with three output labels and load the saved weights into it
checkpoint_path = './bert_checkpoints1/'
if not os.path.exists(checkpoint_path):
    os.makedirs(checkpoint_path)
final_model_path = os.path.join(checkpoint_path, 'bert_best_model1.pt')
model = BertForSequenceClassification.from_pretrained("wietsedv/bert-base-dutch-cased",problem_type="multi_label_classification",  num_labels=3)
final_model = torch.load(final_model_path, map_location=torch.device("cpu")) # the weights are read onto the CPU first
model.load_state_dict(final_model)
model.to(device)
model.eval() # evaluation mode

# Collect the true labels, the predicted labels, and the reviews with a wrong prediction
true_labels, pred_labels, wrong_pred = [], [], []

# A probability above this value counts as a positive prediction
threshold = 0.5

with torch.no_grad():
    for i, (input_ids, attention_mask, labels) in enumerate(test_dataloader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits.cpu()

        # Sigmoid gives one probability per label, the threshold turns it into a 0/1 prediction
        probabilities = torch.sigmoid(logits)
        predicted_classes = (probabilities > threshold).float()

        batch_true = labels.cpu().numpy()
        batch_pred = predicted_classes.cpu().numpy()
        true_labels.extend(batch_true)
        pred_labels.extend(batch_pred)

        # The test dataloader is sequential, so batch i starts at row i * batch_size of df_test
        first_row = i * batch_size
        for idx, (true_label, pred_label) in enumerate(zip(batch_true, batch_pred)):
            # A review is a wrong prediction as soon as one of its labels differs
            if (true_label != pred_label).any():
                review_text = df_test.iloc[first_row + idx]["reviewText"]
                wrong_pred.append([review_text, true_label, pred_label])

In [None]:
# Create a dataframe with the wrong predictions
df_wrong_pred = pd.DataFrame(wrong_pred, columns=["reviewText", "trueLabel", "predictedlabel"])

# Get the directory of the current script.
# __file__ only exists when this code runs as a Python file; inside a notebook kernel it is
# not defined, so fall back to the current working directory there.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

# Save the dataframe to a csv file
csv_path = os.path.join(script_dir, "wrong_predictions.csv")
df_wrong_pred.to_csv(csv_path, index=False)