# imports

In [None]:
import torch

from transformers import  AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification

import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer

from torcheval.metrics.functional import multiclass_f1_score

import random

import numpy as np

import re

from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

from sklearn import metrics

from tqdm.notebook import tqdm_notebook

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

import optuna

# Global switch for the Optuna hyperparameter searches defined further down.
# Set it to True to run the Optuna studies; while it is False they are skipped.
use_optuna = False

# Load the models

In [None]:
# Both classifiers share the same head configuration: three labels, and
# neither attentions nor hidden states are returned by the forward pass.
_classifier_options = dict(num_labels=3, output_attentions=False, output_hidden_states=False)

# GreekBERT
tokenizer1 = AutoTokenizer.from_pretrained(
    "nlpaueb/bert-base-greek-uncased-v1",
    do_lower_case=True,
)
model1 = AutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/bert-base-greek-uncased-v1",
    **_classifier_options,
)

# DistilGREEK-BERT
tokenizer2 = AutoTokenizer.from_pretrained(
    "EftychiaKarav/DistilGREEK-BERT",
    do_lower_case=True,
)
model2 = AutoModelForSequenceClassification.from_pretrained(
    "EftychiaKarav/DistilGREEK-BERT",
    **_classifier_options,
)

# Choose Device and Set Seed for Reproducibility

In [None]:
# Seed every random number generator used in this notebook with the same value
seed_val = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

# Train on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read Data

In [None]:
# Load the three splits
TrainSet = pd.read_csv('/kaggle/input/ys19-2023-assignment-4a/train_set.csv')
ValidationSet = pd.read_csv('/kaggle/input/ys19-2023-assignment-4a/valid_set.csv')
TestSet = pd.read_csv('/kaggle/input/ys19-2023-assignment-4a/test_set.csv')


def _build_text_features(frame, unused_columns):
    """Drop *unused_columns*, append each row's Party to its Text, then drop Party."""
    features = frame.drop(unused_columns, axis=1)
    features["Text"] = features["Text"] + " " + features["Party"]
    return features.drop(["Party"], axis=1)


# Sentiment and New_ID are removed from the labelled splits; only New_ID is
# removed from the test split.
Xtrain = _build_text_features(TrainSet, ["Sentiment", "New_ID"])
Xval = _build_text_features(ValidationSet, ["Sentiment", "New_ID"])
Xtest = _build_text_features(TestSet, ["New_ID"])

# Word Cloud before Preprocessing

In [None]:
# Join all training texts into a single space-separated string
column_data = ' '.join(Xtrain['Text'].tolist())

# Build the word cloud from the text as it is before cleaning
wordcloud = WordCloud(width=1000, height=500, background_color='white')
wordcloud = wordcloud.generate(column_data)

# Show the word cloud without axes
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Tokens Frequency Diagram before Preprocessing

In [None]:
# Token frequency diagram before cleaning.
# The 'punkt' resource is downloaded first because word_tokenize is used below.
nltk.download('punkt')

# One token list per training text, flattened into a single array for counting
tokens = [word_tokenize(text, language='greek') for text in Xtrain['Text']]
_all_tokens = np.hstack(tokens)
token_frequency = FreqDist(_all_tokens)

# Plot the 60 most frequent tokens (non-cumulative counts)
plt.figure(figsize=(15, 7))
token_frequency.plot(60, cumulative=False)

# 

# Pre-process the data

In [None]:
# Pre-processing
def remove_emoji(string):
    """Return *string* with every run of characters matched by the pattern removed.

    The pattern is one character class assembled from the Unicode ranges
    below. Several ranges are far wider than emoji alone (for example
    U+24C2-U+1F251 and U+10000-U+10FFFF), so any other character that falls
    inside them is stripped as well.
    """
    character_class = (
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # U+2500 through U+2BEF
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"  # same range listed twice in the pattern
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # every code point above the BMP
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"  # zero width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector-16
        u"\u3030"
        "]+"
    )
    matcher = re.compile(character_class, flags=re.UNICODE)
    return matcher.sub('', string)

def remove_links(text):
    """Return *text* with every ``http://``, ``https://`` or ``www.`` URL removed.

    A URL extends up to (not including) the next whitespace character.
    """
    return re.compile(r'https?://\S+|www\.\S+').sub('', text)

# Clean the Text column of every split with the same three steps, in order:
# emojis, then links, then @mentions (an '@' followed by non-whitespace).
for _split in (Xtrain, Xval, Xtest):
    _split['Text'] = _split['Text'].map(remove_emoji)
    _split['Text'] = _split['Text'].map(remove_links)
    _split['Text'] = _split['Text'].map(lambda x: re.sub(r'@\S+', '', x))

# Word Cloud after Preprocessing

In [None]:
# Join all cleaned training texts into a single space-separated string
column_data = ' '.join(Xtrain['Text'].tolist())

# Build the word cloud from the text as it is after cleaning
wordcloud = WordCloud(width=1000, height=500, background_color='white')
wordcloud = wordcloud.generate(column_data)

# Show the word cloud without axes
plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Tokens Frequency Diagram after Preprocessing

In [None]:
# Token frequency diagram after cleaning:
# one token list per training text, flattened into a single array for counting
tokens = [word_tokenize(text, language='greek') for text in Xtrain['Text']]
_all_tokens = np.hstack(tokens)
token_frequency = FreqDist(_all_tokens)

# Plot the 60 most frequent tokens (non-cumulative counts)
plt.figure(figsize=(15, 7))
token_frequency.plot(60, cumulative=False)

# Encode Categorical Labels Into One-Hot Encoded Vectors

In [None]:
# Fit the label binarizer on the training sentiments only, then use it to
# encode the sentiments of both labelled splits
encoder = LabelBinarizer()
encoder.fit(TrainSet.Sentiment.values)
Ytrain = encoder.transform(TrainSet.Sentiment.values)
Yval = encoder.transform(ValidationSet.Sentiment.values)

# From here on the feature sets are plain arrays of texts
Xtrain, Xval, Xtest = (frame.Text.values for frame in (Xtrain, Xval, Xtest))

# Tokenize the Datasets

In [None]:
# First we need to find the maximum length of the sentences
def get_max_len(Xtrain, Xval, Xtest, tokenizer):
    """Return the largest tokenized length found in the three datasets.

    Every text is encoded with ``tokenizer.encode(text, add_special_tokens=True)``
    and the number of resulting ids is measured. The overall maximum is printed
    and returned; a dataset without texts contributes a length of 0.
    """
    def longest(texts):
        # Length of the longest encoded text in one dataset
        return max(
            (len(tokenizer.encode(sent, add_special_tokens=True)) for sent in texts),
            default=0,
        )

    max_len = max(longest(Xtrain), longest(Xval), longest(Xtest))

    print('Max sentence length: ', max_len)
    return max_len

# The longest tokenized length is measured separately with each model's tokenizer
print("GreekBERT...")
max_len1 = get_max_len(Xtrain, Xval, Xtest, tokenizer=tokenizer1)

print("DistilGREEK-BERT...")
max_len2 = get_max_len(Xtrain, Xval, Xtest, tokenizer=tokenizer2)

In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.
def tokenize_and_map(tokenizer, max_len, X, Y = None):
    """Tokenize the texts in *X* into fixed-length id and attention-mask tensors.

    Args:
        tokenizer: Hugging Face tokenizer; it is called once on the whole list
            of texts.
        max_len: every sequence is truncated or padded to exactly this length.
        X: iterable of texts.
        Y: optional labels, either array-like or already a tensor.

    Returns:
        ``(input_ids, attention_masks, Y)``. The first two are the tensors
        produced by the tokenizer (one row per text). ``Y`` is a float32
        tensor copy of the labels, or ``None`` when no labels were given.
    """
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
    # and calling the tokenizer directly replaces the per-text `encode_plus` loop.
    encoded = tokenizer(
        list(X),
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    if Y is not None:
        if isinstance(Y, torch.Tensor):
            # Labels that are already a tensor (this file passes the same
            # labels through twice) are copied without the copy-construct
            # warning that torch.tensor(tensor) emits.
            Y = Y.detach().clone().to(torch.float32)
        else:
            Y = torch.tensor(Y, dtype=torch.float32)

    return input_ids, attention_masks, Y


# GreekBERT inputs, built with tokenizer1 and max_len1.
# Ytrain and Yval are rebound here to the label tensors returned by tokenize_and_map.
Xtrain_ids, Xtrain_attention, Ytrain = tokenize_and_map(tokenizer1, max_len1, Xtrain, Y=Ytrain)
Xval_ids, Xval_attention, Yval = tokenize_and_map(tokenizer1, max_len1, Xval, Y=Yval)
Xtest_ids, Xtest_attention, _ = tokenize_and_map(tokenizer1, max_len1, Xtest)

# DistilGREEK-BERT inputs, built with tokenizer2 and max_len2.
# The labels passed in are the tensors produced just above.
Xtrain_ids_distil, Xtrain_attention_distil, Ytrain_distil = tokenize_and_map(tokenizer2, max_len2, Xtrain, Y=Ytrain)
Xval_ids_distil, Xval_attention_distil, Yval_distil = tokenize_and_map(tokenizer2, max_len2, Xval, Y=Yval)
Xtest_ids_distil, Xtest_attention_distil, _ = tokenize_and_map(tokenizer2, max_len2, Xtest)

# Helper Functions

In [None]:
# Plot the ROC curves and the confusion matrices
def plot_roc_curve_confusion_matrix(predicted_probabilities, Yval):
    """Plot one-vs-rest ROC curves and a row-normalized confusion matrix.

    The module-level ``encoder`` (the fitted label binarizer) is used to map
    class names to column indices and column indices back to class names.

    Args:
        predicted_probabilities: tensor of per-class scores, one row per
            validation sample. Despite the name, the callers in this file
            pass the raw logits returned by ``train``.
        Yval: one-hot encoded true labels in the same row order, given either
            as a tensor (on any device) or as an array.
    """
    classes_of_interest = ["POSITIVE", "NEGATIVE", "NEUTRAL"]
    class_ids = [np.flatnonzero(encoder.classes_ == class_of_interest)[0] for class_of_interest in classes_of_interest]

    # Convert to NumPy once, so scikit-learn never receives torch tensors
    # (which fails outright for tensors that live on the GPU).
    scores = predicted_probabilities.detach().cpu().numpy()
    if isinstance(Yval, torch.Tensor):
        true_onehot = Yval.detach().cpu().numpy()
    else:
        true_onehot = np.asarray(Yval)

    fig, ax = plt.subplots(figsize=(6, 6))
    for class_name, class_id in zip(classes_of_interest, class_ids):
        RocCurveDisplay.from_predictions(
            true_onehot[:, class_id],
            scores[:, class_id],
            name=f"{class_name} vs the rest",
            color="blue" if class_id == 0 else "green" if class_id == 1 else "red",
            ax=ax,
        )
    plt.show()

    # Confusion matrix on the original label names: the true labels are
    # decoded by the encoder, the predictions are the highest-scoring column
    # looked up in encoder.classes_ (no hard-coded number of classes).
    predicted_ids = torch.argmax(predicted_probabilities, dim=1).detach().cpu().numpy()
    Yval_init = encoder.inverse_transform(true_onehot)
    Ypred_init = encoder.classes_[predicted_ids]

    ConfusionMatrixDisplay.from_predictions(Yval_init, Ypred_init, normalize="true", cmap=plt.cm.YlOrRd)
    plt.show()

In [None]:
# Plot the learning curves
def plot_learning_curves(list_of_train_losses, list_of_valid_losses, list_of_f1_scores_train, list_of_f1_scores_valid):
    """Show two figures: train/valid F1 per epoch, then train/valid loss per epoch.

    Each list holds one value per epoch; the x axis gets one tick per entry
    of the training list.
    """
    def draw_pair(train_series, valid_series):
        # Draw the train and valid curves on the current figure
        for series in (train_series, valid_series):
            plt.plot(series)
        plt.xticks(np.arange(0, len(train_series), step=1))
        plt.xlabel("Epochs")

    # F1 figure; its y axis is fixed to the range [0, 1]
    draw_pair(list_of_f1_scores_train, list_of_f1_scores_valid)
    plt.ylabel("F1 scores", fontweight='bold')
    plt.ylim(ymin=0.0, ymax=1.0)
    plt.legend(["Train", "Valid"])
    plt.show()

    # Loss figure
    draw_pair(list_of_train_losses, list_of_valid_losses)
    plt.ylabel("Losses")
    plt.legend(["Train", "Valid"])
    plt.show()

# Train the model

In [None]:
def train(model, train_dataloader, validation_dataloader, optimizer, scheduler, epochs = 3, first_model = True, loss_func = torch.nn.CrossEntropyLoss):
    """Fine-tune *model* and evaluate it on the validation set after every epoch.

    Uses the module-level ``device`` and plots the learning curves with
    ``plot_learning_curves`` once all epochs are done.

    Args:
        model: model whose forward call returns an object with ``.logits``.
        train_dataloader: yields ``(input_ids, attention_mask, labels)``
            batches; the labels are one-hot rows (their argmax is the class).
        validation_dataloader: same batch layout, used for evaluation.
        optimizer: stepped after every training batch.
        scheduler: learning-rate scheduler, stepped after every training batch.
        epochs: number of passes over the training data (at least 1).
        first_model: when True the model is additionally called with
            ``token_type_ids=None`` (GreekBERT in this file); when False it is
            called with the input ids and attention mask only
            (DistilGREEK-BERT in this file).
        loss_func: loss class (not an instance); it is instantiated without
            arguments.

    Returns:
        ``(predicted_probabilities, f1_valid)``: the validation logits of the
        last epoch, concatenated in dataloader order and left on ``device``,
        and the micro-averaged validation F1 score of the last epoch.

    Raises:
        ValueError: if ``epochs`` is smaller than 1 (there would be nothing
            to return).
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    list_of_train_losses = []
    list_of_valid_losses = []
    list_of_f1_scores_train = []
    list_of_f1_scores_valid = []

    # Tell pytorch in which device to train
    model = model.to(device)

    loss_func = loss_func()

    # Extra keyword arguments of the forward call; see `first_model` above
    extra_inputs = {"token_type_ids": None} if first_model else {}

    for epoch_i in range(epochs):
        # Per-batch results are collected in lists and concatenated once per
        # epoch instead of re-concatenating a growing tensor on every batch.
        train_predictions = []
        train_true_labels = []
        total_train_loss = 0

        # Put the model into training mode
        model.train()

        for batch in tqdm_notebook(train_dataloader, desc="Training: Epoch " + str(epoch_i + 1)):
            # Unpack this training batch from our dataloader.
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()

            # Forward pass on this training batch
            logits = model(b_input_ids, attention_mask=b_input_mask, **extra_inputs).logits

            # Compute the loss for this batch.
            loss = loss_func(logits, b_labels)
            total_train_loss += loss.item()

            loss.backward()

            # Clip the gradient norm to 1.0 before the optimizer step
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            # Update the learning rate.
            scheduler.step()

            train_true_labels.append(torch.argmax(b_labels, dim=1))
            train_predictions.append(torch.argmax(logits, dim=1))

        # F1 score for the training set; torcheval expects (input, target),
        # i.e. the predictions first and the true labels second.
        f1_train = multiclass_f1_score(
            torch.cat(train_predictions),
            torch.cat(train_true_labels),
            average='micro',
        ).cpu().tolist()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  F1 Training Score: {0:.2f}".format(f1_train))

        # Put the model into evaluation mode
        model.eval()

        val_logits = []
        val_true_labels = []
        total_eval_loss = 0

        # Evaluate data for one epoch
        for batch in tqdm_notebook(validation_dataloader, desc="Validation: Epoch " + str(epoch_i + 1)):
            # Unpack this validation batch from our dataloader.
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # No gradients are needed for evaluation
            with torch.no_grad():
                logits = model(b_input_ids, attention_mask=b_input_mask, **extra_inputs).logits
                loss = loss_func(logits, b_labels)

            total_eval_loss += loss.item()

            val_logits.append(logits)
            val_true_labels.append(torch.argmax(b_labels, dim=1))

        # Validation logits of this epoch, in dataloader order
        predicted_probabilities = torch.cat(val_logits)

        # F1 score for the validation set, again as (input, target)
        f1_valid = multiclass_f1_score(
            torch.argmax(predicted_probabilities, dim=1),
            torch.cat(val_true_labels),
            average='micro',
        ).cpu().tolist()

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  F1 Validatation Score: {0:.2f}".format(f1_valid))

        # Store the values for plotting the learning curves
        list_of_train_losses.append(avg_train_loss)
        list_of_valid_losses.append(avg_val_loss)
        list_of_f1_scores_train.append(f1_train)
        list_of_f1_scores_valid.append(f1_valid)

    # plot the learning curves
    plot_learning_curves(list_of_train_losses, list_of_valid_losses, list_of_f1_scores_train, list_of_f1_scores_valid)

    return predicted_probabilities, f1_valid

# First Model: GreekBERT Model

In [None]:
# Hyperparameters used for fine-tuning GreekBERT
batch_size = 32
epochs = 2

# Wrap the GreekBERT tensors in datasets
train_dataset = torch.utils.data.TensorDataset(Xtrain_ids, Xtrain_attention, Ytrain)
val_dataset = torch.utils.data.TensorDataset(Xval_ids, Xval_attention, Yval)

# Training batches are drawn in random order, validation batches in dataset order
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Optimizer over all parameters of model1
optimizer = AdamW(model1.parameters(), lr=3e-5, eps=1e-6)

# The scheduler is stepped once per batch, so the total number of steps is
# the number of batches per epoch times the number of epochs
steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=steps)

# first_model=True selects the GreekBERT forward call inside train()
first_model = True
predicted_probabilities, f1_valid = train(
    model1,
    train_dataloader,
    validation_dataloader,
    optimizer,
    scheduler,
    epochs,
    first_model=first_model,
)

# ROC curves and confusion matrix for the validation set
plot_roc_curve_confusion_matrix(predicted_probabilities, Yval)

# Predict the Test Set

In [None]:
# Predict the TestSet
# train() moves the model to `device`, so the test tensors are moved to the
# device the model actually lives on. The test set is fed in batches rather
# than in a single forward pass to keep memory usage bounded.
test_batch_size = 32
model1.eval()
model_device = next(model1.parameters()).device
test_logits = []
with torch.no_grad():
    for start in range(0, Xtest_ids.shape[0], test_batch_size):
        stop = start + test_batch_size
        result = model1(
            Xtest_ids[start:stop].to(model_device),
            token_type_ids = None,
            attention_mask = Xtest_attention[start:stop].to(model_device),
        )
        # Keep the logits on the CPU for the NumPy conversion below
        test_logits.append(result.logits.cpu())

# Convert it to the right form: highest-scoring column -> original label name
Ytest_predict = torch.argmax(torch.cat(test_logits), dim=1)
Ytest_pred_initial = encoder.classes_[Ytest_predict.numpy()]

# Write the submission file with one predicted label per test id
submission_df = pd.DataFrame({"Id": TestSet["New_ID"], "Predicted":Ytest_pred_initial})
submission_df.to_csv('submission.csv', index=False)

# Optuna Framework

In [None]:
# Optuna objective for the GreekBERT model: searches over batch size,
# learning rate and number of epochs.
# Turn the use_optuna variable to True if you want to use it
trial_number = -1

def optimize_hyperparameters(trial):
    """Train a freshly loaded GreekBERT with the trial's hyperparameters.

    Returns the validation F1 score of the last epoch, which the study maximizes.
    """
    # Load the pretrained model again so every trial starts from the same weights
    model = AutoModelForSequenceClassification.from_pretrained(
        'nlpaueb/bert-base-greek-uncased-v1',
        num_labels=3,
        output_attentions=False,
        output_hidden_states=False,
    )

    # Hyperparameters suggested for this trial
    trial_batch_size = trial.suggest_categorical('batch_size', [16, 32])
    trial_lr = trial.suggest_categorical('lr', [2e-5, 3e-5, 5e-5])
    trial_epochs = trial.suggest_categorical('epochs', [2, 3, 4])

    # Training batches in random order, validation batches in dataset order
    train_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(Xtrain_ids, Xtrain_attention, Ytrain),
        batch_size=trial_batch_size,
        shuffle=True,
    )
    valid_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(Xval_ids, Xval_attention, Yval),
        batch_size=trial_batch_size,
        shuffle=False,
    )

    # Optimizer plus a linear schedule covering every training step
    trial_optimizer = AdamW(model.parameters(), lr=trial_lr, eps=1e-6)
    total_steps = trial_epochs * len(train_loader)
    trial_scheduler = get_linear_schedule_with_warmup(
        trial_optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    # first_model=True selects the GreekBERT forward call inside train()
    _, f1_valid = train(
        model,
        train_loader,
        valid_loader,
        trial_optimizer,
        trial_scheduler,
        trial_epochs,
        first_model=True,
    )

    return f1_valid
    
# Run the GreekBERT study only when the use_optuna switch is on;
# otherwise `study` stays None
study = None
if use_optuna:
    study = optuna.create_study(direction="maximize")
    study.optimize(optimize_hyperparameters, n_trials=15)

    # Visualize parameter importances and per-parameter slices
    optuna.visualization.matplotlib.plot_param_importances(study)
    optuna.visualization.matplotlib.plot_slice(study)


# Second Model: DistilGREEK-BERT Model

In [None]:
# Hyperparameters used for fine-tuning DistilGREEK-BERT
batch_size = 16
epochs = 4

# Wrap the DistilGREEK-BERT tensors in datasets
train_dataset = torch.utils.data.TensorDataset(Xtrain_ids_distil, Xtrain_attention_distil, Ytrain_distil)
val_dataset = torch.utils.data.TensorDataset(Xval_ids_distil, Xval_attention_distil, Yval_distil)

# Training batches are drawn in random order, validation batches in dataset order
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Optimizer over all parameters of model2
optimizer = AdamW(model2.parameters(), lr=5e-5, eps=1e-6)

# The scheduler is stepped once per batch, so the total number of steps is
# the number of batches per epoch times the number of epochs
steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=steps)

# first_model=False selects the DistilGREEK-BERT forward call inside train()
first_model = False
predicted_probabilities, f1_valid = train(
    model2,
    train_dataloader,
    validation_dataloader,
    optimizer,
    scheduler,
    epochs,
    first_model=first_model,
)

# ROC curves and confusion matrix for the validation set
plot_roc_curve_confusion_matrix(predicted_probabilities, Yval_distil)

# Predict the Test Set

In [None]:
# Predict the TestSet
# The predictions must come from model2 (DistilGREEK-BERT): the inputs below
# were tokenized with its tokenizer. train() moves the model to `device`, so
# the test tensors are moved to the device the model actually lives on, and
# the test set is fed in batches to keep memory usage bounded.
test_batch_size = 32
model2.eval()
model_device = next(model2.parameters()).device
test_logits = []
with torch.no_grad():
    for start in range(0, Xtest_ids_distil.shape[0], test_batch_size):
        stop = start + test_batch_size
        result = model2(
            Xtest_ids_distil[start:stop].to(model_device),
            attention_mask = Xtest_attention_distil[start:stop].to(model_device),
        )
        # Keep the logits on the CPU for the NumPy conversion below
        test_logits.append(result.logits.cpu())

# Convert it to the right form: highest-scoring column -> original label name
Ytest_predict = torch.argmax(torch.cat(test_logits), dim=1)
Ytest_pred_initial = encoder.classes_[Ytest_predict.numpy()]

# Write the submission file with one predicted label per test id
submission_distil_df = pd.DataFrame({"Id": TestSet["New_ID"], "Predicted":Ytest_pred_initial})
submission_distil_df.to_csv('submission_distil.csv', index=False)

# Optuna Framework

In [None]:
# Here is implemented the Optuna framework in order to find the best hyperparameters for learning rate, batch size and epochs for the DistilGREEK-BERT model
# Turn the use_optuna variable to True if you want to use it
trial_number = -1

def optimize_hyperparameters(trial):
    """Optuna objective for DistilGREEK-BERT.

    Trains a freshly loaded model with the batch size, learning rate and
    number of epochs suggested by *trial*.

    Returns:
        The validation F1 score of the last epoch; the study maximizes it.
    """
    # Define the model again in order to reset the weights
    model2 = AutoModelForSequenceClassification.from_pretrained("EftychiaKarav/DistilGREEK-BERT", num_labels=3, output_attentions=False, output_hidden_states=False)

    # suggest the batch size
    batch_size = trial.suggest_categorical('batch_size', [16, 32])

    # suggest the learning rate
    lr = trial.suggest_categorical('lr', [2e-5, 3e-5, 5e-5])

    # suggest the number of epochs
    epochs = trial.suggest_categorical('epochs', [2, 3, 4])

    # Create the datasets and dataloaders
    train_dataset = torch.utils.data.TensorDataset(Xtrain_ids_distil, Xtrain_attention_distil, Ytrain_distil)
    val_dataset = torch.utils.data.TensorDataset(Xval_ids_distil, Xval_attention_distil, Yval_distil)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, sampler = torch.utils.data.RandomSampler(train_dataset), batch_size = batch_size)
    validation_dataloader = torch.utils.data.DataLoader(val_dataset, sampler = torch.utils.data.SequentialSampler(val_dataset), batch_size = batch_size)

    # Define the optimizer
    optimizer = AdamW(model2.parameters(), lr = lr, eps = 1e-6)

    steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = steps)

    # first_model=False selects the DistilGREEK-BERT forward call inside train()
    first_model = False
    predicted_probabilities, f1_valid = train(model2, train_dataloader, validation_dataloader, optimizer, scheduler, epochs, first_model = first_model)

    # The objective has to return the value to maximize; without this return
    # the study would receive None for every trial.
    return f1_valid

# Run the DistilGREEK-BERT study only when the use_optuna switch is on;
# otherwise `study` stays None
study = None
if use_optuna:
    study = optuna.create_study(direction="maximize")
    study.optimize(optimize_hyperparameters, n_trials=15)

    # Visualize parameter importances and per-parameter slices
    optuna.visualization.matplotlib.plot_param_importances(study)
    optuna.visualization.matplotlib.plot_slice(study)