In [None]:
# importing libraries
import pandas as pd
from datasets import Dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments,Trainer
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Load the training table from the CSV file into a pandas DataFrame.
X = pd.read_csv('table_to_train_the_model.csv')

In [None]:
# Convert the pandas DataFrame into a `datasets.Dataset`.
# preserve_index=False: do not carry the DataFrame index over as an extra column.
dataset = Dataset.from_pandas(X, preserve_index=False)

In [None]:
# Split the data 70/30: 70% becomes the training set, the remaining 30% is
# held out (it is split again into validation and test sets afterwards).
# A fixed seed makes the shuffle -- and therefore every downstream metric --
# reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.3, seed=42)
raw_train_ds = dataset["train"]  # Training dataset
remaining_data = dataset["test"]  # Held-out 30%, split further below

In [None]:
# Split the held-out data in half to obtain the testing and validation datasets.
# A fixed seed keeps this partition reproducible across runs.
dataset_2 = remaining_data.train_test_split(test_size=0.5, seed=42)
raw_test_ds = dataset_2["train"]  # Testing dataset
raw_val_ds = dataset_2["test"]  # Validation dataset

In [None]:
# Display the train, validation and test datasets (the cell's last expression is rendered as output)
raw_train_ds, raw_val_ds, raw_test_ds

In [None]:
# Display the first row of the training dataset as a sanity check
raw_train_ds[0]

In [None]:
# Analyse the target ("no_of_occurance") distribution in each dataset split.
N_BINS = 5  # single source of truth for the number of histogram bins

fig, axs = plt.subplots(1, 3, tight_layout=True)
splits = [("Train", raw_train_ds), ("Validation", raw_val_ds), ("Test", raw_test_ds)]

hist_results = []
for ax, (title, split_ds) in zip(axs, splits):
    ax.set_title(title)
    # Axes.hist returns (counts, bin_edges, patches); bin_edges has N_BINS + 1 entries.
    counts, bin_edges, patches = ax.hist(split_ds["no_of_occurance"], bins=N_BINS)
    hist_results.append((counts, bin_edges, patches))
    # Write each bar's count at the bar's left edge, level with the top of the bar.
    # zip stops after the last count, so the final (right-most) edge is ignored.
    for left_edge, count in zip(bin_edges, counts):
        ax.text(left_edge, count, str(int(count)), weight="bold")

# Keep the per-split histogram outputs available under their original names.
train_distributions, val_distributions, test_distributions = hist_results

In [None]:
# Checkpoint name and training hyper-parameters
BASE_MODEL = "camembert-base"  # pretrained checkpoint to fine-tune
LEARNING_RATE = 2e-5
MAX_LENGTH = 256  # token budget per example (the tokenization step uses the same value)
BATCH_SIZE = 16
EPOCHS = 20

# Load the tokenizer and the model from the same checkpoint.
# num_labels=1 gives the sequence-classification head a single output logit.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=1,
)

In [None]:
# Gather the three raw splits under one mapping so they can all be tokenized
# (and given their "label" attribute) in a single pass.
ds = dict(train=raw_train_ds, validation=raw_val_ds, test=raw_test_ds)


def preprocess_function(examples):
    """Tokenize one dataset row and attach its regression target.

    Args:
        examples: a single row (mapping) holding the text under
            "RelatedLesson" and the numeric target under "no_of_occurance".

    Returns:
        The tokenizer output for the text, truncated/padded to exactly
        MAX_LENGTH tokens, with an extra "label" entry holding the target
        as a float.
    """
    label = examples["no_of_occurance"]
    # Use the notebook-level MAX_LENGTH constant rather than a duplicated
    # literal, so changing the constant actually changes the tokenization.
    encoded = tokenizer(
        examples["RelatedLesson"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    # Float label: the model is trained as a regressor.
    encoded["label"] = float(label)
    return encoded


# Tokenize every split; the raw text/target columns are dropped once encoded.
raw_columns = ["RelatedLesson", "no_of_occurance"]
for split_name, split_ds in list(ds.items()):
    ds[split_name] = split_ds.map(preprocess_function, remove_columns=raw_columns)

In [None]:
# Regression metrics plus a rounding-based accuracy score
def compute_metrics_for_regression(eval_pred):
    """Compute MSE, MAE, R2 and a rounding-based accuracy.

    Args:
        eval_pred: pair ``(predictions, labels)``. Both must support
            ``reshape`` (e.g. NumPy arrays). ``predictions`` may also be a
            tuple, in which case its first element is used.

    Returns:
        dict with keys "mse", "mae", "r2" and "accuracy". "accuracy" is the
        fraction of samples whose absolute error is below 0.5.
    """
    logits, labels = eval_pred
    # Predictions can arrive as a tuple when the model returns extra outputs.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Flatten both sides to 1-D. Subtracting (N,) predictions from (N, 1)
    # labels would otherwise broadcast to an (N, N) matrix and corrupt the
    # accuracy without raising any error.
    predictions = logits.reshape(-1)
    targets = labels.reshape(-1)

    # mse, mae, r2 will be used in training args
    mse = mean_squared_error(targets, predictions)
    mae = mean_absolute_error(targets, predictions)
    r2 = r2_score(targets, predictions)

    # Rounded prediction == true score only if |error| < 0.5,
    # i.e. squared error < 0.25.
    squared_errors = (predictions - targets) ** 2
    accuracy = float((squared_errors < 0.25).mean())

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}

In [None]:
# Training configuration for the fine-tuning run
training_args = TrainingArguments(
    output_dir="../Model Training/camembert-fine-tuned-regression",
    # optimisation
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # evaluate and checkpoint once per epoch, keeping at most two checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # when training ends, reload the checkpoint with the best "accuracy"
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [None]:
# Custom Trainer that replaces the loss with an MSE on the single output logit
class RegressionTrainer(Trainer):
    """Trainer whose loss is the mean squared error between logit and label."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch dict; its "labels" entry is removed before the
                forward pass.
            return_outputs: when True, also return the raw model outputs.
            num_items_in_batch: accepted (and ignored) because newer
                `transformers` releases pass this keyword to `compute_loss`;
                without it those versions raise a TypeError.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        # Labels are popped so the forward pass receives only model inputs.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # First output, first column: the single regression logit per sample.
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Fine-tune the model: train on the training split and report the
# regression metrics on the validation split.
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics_for_regression,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
)

trainer.train()

In [None]:
# Save the model to the "model" directory and the tokenizer to the "tokenizer" directory
model.save_pretrained("model")
tokenizer.save_pretrained("tokenizer")