# Finetuning BERT
Notebook containing the steps necessary to fine-tune BERT on a paraphrased dataset

### Load data

In [23]:
import os

# Data paths

# every input CSV lives in the sibling "data" directory
data_dir = os.path.join("..", "data")

# original Twitter data
path = os.path.join(data_dir, "original_Twitter_data.csv")

# original Twitter data with the 1st round of paraphrased tweets (Mistral)
path2 = os.path.join(data_dir, "train_paraphrased_w_mistral_seed42_1.csv")

# original Twitter data with the 2nd round of paraphrased tweets (Mistral)
path3 = os.path.join(data_dir, "train_paraphrased_w_mistral_seed42_2.csv")

# original Twitter data paraphrased with GPT-4
path4 = os.path.join(data_dir, "train_paraphrased_with_gpt4.csv")

### Functions

The following cell defines four functions:
1. Function to **load and prepare dataset**
2. Function to **split dataset into test, train and validation sets**
3. Function to **tokenize dataset**
4. Function **defining evaluation metrics** 

In [36]:
from datasets import Dataset
from datasets import DatasetDict
import pandas as pd
# Set before any Trainer is created.
# NOTE(review): the TrainingArguments cell output in this notebook warns that
# WANDB_DISABLED is deprecated and will be removed in v5, and suggests
# controlling logging integrations through `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"


####################################################
#####                LOADING DATA              #####
####################################################
def load_and_prepare_dataset(file_path):
    """Read the tweet CSV and return it as a Hugging Face ``Dataset``.

    Steps:
      1. read ``file_path`` with pandas,
      2. print the label distribution of the raw file (i.e. *before* any
         filtering, so the counts can be larger than the returned dataset),
      3. keep only rows whose ``language`` column equals ``'da'``,
      4. keep only the ``text`` and ``label`` columns,
      5. drop exact duplicate (text, label) rows.

    No class balancing is performed.

    Args:
        file_path: path to a CSV with at least ``text``, ``label`` and
            ``language`` columns.

    Returns:
        A ``datasets.Dataset`` with the columns ``text`` and ``label``.
    """
    raw_frame = pd.read_csv(file_path)

    # label distribution of the unfiltered file
    print(f"Number of labels in each category: {raw_frame['label'].value_counts()}")

    # language filter and column selection in a single indexing step
    is_danish = raw_frame['language'] == 'da'
    danish_frame = raw_frame.loc[is_danish, ['text', 'label']]

    unique_frame = danish_frame.drop_duplicates()

    # the frame is handed to from_dict column by column
    hf_dataset = Dataset.from_dict(unique_frame)
    print("Dataset loaded and prepared")
    return hf_dataset


####################################################
##### SPLIT DATASET INTO TRAIN, VALID AND TEST #####
####################################################
def split_dataset(dataset, path, seed=42):
    """Assemble the train / valid / test ``DatasetDict``.

    The training split is read from the CSV at ``path`` (rows with
    ``org_or_new == 0``), while the validation and test splits are cut from
    ``dataset``: 40% of ``dataset`` is held out and then halved.

    NOTE(review): the remaining 60% of ``dataset`` is computed by the first
    split but never used; the training data comes exclusively from the CSV.
    The "train (60%)" message below therefore describes how ``dataset`` was
    partitioned, not the size of the returned train split. Confirm this is
    the intended experimental design.

    Args:
        dataset: ``datasets.Dataset`` with the original tweets.
        path: CSV holding the training rows; must contain an ``org_or_new``
            column and a text column named ``New``, ``paraphrased_text`` or
            ``text``.
        seed: random seed forwarded to both ``train_test_split`` calls.

    Returns:
        ``DatasetDict`` with the keys ``train``, ``valid`` and ``test``.
    """
    # training rows come from the paraphrasing CSV
    train_frame = pd.read_csv(path)

    # keep only the rows flagged with org_or_new == 0
    keep_mask = train_frame['org_or_new'] == 0
    train_frame = train_frame[keep_mask]
    print("Number of paraphrasings: ", len(train_frame))

    # whichever name the text column has in the CSV, it ends up as "text";
    # the flag column is no longer needed
    train_frame = (
        train_frame
        .rename(columns={"New": "text"})
        .drop(columns=["org_or_new"])
        .rename(columns={"paraphrased_text": "text"})
    )
    train_split = Dataset.from_dict(train_frame)

    # hold out 40% of the original data, then halve it: 20% valid, 20% test
    held_out = dataset.train_test_split(test_size=0.4, seed=seed)['test']
    halves = held_out.train_test_split(test_size=0.5, seed=seed)

    dataset_splitted_dict = DatasetDict({
        'train': train_split,
        'valid': halves['train'],
        'test': halves['test'],
    })

    print("Dataset splitted into train (60%), valid (20%) and test (20%)")

    # report the size of every split
    print("Length of train dataset: ", len(dataset_splitted_dict['train']))
    print("Length of valid dataset: ", len(dataset_splitted_dict['valid']))
    print("Length of test dataset: ", len(dataset_splitted_dict['test']))

    print("")

    return dataset_splitted_dict

####################################################
#####             TOKENIZE DATASET             #####
####################################################
from transformers import AutoTokenizer
from datasets import ClassLabel

def tokenize_dataset(dataset, model_name="NbAiLab/nb-bert-large", max_length=128):
    """Tokenize every split and turn the string labels into integer ids.

    Args:
        dataset: ``DatasetDict`` that has a ``train`` split; each split needs
            a ``text`` and a ``label`` column.
        model_name: checkpoint whose tokenizer is loaded.
        max_length: every example is padded / truncated to this many tokens.

    Returns:
        The mapped dataset. The column names of the ``train`` split are
        removed from every split, leaving the tokenizer outputs plus an
        integer ``label`` (negative=0, neutral=1, positive=2).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # the position in `names` is the integer id assigned to each label
    label_feature = ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'])

    def encode_batch(batch):
        # `batch` holds lists of values because map() is called with batched=True
        encoded = tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        encoded['label'] = label_feature.str2int(batch['label'])
        return encoded

    # the raw columns are replaced by what encode_batch returns
    raw_columns = dataset['train'].column_names
    tokenized_dataset = dataset.map(encode_batch, batched=True, remove_columns=raw_columns)

    print("Dataset tokenized")

    return tokenized_dataset

####################################################
#####              EVALUATION METRICS          #####
####################################################
import numpy as np
import evaluate

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each
            example is the argmax over the last axis of ``logits``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``.
    """
    weighted_names = ("precision", "recall", "f1")

    # load all four metric implementations up front
    loaded = {name: evaluate.load(name) for name in ("accuracy",) + weighted_names}

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # accuracy takes no averaging argument; the others are class-weighted
    scores = {
        "accuracy": loaded["accuracy"].compute(predictions=predictions, references=labels)["accuracy"]
    }
    for name in weighted_names:
        result = loaded[name].compute(predictions=predictions, references=labels, average="weighted")
        scores[name] = result[name]
    return scores

In [37]:
from transformers import AutoModelForSequenceClassification

# Single source of truth for the checkpoint: the tokenizer and the model must
# come from the same checkpoint, so the name is passed explicitly to both
# instead of relying on a default in one place and a literal in another.
MODEL_NAME = "NbAiLab/nb-bert-large"

print("Loading and preparing dataset...")
dataset = load_and_prepare_dataset(path)

print("Splitting dataset...")
# path2 = original data + 1st round of Mistral paraphrases
dataset_splitted_dict = split_dataset(dataset, path2)

print("Tokenizing dataset...")
tokenized_dataset = tokenize_dataset(dataset_splitted_dict, model_name=MODEL_NAME)

print(f"Loading model ({MODEL_NAME})...")

# three output classes: negative / neutral / positive
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

Loading and preparing dataset...
Number of labels in each category: label
negative    1525
neutral     1281
positive    1000
Name: count, dtype: int64
Dataset loaded and prepared
Splitting dataset...
Number of paraphrasings:  1174
Dataset splitted into train (60%), valid (20%) and test (20%)
Length of train dataset:  1174
Length of valid dataset:  714
Length of test dataset:  715

Tokenizing dataset...



tokenizer_config.json: 100%|██████████| 506/506 [00:00<00:00, 14.8kB/s]

[A
sentencepiece.bpe.model: 100%|██████████| 1.09M/1.09M [00:00<00:00, 2.32MB/s]

[A
tokenizer.json: 100%|██████████| 3.80M/3.80M [00:00<00:00, 5.79MB/s]

special_tokens_map.json: 100%|██████████| 167/167 [00:00<00:00, 54.5kB/s]

Map: 100%|██████████| 1174/1174 [00:00<00:00, 14542.82 examples/s]

Map: 100%|██████████| 714/714 [00:00<00:00, 20099.69 examples/s]

Map: 100%|██████████| 715/715 [00:00<00:00, 20643.82 examples/s]


Dataset tokenized
Loading model (NbAiLab/nb-bert-large)...



config.json: 100%|██████████| 548/548 [00:00<00:00, 205kB/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
model.safetensors: 100%|██████████| 498M/498M [00:20<00:00, 23.8MB/s]
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at vesteinn/DanskBERT and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Checking the label distribution in each split (before specifying the training arguments)

In [32]:
import numpy as np  
# label distribution per split, printed in the order train, test, valid
for split_name in ('train', 'test', 'valid'):
    split_labels = dataset_splitted_dict[split_name]['label']
    print(np.unique(split_labels, return_counts=True))

(array(['negative', 'neutral', 'positive'], dtype='<U8'), array([468, 379, 327]))
(array(['negative', 'neutral', 'positive'], dtype='<U8'), array([309, 221, 185]))
(array(['negative', 'neutral', 'positive'], dtype='<U8'), array([268, 252, 194]))


## Specifying training arguments

In [38]:
from transformers import TrainingArguments

# hyperparameters
batch_size = 8        # used for both training and evaluation
epochs = 4
learning_rate = 1e-5

# all keyword arguments collected in one place so they are easy to scan
training_kwargs = dict(
    output_dir="test_trainer",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=10,
    # evaluate and save once per epoch, then reload the best checkpoint;
    # NOTE(review): presumably save_strategy is set to match
    # evaluation_strategy because load_best_model_at_end needs it - confirm
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    remove_unused_columns=False,
    run_name="test_trainer",
)

training_args = TrainingArguments(**training_kwargs)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## Initializing trainer

In [39]:
from transformers import Trainer

# Splits handed to the Trainer.
# NOTE(review): the per-epoch evaluation runs on the 'test' split, and
# training_args sets load_best_model_at_end=True, so the checkpoint that is
# kept is chosen on 'test'. The training-loop cell then writes its
# classification report from the 'valid' split. Confirm that this assignment
# of roles is intended.
train_split = tokenized_dataset['train']
eval_split = tokenized_dataset['test']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

# Training loop

In [40]:
from sklearn.metrics import classification_report
from transformers import set_seed

# Number of independent fine-tuning runs; one classification report is
# written per run. Increase (e.g. to 10) for a full experiment.
n_runs = 2

# Class index -> label name, in the same order used when the labels were
# encoded in tokenize_dataset.
label_names = ["negative", "neutral", "positive"]

# Checkpoint settings are read from the model loaded earlier, so every run
# restarts from the same pretrained weights as that cell.
checkpoint_name = model.config.name_or_path
num_labels = model.config.num_labels

# Make sure the output directory exists *before* spending time on training.
# change the name of the output according to what you're testing
report_dir = "../classification_reports_twitter"
report_suffix = "classification_report_org_twitter_plus_1x_mistral_para.csv"
os.makedirs(report_dir, exist_ok=True)

# Load the metric implementations once instead of on every call / iteration.
end_metrics = {name: evaluate.load(name) for name in ("accuracy", "precision", "recall", "f1")}


def compute_metrics_end(preds, refs):
    """Accuracy and weighted precision/recall/F1 for already-argmaxed predictions.

    Args:
        preds: predicted class indices.
        refs: true class indices.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``.
    """
    scores = {"accuracy": end_metrics["accuracy"].compute(predictions=preds, references=refs)["accuracy"]}
    for name in ("precision", "recall", "f1"):
        scores[name] = end_metrics[name].compute(predictions=preds, references=refs, average="weighted")[name]
    return scores


def softmax(logits):
    """Row-wise softmax over the last axis, computed with NumPy.

    Replaces the TensorFlow call so that TensorFlow does not have to be
    imported next to the PyTorch-based Trainer just for this.
    """
    shifted = logits - np.max(logits, axis=-1, keepdims=True)  # numerical stability
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=-1, keepdims=True)


for run in range(n_runs):
    # Seed before instantiating the model so the randomly initialised
    # classification head is reproducible and differs between runs.
    set_seed(training_args.seed + run)

    # Start every run from the pretrained checkpoint. Calling train() again on
    # the same Trainer would continue from the previous run's fine-tuned
    # weights, so the runs would not be independent.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=num_labels)
    trainer = None  # release the previous Trainer (and its model) before building a new one
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['test'],  # same split as the "Initializing trainer" cell
        compute_metrics=compute_metrics,
    )

    trainer.train()

    trainer.evaluate()

    # model predictions for the validation data
    predictions_val = trainer.predict(tokenized_dataset["valid"])

    # the predicted class is the one with the highest logit
    preds_val_val = np.argmax(predictions_val.predictions, axis=-1)

    # probabilities instead of logits
    predictions_probabilities = softmax(predictions_val.predictions)

    metrics_val = compute_metrics_end(preds=preds_val_val, refs=predictions_val.label_ids)

    # model predictions for the test data
    predictions_test = trainer.predict(tokenized_dataset["test"])

    preds_test_test = np.argmax(predictions_test.predictions, axis=-1)

    predictions_probabilities_test = softmax(predictions_test.predictions)

    metrics_test = compute_metrics_end(preds=preds_test_test, refs=predictions_test.label_ids)

    print(metrics_test)
    print(metrics_val)

    # Per-example overview of the validation predictions. It is kept in its
    # own variable (not written to disk) so it can be inspected after the run.
    data = {'Predicted Labels': [label_names[pred] for pred in preds_val_val],
            'True Labels': [label_names[ref] for ref in predictions_val.label_ids],
            'Misclassification': ["TRUE" if pred == ref else 'MISS'
                                  for pred, ref in zip(preds_val_val, predictions_val.label_ids)],
            'Text': dataset_splitted_dict['valid']['text'],
            'Logit Values': [str(row) for row in predictions_val.predictions],
            'Probabilities': [str(row) for row in predictions_probabilities]}
    predictions_df = pd.DataFrame(data)

    # Classification report on the validation split. `labels` pins the row
    # order and keeps the report well-defined even if a class is missing from
    # the predictions.
    report = classification_report(
        predictions_val.label_ids,
        preds_val_val,
        labels=list(range(len(label_names))),
        target_names=label_names,
        output_dict=True,
    )

    # save classification report to csv (one file per run, 1-based prefix)
    df = pd.DataFrame(report).transpose()
    df.to_csv(f"{report_dir}/{run + 1}{report_suffix}")

  0%|          | 2/588 [13:34<66:18:38, 407.37s/it]
  2%|▏         | 10/588 [00:37<21:20,  2.22s/it] 
  2%|▏         | 10/588 [00:37<21:20,  2.22s/it]  

{'loss': 1.0765, 'learning_rate': 9.829931972789115e-06, 'epoch': 0.07}


  2%|▏         | 12/588 [00:42<23:34,  2.46s/it]

KeyboardInterrupt: 