In [129]:
import os

# Location of the raw tweets CSV, relative to the notebook's working directory
data_dir, data_file = "data", "tweets_data_temp.csv"
path = os.path.join(data_dir, data_file)

### Functions

In [130]:
from datasets import Dataset
import pandas as pd


# Load and preprocess the dataset
def load_and_prepare_dataset(file_path):
    """Load the tweets CSV and return it as a cleaned Hugging Face ``Dataset``.

    The CSV is read with pandas, reduced to the rows whose ``language``
    column equals ``'da'``, reduced to the ``text`` and ``label`` columns,
    and stripped of exact duplicate rows before conversion.

    Args:
        file_path: Path of the CSV file. It must provide the columns
            ``language``, ``text`` and ``label``.

    Returns:
        A ``datasets.Dataset`` with the two columns ``text`` and ``label``.

    Raises:
        KeyError: Raised by pandas when one of the required columns is missing.
    """
    # Load the dataset
    tweets_raw = pd.read_csv(file_path)

    # Remove all rows where language is not 'da'
    tweets_da = tweets_raw[tweets_raw['language'] == 'da']

    # Keep only 'text' and 'label', then remove all duplicates
    tweets_clean = tweets_da[['text', 'label']].drop_duplicates()

    # Convert with the dedicated pandas constructor rather than handing a
    # DataFrame to `Dataset.from_dict`, which is meant for a plain mapping.
    # Filtering left gaps in the index, so it is reset and kept out of the
    # resulting columns.
    dataset = Dataset.from_pandas(tweets_clean.reset_index(drop=True), preserve_index=False)

    print("Dataset loaded and prepared")

    return dataset

# Split the dataset and convert into a Hugging Face DatasetDict
from datasets import DatasetDict

def split_dataset(dataset, seed=42):
    """Split a dataset into train / valid / test parts (60% / 20% / 20%).

    Besides returning the splits, the training split is written to
    ``data/train.csv`` as a side effect.

    Args:
        dataset: Object exposing ``train_test_split(test_size=..., seed=...)``.
        seed: Seed passed to both split operations.

    Returns:
        A ``DatasetDict`` with the keys ``'train'``, ``'valid'`` and ``'test'``.
    """
    # First cut: 60% for training, 40% held out
    train_and_holdout = dataset.train_test_split(test_size=0.4, seed=seed)
    train_part = train_and_holdout['train']

    # Second cut: the held-out 40% is halved into validation and test
    valid_and_test = train_and_holdout['test'].train_test_split(test_size=0.5, seed=seed)

    splits = {
        'train': train_part,
        'valid': valid_and_test['train'],
        'test': valid_and_test['test'],
    }
    dataset_splitted_dict = DatasetDict(splits)

    print("Dataset splitted into train (60%), valid (20%) and test (20%)")

    # Keep a CSV copy of the training split next to the raw data
    train_csv_path = os.path.join("data", "train.csv")
    dataset_splitted_dict['train'].to_csv(train_csv_path)

    return dataset_splitted_dict

# Tokenize the dataset 
from transformers import AutoTokenizer
from datasets import ClassLabel

def tokenize_dataset(dataset, model_name="NbAiLab/nb-bert-large", max_length=128):
    """Tokenize the ``text`` column and turn the string labels into integers.

    Args:
        dataset: Mapping of splits that has a ``'train'`` entry and supports
            ``map``; each example provides ``text`` and ``label``.
        model_name: Checkpoint whose tokenizer is loaded.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The result of ``dataset.map``: the original columns are removed and
        replaced by the tokenizer output plus an integer ``label`` column.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Maps the three sentiment names to their integer ids
    label_feature = ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'])

    def encode_batch(batch):
        """Tokenize one batch of texts and attach the integer labels."""
        encoded = tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        encoded['label'] = label_feature.str2int(batch['label'])
        return encoded

    # The columns of the 'train' split are the ones dropped from every split
    columns_to_drop = dataset['train'].column_names

    # batched=True hands several examples to the tokenizer at once
    return dataset.map(encode_batch, batched=True, remove_columns=columns_to_drop)

# evaluation metrics
import numpy as np
import evaluate

# Metric objects are loaded lazily and kept here, so every evaluation pass
# reuses them instead of calling `evaluate.load` again.
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. The last three use ``average="weighted"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    scores = {
        "accuracy": _get_metric("accuracy").compute(
            predictions=predictions, references=labels
        )["accuracy"]
    }
    # The remaining metrics share the same call shape and averaging mode
    for name in ("precision", "recall", "f1"):
        scores[name] = _get_metric(name).compute(
            predictions=predictions, references=labels, average="weighted"
        )[name]
    return scores

In [131]:
# Single source of truth for the checkpoint, so the tokenizer and the model
# are always loaded from the same pretrained weights.
model_name = "NbAiLab/nb-bert-large"

print("Loading and preparing dataset...")
dataset = load_and_prepare_dataset(path)

print("Splitting dataset...")
dataset_splitted_dict = split_dataset(dataset)

print("Tokenizing dataset...")
tokenized_dataset = tokenize_dataset(dataset_splitted_dict, model_name=model_name)

print(f"Loading model ({model_name})...")
from transformers import AutoModelForSequenceClassification
# num_labels=3: one output per sentiment class (negative / neutral / positive)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

Loading and preparing dataset...
Dataset loaded and prepared
Splitting dataset...
Dataset splitted into train (60%), valid (20%) and test (20%)
Tokenizing dataset...


Map: 100%|██████████| 2143/2143 [00:00<00:00, 12322.54 examples/s]
Map: 100%|██████████| 714/714 [00:00<00:00, 20472.47 examples/s]
Map: 100%|██████████| 715/715 [00:00<00:00, 19011.48 examples/s]


Loading model (NbAiLab/nb-bert-large)...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/nb-bert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Specifying training args

In [103]:
from transformers import TrainingArguments

# Hyperparameters
batch_size = 16  # applied to both the training and the evaluation batches
epochs = 5
learning_rate = 2e-4

# NOTE(review): the training log stored in this notebook ends at epoch 3.0
# after 804 steps, so it was apparently produced with other arguments than
# the ones below - re-run before relying on the recorded numbers.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir="test_trainer",
    run_name="test_trainer",
    logging_dir="logs",
    logging_steps=10,
    # optimisation
    num_train_epochs=epochs,
    learning_rate=learning_rate,
    warmup_steps=100,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and checkpoint once per epoch; the save strategy is set to the
    # same value as the evaluation strategy because the best checkpoint is
    # reloaded when training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    remove_unused_columns=False,
)

Initializing trainer

In [107]:
from transformers import Trainer

# NOTE(review): the per-epoch evaluation runs on the 'test' split, while the
# classification report further down is computed on 'valid' - the two splits
# are used the other way round than their names suggest; confirm this is intended.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

In [108]:
# Fine-tune the model; the returned TrainOutput is shown as the cell result
trainer.train()

  3%|▎         | 23/804 [19:00<10:45:37, 49.60s/it]
 33%|███▎      | 268/804 [04:43<15:55,  1.78s/it]
Downloading builder script: 100%|██████████| 7.55k/7.55k [00:00<00:00, 2.51MB/s]

Downloading builder script: 100%|██████████| 7.36k/7.36k [00:00<00:00, 14.5MB/s]

Downloading builder script: 100%|██████████| 6.77k/6.77k [00:00<00:00, 9.26MB/s]
  _warn_prf(average, modifier, msg_start, len(result))
                                                 
 33%|███▎      | 268/804 [05:20<15:55,  1.78s/it]

{'eval_loss': 1.0856143236160278, 'eval_accuracy': 0.43216783216783217, 'eval_precision': 0.18676903516064355, 'eval_recall': 0.43216783216783217, 'eval_f1': 0.26082003933566433, 'eval_runtime': 37.7255, 'eval_samples_per_second': 18.953, 'eval_steps_per_second': 2.386, 'epoch': 1.0}


 62%|██████▏   | 500/804 [09:21<05:08,  1.01s/it]  

{'loss': 1.0938, 'learning_rate': 1.890547263681592e-05, 'epoch': 1.87}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
 67%|██████▋   | 536/804 [10:31<04:28,  1.00s/it]

{'eval_loss': 1.0793061256408691, 'eval_accuracy': 0.43216783216783217, 'eval_precision': 0.18676903516064355, 'eval_recall': 0.43216783216783217, 'eval_f1': 0.26082003933566433, 'eval_runtime': 28.2783, 'eval_samples_per_second': 25.284, 'eval_steps_per_second': 3.183, 'epoch': 2.0}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
100%|██████████| 804/804 [15:34<00:00,  1.16s/it]

{'eval_loss': 1.0448379516601562, 'eval_accuracy': 0.43216783216783217, 'eval_precision': 0.18676903516064355, 'eval_recall': 0.43216783216783217, 'eval_f1': 0.26082003933566433, 'eval_runtime': 28.0049, 'eval_samples_per_second': 25.531, 'eval_steps_per_second': 3.214, 'epoch': 3.0}
{'train_runtime': 934.1242, 'train_samples_per_second': 6.882, 'train_steps_per_second': 0.861, 'train_loss': 1.0959507956433652, 'epoch': 3.0}





TrainOutput(global_step=804, training_loss=1.0959507956433652, metrics={'train_runtime': 934.1242, 'train_samples_per_second': 6.882, 'train_steps_per_second': 0.861, 'train_loss': 1.0959507956433652, 'epoch': 3.0})

In [109]:
# Called without arguments, so this presumably scores the eval_dataset the
# Trainer was constructed with (the 'test' split) - confirm against the Trainer docs
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 90/90 [00:27<00:00,  3.25it/s]


{'eval_loss': 1.0448379516601562,
 'eval_accuracy': 0.43216783216783217,
 'eval_precision': 0.18676903516064355,
 'eval_recall': 0.43216783216783217,
 'eval_f1': 0.26082003933566433,
 'eval_runtime': 45.8823,
 'eval_samples_per_second': 15.583,
 'eval_steps_per_second': 1.962,
 'epoch': 3.0}

In [133]:
# get a classification report
from sklearn.metrics import classification_report

# Run inference on the validation split once and reuse the result for both
# the reference labels and the predicted classes (each predict call is a
# full pass over the split).
valid_output = trainer.predict(tokenized_dataset['valid'])

report = classification_report(
    valid_output.label_ids,
    valid_output.predictions.argmax(-1),
    target_names=['negative', 'neutral', 'positive'],
    output_dict=True,
)

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 90/90 [00:27<00:00,  3.22it/s]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 90/90 [00:27<00:00,  3.33it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [137]:
# classification report as a dataframe (built once, then printed and saved)
report_df = pd.DataFrame(report).transpose()
print(report_df)

# save the report as a csv file; the target folder is created first because
# to_csv does not create missing directories
report_path = 'classification_reports/classification_report_without_paraphrasings.csv'
os.makedirs(os.path.dirname(report_path), exist_ok=True)
report_df.to_csv(report_path)

              precision    recall  f1-score    support
negative       0.375350  1.000000  0.545825  268.00000
neutral        0.000000  0.000000  0.000000  252.00000
positive       0.000000  0.000000  0.000000  194.00000
accuracy       0.375350  0.375350  0.375350    0.37535
macro avg      0.125117  0.333333  0.181942  714.00000
weighted avg   0.140888  0.375350  0.204875  714.00000
