In [9]:
import os

# Data path
path = os.path.join(".." , "data" , "tweets_data_temp.csv")

### Functions

In [155]:
from datasets import Dataset
import pandas as pd


# Load and preprocess the dataset

def load_and_prepare_dataset(file_path):
    # Load the dataset
    dataset = pd.read_csv(file_path)
    
    #print the number of times a label is positive, negative or neutral
    print(dataset['label'].value_counts())
    # equalize dataset making sure there are the same number of positive, negative and neutral tweets
    dataset = dataset.groupby('label').head(1000).reset_index(drop=True)
    print(f'New sizes: {dataset["label"].value_counts()}')
    dataset = dataset[:100]

    # Remove all rows where language is not 'da'
    dataset = dataset[dataset['language'] == 'da']

    # Remove all columns except 'text' and 'label'
    dataset = dataset[['text', 'label']]

    # Remove all duplicates
    dataset = dataset.drop_duplicates()

    dataset_pd = dataset[:300]

    # Convert to dict and then to a Hugging Face Dataset
    dataset = Dataset.from_dict(dataset)

    print("Dataset loaded and prepared")
    print(dataset)

    return dataset, dataset_pd

# Split the dataset and convert into a Hugging Face DatasetDict
from datasets import DatasetDict

def split_dataset(dataset, seed=42):
    # 60% train, 20% validation, 20% test
    train_test = dataset.train_test_split(test_size=0.4, seed=seed) 
    test_valid = train_test['test'].train_test_split(test_size=0.5, seed=seed)

    # combine train, test and valid to one dictionary
    dataset_splitted_dict = DatasetDict({
        'train': train_test['train'],
        'valid': test_valid['train'],
        'test': test_valid['test']})
    
    print("Dataset splitted into train (60%), valid (20%) and test (20%)")

    # output the train dataset as a csv file
    dataset_splitted_dict['train'].to_csv(os.path.join("..", "data", "train.csv"))

    # print the length of the train dataset
    print("Length of train dataset: ", len(dataset_splitted_dict['train']))
    print("Length of valid dataset: ", len(dataset_splitted_dict['valid']))
    print("Length of test dataset: ", len(dataset_splitted_dict['test']))

    return dataset_splitted_dict

# Tokenize the dataset 
from transformers import AutoTokenizer
from datasets import ClassLabel

def tokenize_dataset(dataset, model_name="cardiffnlp/twitter-xlm-roberta-base", max_length=128):
    # defining the labels
    labels_cl = ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'])

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # defining a function to tokenize the text and translate all labels into integers instead of strings
    def tokenize_function(example):
        tokens = tokenizer(example["text"], padding="max_length", truncation=True, max_length=max_length)
        tokens['label'] = labels_cl.str2int(example['label'])
        return tokens

    # actually tokenizing the dataset
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset['train'].column_names) # batched=True speeds up tokenization by allowing to process multiple lines at once


    print("Dataset tokenized")

    return tokenized_dataset

# evaluation metrics
import numpy as np
import evaluate

def compute_metrics(eval_pred):
    metric0 = evaluate.load("accuracy")
    metric1 = evaluate.load("precision")
    metric2 = evaluate.load("recall")
    metric3 = evaluate.load("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = metric0.compute(predictions=predictions, references=labels)["accuracy"]
    precision = metric1.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall = metric2.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    f1 = metric3.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [156]:
print("Loading and preparing dataset...")
dataset, dataset_pd = load_and_prepare_dataset(path)
print(dataset)

print("Splitting dataset...")
dataset_splitted_dict = split_dataset(dataset)

print("Tokenizing dataset...")
tokenized_dataset = tokenize_dataset(dataset_splitted_dict)

print("Loading model (NbAiLab/nb-bert-large)...")

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base", num_labels=3)

Loading and preparing dataset...
label
negative    1525
neutral     1281
positive    1000
Name: count, dtype: int64
New sizes: label
negative    1000
positive    1000
neutral     1000
Name: count, dtype: int64
Dataset loaded and prepared
Dataset({
    features: ['text', 'label'],
    num_rows: 99
})
Dataset({
    features: ['text', 'label'],
    num_rows: 99
})
Splitting dataset...
Dataset splitted into train (60%), valid (20%) and test (20%)



Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 484.50ba/s]

Length of train dataset:  59
Length of valid dataset:  20
Length of test dataset:  20
Tokenizing dataset...




Map: 100%|██████████| 59/59 [00:00<00:00, 3971.75 examples/s]

Map: 100%|██████████| 20/20 [00:00<00:00, 5175.92 examples/s]

Map: 100%|██████████| 20/20 [00:00<00:00, 4873.41 examples/s]


Dataset tokenized
Loading model (NbAiLab/nb-bert-large)...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [157]:
# Assuming `df` is your DataFrame and 'label' your column name
# First, create a dictionary that maps the current labels to the new numeric values
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

# Use the replace method to update the 'label' column
dataset_pd['label'] = dataset_pd['label'].replace(label_mapping)

# Now df['label'] should contain the numeric values 0, 1, 2 instead of the string labels

In [158]:
#train test split of the pandas dataframe
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(dataset_pd, test_size=0.2, random_state=42)

In [174]:
# usning simple transformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

model_args = ClassificationArgs(num_train_epochs=3)

# Create a ClassificationModel
model = ClassificationModel(
    'bert',
    'NbAiLab/nb-bert-large',
    num_labels=3,
    args=model_args,
    use_cuda=False
) 

# Train the model
model.train_model(train_df)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/nb-bert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Output directory (outputs/) already exists and is not empty. Set overwrite_output_dir: True to automatically overwrite.

In [173]:
from sklearn.metrics import f1_score, accuracy_score


def f1_multiclass(labels, preds):
    return f1_score(labels, preds, average='micro')
    

result, model_outputs, wrong_predictions = model.eval_model(test_df, f1=f1_multiclass, acc=accuracy_score)


# get a classification report 
from sklearn.metrics import classification_report
y_true = test_df['label']
y_pred = model_outputs.argmax(axis=1)
target_names = ['negative', 'neutral', 'positive']
print(classification_report(y_true, y_pred, target_names=target_names))

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
Python(6669) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python(6670) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python(6671) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The cur

              precision    recall  f1-score   support

    negative       0.35      0.75      0.48         8
     neutral       0.00      0.00      0.00         8
    positive       0.33      0.25      0.29         4

    accuracy                           0.35        20
   macro avg       0.23      0.33      0.26        20
weighted avg       0.21      0.35      0.25        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [171]:
print(result)
print(model_outputs)
print(wrong_predictions)

{'mcc': -0.04950737714883372, 'f1': 0.35, 'acc': 0.35, 'eval_loss': 1.1298232475916545}
[[ 1.95080474e-01 -1.74649432e-01 -1.04023919e-01]
 [ 7.08302796e-01  3.16931792e-02 -4.08638567e-01]
 [ 2.86453724e-01 -2.92465866e-01 -2.92772382e-01]
 [ 6.71456158e-01  1.39731169e-01 -4.92017478e-01]
 [ 2.20730767e-01 -2.07033291e-01 -9.06951576e-02]
 [ 3.39657485e-01 -1.90712675e-01 -6.74650967e-02]
 [ 1.75995007e-03 -2.53603041e-01 -1.59949791e-02]
 [ 9.43301171e-02 -2.87042081e-01 -1.11295491e-01]
 [ 1.76694021e-01 -3.36882830e-01 -3.68512660e-01]
 [ 9.80528593e-02 -4.33587670e-01  2.23039404e-01]
 [ 9.27601010e-04 -3.64341319e-01  1.54874668e-01]
 [ 5.71119845e-01 -6.36255518e-02 -2.40721121e-01]
 [ 1.27076596e-01 -2.12510571e-01 -2.66325057e-01]
 [ 1.83867738e-01 -3.84387314e-01 -3.13319534e-01]
 [ 1.82468370e-01 -2.34703198e-01 -1.08885556e-01]
 [ 1.75938472e-01 -2.07500592e-01 -1.70884892e-01]
 [ 1.57920465e-01 -2.65615702e-01 -1.62593961e-01]
 [-9.81983393e-02 -2.77918935e-01  3.59228589

In [166]:
print(result)
print("")
print(model_outputs)
print("")
print(wrong_predictions)

{'mcc': -0.04950737714883372, 'eval_loss': 1.1298232475916545}

[[ 1.95080474e-01 -1.74649432e-01 -1.04023919e-01]
 [ 7.08302796e-01  3.16931792e-02 -4.08638567e-01]
 [ 2.86453724e-01 -2.92465866e-01 -2.92772382e-01]
 [ 6.71456158e-01  1.39731169e-01 -4.92017478e-01]
 [ 2.20730767e-01 -2.07033291e-01 -9.06951576e-02]
 [ 3.39657485e-01 -1.90712675e-01 -6.74650967e-02]
 [ 1.75995007e-03 -2.53603041e-01 -1.59949791e-02]
 [ 9.43301171e-02 -2.87042081e-01 -1.11295491e-01]
 [ 1.76694021e-01 -3.36882830e-01 -3.68512660e-01]
 [ 9.80528593e-02 -4.33587670e-01  2.23039404e-01]
 [ 9.27601010e-04 -3.64341319e-01  1.54874668e-01]
 [ 5.71119845e-01 -6.36255518e-02 -2.40721121e-01]
 [ 1.27076596e-01 -2.12510571e-01 -2.66325057e-01]
 [ 1.83867738e-01 -3.84387314e-01 -3.13319534e-01]
 [ 1.82468370e-01 -2.34703198e-01 -1.08885556e-01]
 [ 1.75938472e-01 -2.07500592e-01 -1.70884892e-01]
 [ 1.57920465e-01 -2.65615702e-01 -1.62593961e-01]
 [-9.81983393e-02 -2.77918935e-01  3.59228589e-02]
 [ 1.56672657e-01 

In [164]:
# Make predictions with the model
predictions, raw_outputs = model.predict(["Sam was a Wizard"])

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
Python(6519) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python(6521) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python(6522) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The cur

Specifying training args

In [129]:
from transformers import TrainingArguments

batch_size = 4 # stating batch size
epochs = 2
learning_rate = 2e-5

training_args = TrainingArguments(output_dir="test_trainer",
                                  num_train_epochs=epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  learning_rate=learning_rate,
                                  weight_decay=0.01,
                                  logging_dir="logs",
                                  logging_steps=10,
                                  load_best_model_at_end=True,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",  # Add this line
                                  remove_unused_columns=False,
                                  run_name="test_trainer")

Initializing trainer

In [130]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics
)

In [131]:
trainer.train()

  0%|          | 0/420 [02:02<?, ?it/s]
  0%|          | 0/30 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 6.07 GB, other allocations: 11.64 GB, max allowed: 18.13 GB). Tried to allocate 732.43 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [92]:
trainer.evaluate()

import tensorflow as tf

# creating model predictions for the validation data
predictions_val = trainer.predict(tokenized_dataset["valid"])

# choosing the prediction that has the highest probability 
preds_val_val = np.argmax(predictions_val.predictions, axis=-1)

# calculating the probabilities instead of logits from each
predictions_probabilities = tf.nn.softmax(predictions_val.predictions)

def compute_metrics_end(preds, refs):
    metric0 = evaluate.load("accuracy")
    metric1 = evaluate.load("precision")
    metric2 = evaluate.load("recall")
    metric3 = evaluate.load("f1")
    
    #logits, labels = eval_pred
    #predictions = np.argmax(logits, axis=-1)
    accuracy = metric0.compute(predictions=preds, references=refs)["accuracy"]
    precision = metric1.compute(predictions=preds, references=refs, average="weighted")["precision"]
    recall = metric2.compute(predictions=preds, references=refs, average="weighted")["recall"]
    f1 = metric3.compute(predictions=preds, references=refs, average="weighted")["f1"]
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

metrics_val = compute_metrics_end(preds=preds_val_val, refs=predictions_val.label_ids)

import tensorflow as tf

# creating model predictions for the validation data
predictions_test = trainer.predict(tokenized_dataset["test"])

# choosing the prediction that has the highest probability 
preds_test_test = np.argmax(predictions_test.predictions, axis=-1)

# calculating the probabilities instead of logits from each
predictions_probabilities_test = tf.nn.softmax(predictions_test.predictions)

metrics_test = compute_metrics_end(preds=preds_test_test, refs=predictions_test.label_ids)

print(metrics_test)
print(metrics_val)

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 70/70 [00:23<00:00,  2.93it/s]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 70/70 [00:23<00:00,  3.03it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 70/70 [00:23<00:00,  2.92it/s]


{'accuracy': 0.34285714285714286, 'precision': 0.11755102040816326, 'recall': 0.34285714285714286, 'f1': 0.17507598784194528}
{'accuracy': 0.33989266547406083, 'precision': 0.11552702404306182, 'recall': 0.33989266547406083, 'f1': 0.17244220678256755}


  _warn_prf(average, modifier, msg_start, len(result))


In [104]:
import pandas as pd

data = {'Predicted Labels': ["negative" if i == 0 else "neutral" if i == 1 else "positive" for i in preds_val_val],
        'True Labels': ["negative" if i == 0 else "neutral" if i == 1 else "positive" for i in predictions_val.label_ids],
        'Misclassification': ["TRUE" if preds_val_val[i] == predictions_val.label_ids[i] else 'MISS' for i, val in enumerate(preds_val_val)],
        'Text': dataset_splitted_dict['valid']['text'],
        'Logit Values': [str(i) for i in predictions_val.predictions],
        'Probabilities': [str(i) for i in np.asarray(predictions_probabilities)]}
df = pd.DataFrame(data)

In [105]:
import pandas as pd
from sklearn.metrics import classification_report



# Extract the true and predicted labels
true_labels = df['True Labels']
predicted_labels = df['Predicted Labels']

# Create a mapping for the labels to numbers if needed
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

# Map the labels to numbers using the mapping
true_labels_mapped = true_labels.map(label_mapping)
predicted_labels_mapped = predicted_labels.map(label_mapping)

# Generate the classification report
report = classification_report(true_labels_mapped, predicted_labels_mapped, target_names=label_mapping.keys())

# Print the classification report
print(report)

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       197
     neutral       0.00      0.00      0.00       172
    positive       0.34      1.00      0.51       190

    accuracy                           0.34       559
   macro avg       0.11      0.33      0.17       559
weighted avg       0.12      0.34      0.17       559



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [98]:
df

Unnamed: 0,Predicted Labels,True Labels,Misclassification,Text,Logit Values,Probabilities
0,positive,negative,MISS,"Nu er det så Swedbank og €135 milliarder, og e...",[-0.39701766 -0.42288524 -0.28771487],[0.32362834 0.3153642 0.36100742]
1,positive,negative,MISS,"Aha. Du betegner det som to lag vælgere, og de...",[-0.3970175 -0.42288476 -0.28771502],[0.32362837 0.31536436 0.36100736]
2,positive,positive,TRUE,"Bedre udbud for alle, styrket konkurrence og l...",[-0.39701766 -0.42288476 -0.28771493],[0.3236283 0.31536433 0.3610074 ]
3,positive,positive,TRUE,"Præcis, den med englændere på ferie er også ge...",[-0.39701957 -0.42288882 -0.28771558],[0.3236284 0.31536373 0.3610079 ]
4,positive,negative,MISS,RT @USER: Dagens bedste respons på Rasmus P. d...,[-0.3970187 -0.42288706 -0.28771544],[0.3236284 0.31536397 0.36100763]
...,...,...,...,...,...,...
554,positive,negative,MISS,Det er en bot der ændrer billedet hele tiden.,[-0.39702025 -0.42289037 -0.2877158 ],[0.32362843 0.31536347 0.3610081 ]
555,positive,neutral,MISS,#sundpol læs med hos Zetland. Hvor mange flere...,[-0.39701933 -0.4228887 -0.28771564],[0.3236284 0.3153637 0.36100784]
556,positive,positive,TRUE,RT @USER: Billund Kommune ❤️ jeg er vild med j...,[-0.39701957 -0.422889 -0.28771555],[0.3236284 0.31536368 0.36100793]
557,positive,negative,MISS,Mette Frederiksen til pressen: - I skal opføre...,[-0.39701986 -0.42288983 -0.2877158 ],[0.32362846 0.31536356 0.36100802]


In [95]:
dfdf_metrics_val = pd.DataFrame(metrics_val.items())
df_metrics_test = pd.DataFrame(metrics_test.items())

In [96]:
dfdf_metrics_val

Unnamed: 0,0,1
0,accuracy,0.339893
1,precision,0.115527
2,recall,0.339893
3,f1,0.172442


In [97]:
df_metrics_test

Unnamed: 0,0,1
0,accuracy,0.342857
1,precision,0.117551
2,recall,0.342857
3,f1,0.175076


In [116]:
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 3/3 [00:05<00:00,  1.90s/it]


{'eval_loss': 1.1325998306274414,
 'eval_accuracy': 0.3,
 'eval_precision': 0.1,
 'eval_recall': 0.3,
 'eval_f1': 0.15,
 'eval_runtime': 8.011,
 'eval_samples_per_second': 2.497,
 'eval_steps_per_second': 0.374,
 'epoch': 2.0}

In [117]:
# get a classification report
from sklearn.metrics import classification_report

report = classification_report(trainer.predict(tokenized_dataset['valid']).label_ids, trainer.predict(tokenized_dataset['valid']).predictions.argmax(-1), target_names=['negative', 'neutral', 'positive'], output_dict=True)

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 3/3 [00:05<00:00,  1.79s/it]
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 3/3 [00:05<00:00,  1.99s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [118]:
# classification report as a dataframe
print(pd.DataFrame(report).transpose())

# save the report as a csv file
#pd.DataFrame(report).transpose().to_csv('classification_reports/classification_report_without_paraphrasings.csv')

              precision    recall  f1-score  support
negative       0.000000  0.000000  0.000000     8.00
neutral        0.000000  0.000000  0.000000     6.00
positive       0.263158  0.833333  0.400000     6.00
accuracy       0.250000  0.250000  0.250000     0.25
macro avg      0.087719  0.277778  0.133333    20.00
weighted avg   0.078947  0.250000  0.120000    20.00
