In [28]:
import os

# Data path
path = os.path.join(".." , "data" , "tweets_data_temp.csv")
path2 = os.path.join(".." , "data" , "paraphrasings_on_train_2148rows_seed42.csv")
path3 = os.path.join(".." , "data" , "paraphrasings_on_train_2148rows_seed42_2.csv")

### Functions

In [2]:
import pandas as pd
pd.read_csv(path2).head()
# rename New to text
df = pd.read_csv(path2).rename(columns={"New":"text"})
df
# delete ord_or_new column
df = df.drop(columns=["org_or_new"])
df
df[['text', 'label']]

Unnamed: 0,text,label
0,"Øh, tak! Her er et link til et interview, der ...",positive
1,"det kan være værdt at nævne, at dyr også vari...",neutral
2,"#3 Om ateismen: ""Ateister mener, at religione...",negative
3,Øjeblikkeligt: Vild jubel over forøget politiu...,negative
4,Selvom vi ikke ved det præcise datum for næst...,negative
...,...,...
3312,"Hun er 12 og voksen på den måde, man kun kan v...",negative
3313,Kalder alle Broncos-fans. \n\nDrew Lock har be...,positive
3314,Se masser af billeder fra det traditionelle ra...,neutral
3315,Mere #entrepreneurshipeducation og mere #godun...,positive


In [29]:
os.environ["WANDB_DISABLED"] = "true"

In [38]:
from datasets import Dataset
import pandas as pd

# Load and preprocess the dataset
def load_and_prepare_dataset(file_path):
    # Load the dataset
    dataset = pd.read_csv(file_path)
    #print the number of times a label is positive, negative or neutral
    print(dataset['label'].value_counts())
    # equalize dataset making sure there are the same number of positive, negative and neutral tweets
    # Remove all rows where language is not 'da'
    dataset = dataset[dataset['language'] == 'da']
    # Remove all columns except 'text' and 'label'
    dataset = dataset[['text', 'label']]
    # Remove all duplicates
    dataset = dataset.drop_duplicates()
    # Convert to dict and then to a Hugging Face Dataset
    dataset = Dataset.from_dict(dataset)
    print("Dataset loaded and prepared")
    return dataset#, dataset_pd

# Split the dataset and convert into a Hugging Face DatasetDict
from datasets import DatasetDict

def split_dataset(dataset,path_to_df_train, seed=42): #def split_dataset(dataset, path_to_df_train, path_to_df_train_2, seed=42):
    
    # concatenate the paraphrasings dataset with the original dataset
    paraphrasings = pd.read_csv(path_to_df_train)
    #paraphrasings2 = pd.read_csv(path_to_df_train_2)

    #paraphrasings = pd.concat([paraphrasings, paraphrasings2])

    # remove rows where org_or_new

    # remove duplicates
    #paraphrasings = paraphrasings.drop_duplicates()
    paraphrasings = paraphrasings.rename(columns={"New":"text"})
    paraphrasings = paraphrasings.drop(columns=["org_or_new"])
    paraphrasings = paraphrasings[['text', 'label']]
    paraphrasings_plus_org = Dataset.from_dict(paraphrasings)


    # 60% train, 20% validation, 20% test
    train_test = dataset.train_test_split(test_size=0.4, seed=seed) 
    test_valid = train_test['test'].train_test_split(test_size=0.5, seed=seed)

    # combine train, test and valid to one dictionary
    dataset_splitted_dict = DatasetDict({
        'train': paraphrasings_plus_org,
        'valid': test_valid['train'],
        'test': test_valid['test']})
    
    print("Dataset splitted into train (60%), valid (20%) and test (20%)")

    # output the train dataset as a csv file
    #dataset_splitted_dict['train'].to_csv(os.path.join("..", "data", "train.csv"))

    # print the length of the train dataset
    print("Length of train dataset: ", len(dataset_splitted_dict['train']))
    print("Length of valid dataset: ", len(dataset_splitted_dict['valid']))
    print("Length of test dataset: ", len(dataset_splitted_dict['test']))

    print("")

    return dataset_splitted_dict

# Tokenize the dataset 
from transformers import AutoTokenizer
from datasets import ClassLabel

def tokenize_dataset(dataset, model_name="NbAiLab/nb-bert-large", max_length=128):
    # defining the labels
    labels_cl = ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'])

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # defining a function to tokenize the text and translate all labels into integers instead of strings
    def tokenize_function(example):
        tokens = tokenizer(example["text"], padding="max_length", truncation=True, max_length=max_length)
        tokens['label'] = labels_cl.str2int(example['label'])
        return tokens

    # actually tokenizing the dataset
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset['train'].column_names) # batched=True speeds up tokenization by allowing to process multiple lines at once


    print("Dataset tokenized")

    return tokenized_dataset

# evaluation metrics
import numpy as np
import evaluate

def compute_metrics(eval_pred):
    metric0 = evaluate.load("accuracy")
    metric1 = evaluate.load("precision")
    metric2 = evaluate.load("recall")
    metric3 = evaluate.load("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = metric0.compute(predictions=predictions, references=labels)["accuracy"]
    precision = metric1.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall = metric2.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    f1 = metric3.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [39]:
print("Loading and preparing dataset...")
dataset = load_and_prepare_dataset(path)
print(dataset)

print("Splitting dataset...")
dataset_splitted_dict = split_dataset(dataset, path2) #split_dataset(dataset, path2, path3)

print("Tokenizing dataset...")
tokenized_dataset = tokenize_dataset(dataset_splitted_dict)

print("Loading model (NbAiLab/nb-bert-large)...")

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("NbAiLab/nb-bert-large", num_labels=3)

Loading and preparing dataset...
label
negative    1525
neutral     1281
positive    1000
Name: count, dtype: int64
Dataset loaded and prepared
Dataset({
    features: ['text', 'label'],
    num_rows: 3572
})
Splitting dataset...
Dataset splitted into train (60%), valid (20%) and test (20%)
Length of train dataset:  3317
Length of valid dataset:  714
Length of test dataset:  715

Tokenizing dataset...


Map: 100%|██████████| 3317/3317 [00:00<00:00, 18946.47 examples/s]
Map: 100%|██████████| 714/714 [00:00<00:00, 20967.26 examples/s]
Map: 100%|██████████| 715/715 [00:00<00:00, 20884.62 examples/s]


Dataset tokenized
Loading model (NbAiLab/nb-bert-large)...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/nb-bert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
dataset_splitted_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4436
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 714
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 715
    })
})

Specifying training args

In [7]:
import numpy as np  
# count number of labels in each dataset
print(np.unique(dataset_splitted_dict['train']['label'],return_counts=True))
print(np.unique(dataset_splitted_dict['test']['label'],return_counts=True))
print(np.unique(dataset_splitted_dict['valid']['label'],return_counts=True))

(array(['negative', 'neutral', 'positive'], dtype='<U8'), array([1806, 1439, 1191]))
(array(['negative', 'neutral', 'positive'], dtype='<U8'), array([309, 221, 185]))
(array(['negative', 'neutral', 'positive'], dtype='<U8'), array([268, 252, 194]))


In [40]:
from transformers import TrainingArguments

batch_size = 8 # stating batch size
epochs = 4
learning_rate = 1e-5


training_args = TrainingArguments(output_dir="test_trainer",
                                  num_train_epochs=epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  learning_rate=learning_rate,
                                  weight_decay=0.01,
                                  logging_dir="logs",
                                  logging_steps=10,
                                  load_best_model_at_end=True,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",  # Add this line
                                  remove_unused_columns=False,
                                  run_name="test_trainer")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Initializing trainer

In [41]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [11]:
trainer.evaluate()

import tensorflow as tf

# creating model predictions for the validation data
predictions_val = trainer.predict(tokenized_dataset["valid"])

# choosing the prediction that has the highest probability 
preds_val_val = np.argmax(predictions_val.predictions, axis=-1)

# calculating the probabilities instead of logits from each
predictions_probabilities = tf.nn.softmax(predictions_val.predictions)

def compute_metrics_end(preds, refs):
    metric0 = evaluate.load("accuracy")
    metric1 = evaluate.load("precision")
    metric2 = evaluate.load("recall")
    metric3 = evaluate.load("f1")
    
    #logits, labels = eval_pred
    #predictions = np.argmax(logits, axis=-1)
    accuracy = metric0.compute(predictions=preds, references=refs)["accuracy"]
    precision = metric1.compute(predictions=preds, references=refs, average="weighted")["precision"]
    recall = metric2.compute(predictions=preds, references=refs, average="weighted")["recall"]
    f1 = metric3.compute(predictions=preds, references=refs, average="weighted")["f1"]
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

metrics_val = compute_metrics_end(preds=preds_val_val, refs=predictions_val.label_ids)

import tensorflow as tf

# creating model predictions for the validation data
predictions_test = trainer.predict(tokenized_dataset["test"])

# choosing the prediction that has the highest probability 
preds_test_test = np.argmax(predictions_test.predictions, axis=-1)

# calculating the probabilities instead of logits from each
predictions_probabilities_test = tf.nn.softmax(predictions_test.predictions)

metrics_test = compute_metrics_end(preds=preds_test_test, refs=predictions_test.label_ids)

print(metrics_test)
print(metrics_val)

100%|██████████| 90/90 [00:26<00:00,  3.45it/s]
100%|██████████| 90/90 [00:26<00:00,  3.37it/s]
100%|██████████| 90/90 [00:26<00:00,  3.44it/s]


{'accuracy': 0.7034965034965035, 'precision': 0.7208567419348564, 'recall': 0.7034965034965035, 'f1': 0.7066208658676242}
{'accuracy': 0.665266106442577, 'precision': 0.6759627052306597, 'recall': 0.665266106442577, 'f1': 0.6661282766130079}


In [12]:
import pandas as pd

data = {'Predicted Labels': ["negative" if i == 0 else "neutral" if i == 1 else "positive" for i in preds_val_val],
        'True Labels': ["negative" if i == 0 else "neutral" if i == 1 else "positive" for i in predictions_val.label_ids],
        'Misclassification': ["TRUE" if preds_val_val[i] == predictions_val.label_ids[i] else 'MISS' for i, val in enumerate(preds_val_val)],
        'Text': dataset_splitted_dict['valid']['text'],
        'Logit Values': [str(i) for i in predictions_val.predictions],
        'Probabilities': [str(i) for i in np.asarray(predictions_probabilities)]}
df = pd.DataFrame(data)


Unnamed: 0,Predicted Labels,True Labels,Misclassification,Text,Logit Values,Probabilities
0,negative,neutral,MISS,"Det er så ikke den, jeg var lidt for entusiast...",[ 0.9340587 -0.01499593 -1.1753395 ],[0.66294634 0.25663105 0.08042265]
1,neutral,negative,MISS,Åh gud... Har intet med det at gøre,[ 0.49472684 0.52091426 -1.059791 ],[0.44686255 0.45871928 0.09441815]
2,neutral,negative,MISS,"Det kan du sige, men med Artikel 13 bliver det...",[ 0.61225605 0.7817579 -0.8384904 ],[0.4133752 0.4897316 0.09689319]
3,positive,positive,TRUE,"Tak gode mand! Da jeg fik notifikationen, troe...",[-1.0158063 -0.6964614 2.1499164],[0.03834047 0.05276516 0.90889436]
4,negative,negative,TRUE,Den er særdeles troværdig. Viser sandheden om ...,[ 1.1367214 -0.47244602 -0.9655315 ],[0.7562952 0.15129997 0.09240479]
...,...,...,...,...,...,...
709,neutral,negative,MISS,Mere brug af GMO i landbruget kan ikke løse al...,[ 0.12570588 0.72970027 -1.148641 ],[0.32164422 0.5884197 0.08993609]
710,neutral,neutral,TRUE,Rusland fejrer femåret for annekteringen af Kr...,[-0.76456714 1.6279043 -0.502376 ],[0.07552715 0.8263046 0.09816829]
711,negative,negative,TRUE,"Det er minimeret til det ekstreme, så måske ik...",[ 1.3416474 -0.5490633 -1.1396143],[0.8099776 0.12227785 0.06774461]
712,neutral,neutral,TRUE,🇩🇰 #Superliga\nGrupo 1 Descenso\n29º Fecha\n40...,[-1.4751852 1.7833322 -0.67671984],[0.03420784 0.889778 0.07601419]


In [13]:
import pandas as pd
from sklearn.metrics import classification_report

# Extract the true and predicted labels
true_labels = df['True Labels']
predicted_labels = df['Predicted Labels']

# Create a mapping for the labels to numbers if needed
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

# Map the labels to numbers using the mapping
true_labels_mapped = true_labels.map(label_mapping)
predicted_labels_mapped = predicted_labels.map(label_mapping)

# Generate the classification report
report = classification_report(true_labels_mapped, predicted_labels_mapped, target_names=label_mapping.keys())

# Print the classification report
print(report)

              precision    recall  f1-score   support

    negative       0.72      0.70      0.71       268
     neutral       0.58      0.70      0.63       252
    positive       0.73      0.58      0.65       194

    accuracy                           0.67       714
   macro avg       0.68      0.66      0.66       714
weighted avg       0.68      0.67      0.67       714



# training loop