In [35]:
from datasets import load_dataset
import os


dataset = load_dataset("DDSC/europarl")
path1 = os.path.join("..", "data", "train_paraphrased_with_gpt4_HF_df.csv")

In [36]:
import pandas as pd

pd.read_csv(path1)

df1 = pd.read_csv(path1)
df2 = df1[["text", "label"]]
df3 = df1[["paraphrased_text", "label"]]
# rename paraphrased_text to text
df3 = df3.rename(columns={"paraphrased_text": "text"})

df4 = pd.concat([df2, df3])

df4

Unnamed: 0,text,label
0,"Det er svært at sige, hvordan det frie valg vi...",neutral
1,"Jeg synes stadig, det er lidt urimeligt, men d...",positive
2,"Gennemsigtighed, klarhed, kontrol og eksempler...",neutral
3,Dengang var Socialdemokraterne den største gru...,neutral
4,Lissabontraktaten er et kompromisresultat mell...,neutral
...,...,...
569,"Der er ingen udelukkede muligheder, men det er...",neutral
570,"Jeg er ikke sikker på, om jeg er så imponeret ...",negative
571,"Det er mærkeligt, men det er udelukkende op ti...",neutral
572,Den 24. november afholdt Kommissionen et møde ...,neutral


### Functions

In [37]:
import pandas as pd
import os

# dataset dictionary to pandas dataframe
df = pd.DataFrame(dataset["train"])
df1 = pd.DataFrame(dataset["test"])

# concatenate train and test dataframes
df_HF = pd.concat([df, df1], ignore_index=True)

# rename negativ to negative
df_HF = df_HF.replace(to_replace="negativ", value="negative")
# rename positiv to positive
df_HF = df_HF.replace(to_replace="positiv", value="positive")

df_HF

Unnamed: 0,text,label
0,Næste punkt på dagsordenen er forhandling unde...,neutral
1,"Vi mener, at eftersom får og geder var omfatte...",positive
2,I tillæg dertil er der stadig indblanding fra ...,negative
3,"Deres indflydelse på de beslutninger, der træf...",neutral
4,"Hvis De venligst kan bringe orden i det, fru f...",neutral
...,...,...
952,Vi har nu arbejdet sammen med alle grupper og ...,neutral
953,Derfor skal der være kollektiv finansiering på...,neutral
954,Derfor er det igen vigtigt for planlægning og ...,neutral
955,Jo mere EU forbedrer kvaliteten af uddannelse ...,positive


In [38]:
os.environ["WANDB_DISABLED"] = "true"

In [39]:
from datasets import Dataset
import pandas as pd

# Load and preprocess the dataset
def load_and_prepare_dataset(file):
    # Load the dataset
    dataset = file
    # Convert to dict and then to a Hugging Face Dataset
    dataset = Dataset.from_dict(dataset)
    print("Dataset loaded and prepared")
    return dataset#, dataset_pd

# Split the dataset and convert into a Hugging Face DatasetDict
from datasets import DatasetDict

def split_dataset(dataset, dataset_2, seed=42): #def split_dataset(dataset, path_to_df_train, path_to_df_train_2, seed=42):
    
    # load paraphrasings dataset
    paraphrasings = dataset_2
    # concatenate the paraphrasings dataset with the original dataset
    #paraphrasings = pd.read_csv(path_to_df_train)
    #paraphrasings2 = pd.read_csv(path_to_df_train_2)

    #paraphrasings = pd.concat([paraphrasings, paraphrasings2])

    # remove rows where org_or_new == 1 - removing original tweets
    #paraphrasings = paraphrasings[paraphrasings['org_or_new'] == 0]
    #print("Number of paraphrasings: ", len(paraphrasings))

    #paraphrasings = paraphrasings.rename(columns={"New":"text"})
    #paraphrasings = paraphrasings.drop(columns=["org_or_new"])
    #paraphrasings = paraphrasings[['paraphrased_text', 'label']]

    # rename paraphrased_text to text
    #paraphrasings = paraphrasings.rename(columns={"paraphrased_text":"text"})

    # remove duplicates
    #paraphrasings = paraphrasings.drop_duplicates()
    #print("Number of paraphrasings after removing duplicates: ", len(paraphrasings))

    paraphrasings_plus_org = Dataset.from_dict(paraphrasings)


    # 60% train, 20% validation, 20% test
    train_test = dataset.train_test_split(test_size=0.4, seed=seed) 
    test_valid = train_test['test'].train_test_split(test_size=0.5, seed=seed)

    # combine train, test and valid to one dictionary
    dataset_splitted_dict = DatasetDict({
        'train': paraphrasings_plus_org,
        'valid': test_valid['train'],
        'test': test_valid['test']})
    
    print("Dataset splitted into train (60%), valid (20%) and test (20%)")

    # output the train dataset as a csv file
    dataset_splitted_dict['train'].to_csv(os.path.join("..", "data", "train_HF_df.csv"))

    # print the length of the train dataset
    print("Length of train dataset: ", len(dataset_splitted_dict['train']))
    print("Length of valid dataset: ", len(dataset_splitted_dict['valid']))
    print("Length of test dataset: ", len(dataset_splitted_dict['test']))

    print("")

    return dataset_splitted_dict

# Tokenize the dataset 
from transformers import AutoTokenizer
from datasets import ClassLabel

def tokenize_dataset(dataset, model_name="NbAiLab/nb-bert-large", max_length=128):
    # defining the labels
    labels_cl = ClassLabel(num_classes=3, names=['negative', 'neutral', 'positive'])

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # defining a function to tokenize the text and translate all labels into integers instead of strings
    def tokenize_function(example):
        tokens = tokenizer(example["text"], padding="max_length", truncation=True, max_length=max_length)
        tokens['label'] = labels_cl.str2int(example['label'])
        return tokens

    # actually tokenizing the dataset
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset['train'].column_names) # batched=True speeds up tokenization by allowing to process multiple lines at once


    print("Dataset tokenized")

    return tokenized_dataset

# evaluation metrics
import numpy as np
import evaluate

def compute_metrics(eval_pred):
    metric0 = evaluate.load("accuracy")
    metric1 = evaluate.load("precision")
    metric2 = evaluate.load("recall")
    metric3 = evaluate.load("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = metric0.compute(predictions=predictions, references=labels)["accuracy"]
    precision = metric1.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall = metric2.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    f1 = metric3.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [40]:
print("Loading and preparing dataset...")
dataset = load_and_prepare_dataset(df_HF)
print(dataset)

print("Splitting dataset...")
dataset_splitted_dict = split_dataset(dataset, df4) #split_dataset(dataset, path2, path3)

print("Tokenizing dataset...")
tokenized_dataset = tokenize_dataset(dataset_splitted_dict)

print("Loading model (NbAiLab/nb-bert-large)...")

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("NbAiLab/nb-bert-large", num_labels=3)

Loading and preparing dataset...
Dataset loaded and prepared
Dataset({
    features: ['text', 'label'],
    num_rows: 957
})
Splitting dataset...
Dataset splitted into train (60%), valid (20%) and test (20%)



Creating CSV from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 242.72ba/s]

Length of train dataset:  1148
Length of valid dataset:  191
Length of test dataset:  192

Tokenizing dataset...




Map: 100%|██████████| 1148/1148 [00:00<00:00, 22908.24 examples/s]

Map: 100%|██████████| 191/191 [00:00<00:00, 17414.34 examples/s]

Map: 100%|██████████| 192/192 [00:00<00:00, 16892.65 examples/s]


Dataset tokenized
Loading model (NbAiLab/nb-bert-large)...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/nb-bert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Specifying training args

In [41]:
import numpy as np  
# count number of labels in each dataset
print(np.unique(dataset_splitted_dict['train']['label'],return_counts=True))
print(np.unique(dataset_splitted_dict['test']['label'],return_counts=True))
print(np.unique(dataset_splitted_dict['valid']['label'],return_counts=True))

(array(['negative', 'neutral', 'positive'], dtype='<U8'), array([268, 592, 288]))
(array(['negative', 'neutral', 'positive'], dtype='<U8'), array([37, 96, 59]))
(array(['negative', 'neutral', 'positive'], dtype='<U8'), array([ 44, 104,  43]))


In [42]:
from transformers import TrainingArguments

batch_size = 8 # stating batch size
epochs = 4
learning_rate = 1e-5


training_args = TrainingArguments(output_dir="test_trainer",
                                  num_train_epochs=epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  learning_rate=learning_rate,
                                  weight_decay=0.01,
                                  logging_dir="logs",
                                  logging_steps=10,
                                  load_best_model_at_end=True,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",  # Add this line
                                  remove_unused_columns=False,
                                  run_name="test_trainer")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Initializing trainer

In [43]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics
)

# Training loop

In [44]:
for i in range(10):    
    trainer.train()

    trainer.evaluate()

    import tensorflow as tf

    # creating model predictions for the validation data
    predictions_val = trainer.predict(tokenized_dataset["valid"])

    # choosing the prediction that has the highest probability 
    preds_val_val = np.argmax(predictions_val.predictions, axis=-1)

    # calculating the probabilities instead of logits from each
    predictions_probabilities = tf.nn.softmax(predictions_val.predictions)

    def compute_metrics_end(preds, refs):
        metric0 = evaluate.load("accuracy")
        metric1 = evaluate.load("precision")
        metric2 = evaluate.load("recall")
        metric3 = evaluate.load("f1")
        
        #logits, labels = eval_pred
        #predictions = np.argmax(logits, axis=-1)
        accuracy = metric0.compute(predictions=preds, references=refs)["accuracy"]
        precision = metric1.compute(predictions=preds, references=refs, average="weighted")["precision"]
        recall = metric2.compute(predictions=preds, references=refs, average="weighted")["recall"]
        f1 = metric3.compute(predictions=preds, references=refs, average="weighted")["f1"]
        return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

    metrics_val = compute_metrics_end(preds=preds_val_val, refs=predictions_val.label_ids)

    import tensorflow as tf

    # creating model predictions for the validation data
    predictions_test = trainer.predict(tokenized_dataset["test"])

    # choosing the prediction that has the highest probability 
    preds_test_test = np.argmax(predictions_test.predictions, axis=-1)

    # calculating the probabilities instead of logits from each
    predictions_probabilities_test = tf.nn.softmax(predictions_test.predictions)

    metrics_test = compute_metrics_end(preds=preds_test_test, refs=predictions_test.label_ids)

    print(metrics_test)
    print(metrics_val)

    import pandas as pd

    data = {'Predicted Labels': ["negative" if i == 0 else "neutral" if i == 1 else "positive" for i in preds_val_val],
            'True Labels': ["negative" if i == 0 else "neutral" if i == 1 else "positive" for i in predictions_val.label_ids],
            'Misclassification': ["TRUE" if preds_val_val[i] == predictions_val.label_ids[i] else 'MISS' for i, val in enumerate(preds_val_val)],
            'Text': dataset_splitted_dict['valid']['text'],
            'Logit Values': [str(i) for i in predictions_val.predictions],
            'Probabilities': [str(i) for i in np.asarray(predictions_probabilities)]}
    df = pd.DataFrame(data)



    import pandas as pd
    from sklearn.metrics import classification_report

    # Extract the true and predicted labels
    true_labels = df['True Labels']
    predicted_labels = df['Predicted Labels']

    # Create a mapping for the labels to numbers if needed
    label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

    # Map the labels to numbers using the mapping
    true_labels_mapped = true_labels.map(label_mapping)
    predicted_labels_mapped = predicted_labels.map(label_mapping)

    # Generate the classification report
    report = classification_report(true_labels_mapped, predicted_labels_mapped, target_names=label_mapping.keys(), output_dict=True)


    # save classification report to csv
    df = pd.DataFrame(report).transpose()
    df.to_csv(f"../classification_reports_HF/{i+1}classification_report_1x_gpt4paraphrased_plus_org.csv")

100%|██████████| 4/4 [2:53:19<00:00, 2599.86s/it]
  2%|▏         | 10/576 [01:34<1:17:03,  8.17s/it]