In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import numpy as np
from datasets import load_metric, load_dataset, Dataset
import pandas as pd
import os
import torch

In [2]:
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches
# synchronously so device-side errors are reported at the call that caused them.
# NOTE(review): this slows training noticeably — presumably left over from a
# debugging session; confirm before using this notebook for a real run.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
# Directory holding the processed CSV files. Defaults to the original author's
# location but can be overridden without editing the notebook by exporting
# ROBUSTLY_SENTIMENTAL_DATA_DIR before starting the kernel.
DATA_DIR = os.environ.get(
    "ROBUSTLY_SENTIMENTAL_DATA_DIR",
    "/home/hilhag/prjs/robustly-sentimental/data/processed",
)

# df1: ABSA data ("absa-imm.csv"); df2: Twitter data ("twitter-data.csv").
df1 = pd.read_csv(os.path.join(DATA_DIR, "absa-imm.csv"))
df2 = pd.read_csv(os.path.join(DATA_DIR, "twitter-data.csv"))

In [6]:
# Keep only the "text" and "label" columns of both frames; everything else
# (e.g. the id column) is discarded.
df1, df2 = (frame[["text", "label"]] for frame in (df1, df2))

In [15]:
# Sanity check: number of rows in df2 (the frame read from twitter-data.csv).
len(df2)

12082

In [27]:
# Per-dataset sampling fractions, used as `frac` in DataFrame.sample when the
# two frames are combined. The intent is to weight the datasets by quality;
# NOTE(review): both are currently 0.5, so the datasets are weighted equally
# and half of each one is kept.
weight1 = 0.5
weight2 = 0.5

In [28]:
# shuffle the datasets
#df1 = shuffle(df1)
#df2 = shuffle(df2)

In [29]:
# Combine the datasets: each frame is randomly down-sampled to its weight
# fraction and the two samples are stacked with a fresh 0..n-1 index.
# A fixed random_state makes the drawn subset identical on every run, so the
# training data (and therefore the results) are reproducible.
combined_dataset = pd.concat(
    [
        df1.sample(frac=weight1, random_state=42),
        df2.sample(frac=weight2, random_state=42),
    ],
    ignore_index=True,
)

In [30]:
# Discard every row that has a missing value in any column.
combined_dataset = combined_dataset.dropna(axis=0, how="any")

In [34]:
#len(combined_dataset)

In [33]:
# Convert the pandas frame into a Hugging Face Dataset. preserve_index=False
# stops the pandas index (non-contiguous once rows have been dropped) from
# being carried over as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(combined_dataset, preserve_index=False)

# Hold out 10% of the rows for evaluation; the fixed seed makes the split
# reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [35]:
# Load the KB Swedish BERT checkpoint with a 3-class sequence-classification
# head (the load warning shown under this cell reports the weights that are
# not taken from the checkpoint).
model = AutoModelForSequenceClassification.from_pretrained(
    "KB/bert-base-swedish-cased",
    num_labels=3,
)

Some weights of the model checkpoint at KB/bert-base-swedish-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at KB/bert-base-swedi

In [36]:
def preprocess_function(examples):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings
            (a batch of them when called through
            ``Dataset.map(..., batched=True)``).

    Returns:
        The output of the module-level ``tokenizer`` for those texts, with
        each sequence truncated to at most 512 tokens.

    Note:
        No padding is applied here on purpose. Sequences keep their natural
        length so that the ``DataCollatorWithPadding`` handed to the Trainer
        can pad each batch to its longest member, instead of every example
        being padded to 512 tokens up front.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

In [37]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        dict with two entries: ``"accuracy"`` and ``"f1"``.
    """
    accuracy_metric = load_metric("accuracy")
    f1_metric = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(
        predictions=predictions, references=labels
    )["accuracy"]
    # The F1 metric defaults to average="binary", which is rejected for
    # targets with more than two classes. The model in this notebook is built
    # with num_labels=3, so an explicit multiclass average is required;
    # "macro" weights every class equally.
    f1 = f1_metric.compute(
        predictions=predictions, references=labels, average="macro"
    )["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [40]:
# Tokenizer matching the model checkpoint, capped at 512 tokens per sequence.
tokenizer = AutoTokenizer.from_pretrained(
    "KB/bert-base-swedish-cased",
    model_max_length=512,
)

In [43]:
# Collator that pads the examples of each batch using the tokenizer above.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [48]:
# Tokenize the train and test splits in batches with preprocess_function.
tokenized_train, tokenized_test = (
    dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ("train", "test")
)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [49]:
# Directory name passed as `output_dir` to TrainingArguments; the per-epoch
# checkpoints in the training log are written underneath it.
repo_name = "debug-test"

In [50]:
# Hyper-parameters for the fine-tuning run.
# NOTE(review): no evaluation strategy is configured, and the training log
# only reports training loss — presumably eval_dataset / compute_metrics are
# exercised only by an explicit trainer.evaluate() call; confirm.
training_args = TrainingArguments(
    # where checkpoints go, and how often
    output_dir=repo_name,
    save_strategy="epoch",
    push_to_hub=False,
    # optimisation settings
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

In [51]:
# Wire model, data, tokenization helpers and metrics into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # data
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # batching / tokenization
    data_collator=data_collator,
    tokenizer=tokenizer,
    # evaluation metrics
    compute_metrics=compute_metrics,
)

In [52]:
# Run fine-tuning; per-epoch checkpoints land under the Trainer's output_dir.
trainer.train()
# Save the final model to "test-debug" (the log shows config, weights and
# tokenizer files being written there). Note this is a different directory
# from the "debug-test" checkpoint directory.
trainer.save_model("test-debug")

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7629
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3816
  Number of trainable parameters = 124693251
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.8746
1000,0.7813
1500,0.7831
2000,0.6929
2500,0.5592
3000,0.5689
3500,0.5692


Saving model checkpoint to debug-test/checkpoint-1908
Configuration saved in debug-test/checkpoint-1908/config.json
Model weights saved in debug-test/checkpoint-1908/pytorch_model.bin
tokenizer config file saved in debug-test/checkpoint-1908/tokenizer_config.json
Special tokens file saved in debug-test/checkpoint-1908/special_tokens_map.json
Saving model checkpoint to debug-test/checkpoint-3816
Configuration saved in debug-test/checkpoint-3816/config.json
Model weights saved in debug-test/checkpoint-3816/pytorch_model.bin
tokenizer config file saved in debug-test/checkpoint-3816/tokenizer_config.json
Special tokens file saved in debug-test/checkpoint-3816/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to test-debug
Configuration saved in test-debug/config.json
Model weights saved in test-debug/pytorch_model.bin
tokenizer config file saved in test-debug/tokenizer_config.json
Special tokens file saved 