In [2]:
!pip install datasets transformers
!pip install accelerate

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [3]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import pipeline
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup

GBERT2: Fine-tuning gbert-base on the German politicians Twitter sentiment dataset and the GermEval 2017 dataset

In [4]:
# Download both sentiment corpora from the Hugging Face Hub.
# Each one is returned as a DatasetDict with a "train" and a "test" split.
twitter_dataset = load_dataset("Alienmaster/german_politicians_twitter_sentiment")
germeval_17_dataset = load_dataset("akash418/germeval_2017")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/326k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/81.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1428 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/357 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/6.23M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/1.54M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19432 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2566 [00:00<?, ? examples/s]

In [5]:
# Show the splits, column names and row counts of the Twitter corpus.
print(twitter_dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'majority_sentiment', 'text'],
        num_rows: 1428
    })
    test: Dataset({
        features: ['ID', 'majority_sentiment', 'text'],
        num_rows: 357
    })
})


In [6]:
# Show the splits, column names and row counts of the GermEval 2017 corpus.
print(germeval_17_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'relevance', 'sentiment'],
        num_rows: 19432
    })
    test: Dataset({
        features: ['id', 'text', 'relevance', 'sentiment'],
        num_rows: 2566
    })
})


In [7]:
# Lookup tables used to build a common numeric "label" column for both corpora.
# Shared target scheme: 0 = positive, 1 = negative, 2 = neutral.
#   label_map_1: numeric sentiment codes 1/2/3 (Twitter `majority_sentiment`)
#   label_map_2: sentiment names (GermEval 2017 `sentiment`)
label_map_1 = {code: code - 1 for code in (1, 2, 3)}
label_map_2 = dict(negative=1, neutral=2, positive=0)

In [8]:
def _twitter_label(example):
    """Translate one Twitter example's `majority_sentiment` code into the shared label id."""
    return {"label": label_map_1[example["majority_sentiment"]]}


# Add the "label" column to every split of the Twitter corpus.
twitter_dataset = twitter_dataset.map(_twitter_label)

Map:   0%|          | 0/1428 [00:00<?, ? examples/s]

Map:   0%|          | 0/357 [00:00<?, ? examples/s]

In [9]:
def _germeval_label(example):
    """Translate one GermEval example's `sentiment` name into the shared label id."""
    return {"label": label_map_2[example["sentiment"]]}


# Add the "label" column to every split of the GermEval 2017 corpus.
germeval_17_dataset = germeval_17_dataset.map(_germeval_label)

Map:   0%|          | 0/19432 [00:00<?, ? examples/s]

Map:   0%|          | 0/2566 [00:00<?, ? examples/s]

In [10]:
# Drop the columns that are not needed for training,
# leaving only "text" and "label" in the Twitter corpus.
twitter_dataset = twitter_dataset.remove_columns(["ID", "majority_sentiment"])

In [11]:
# Same clean-up for GermEval 2017: keep only "text" and "label".
germeval_17_dataset = germeval_17_dataset.remove_columns(["id", "relevance", "sentiment"])

In [12]:
# Check that both corpora now expose the same two columns ("text", "label"),
# which is required before they can be concatenated.
print(twitter_dataset)
print(germeval_17_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1428
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 357
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 19432
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2566
    })
})


In [13]:
from datasets import concatenate_datasets

In [14]:
# Stack the two corpora split by split: Twitter rows first, GermEval 2017 rows after.
# The train and test splits are kept separate.
combined_train, combined_test = (
    concatenate_datasets([twitter_dataset[part], germeval_17_dataset[part]])
    for part in ("train", "test")
)

In [15]:
# Plain dict keyed by split name, holding the merged train and test sets.
combined_dataset = dict(train=combined_train, test=combined_test)

In [16]:
# Hub id of the base checkpoint to fine-tune; the tokenizer is loaded from the same id.
model_name = "deepset/gbert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/362 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

In [17]:
#tokenize data
def tokenize_function(examples):
    """Tokenize one batch of examples for the classifier.

    Args:
        examples: Batch mapping, as passed by ``Dataset.map(..., batched=True)``,
            whose ``"text"`` entry is a list of raw inputs.

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens
        per example and left unpadded.
    """
    # Coerce every entry to `str` so non-string values do not break the tokenizer.
    # A new list is built; the incoming batch is not modified.
    texts = [str(text) for text in examples["text"]]
    # No padding here: padding at map time pads every example to the longest
    # sequence of its map batch and stores that padding in the dataset. Padding
    # is left to the data collator, which pads each training batch only to the
    # longest sequence in that batch.
    return tokenizer(texts, truncation=True, max_length=128)

In [18]:
# Tokenize both splits in batched mode; the result stays a plain dict keyed by split name.
tokenized_datasets = {
    "train": combined_dataset["train"].map(tokenize_function, batched=True),
    "test": combined_dataset["test"].map(tokenize_function, batched=True),
}

Map:   0%|          | 0/20860 [00:00<?, ? examples/s]

Map:   0%|          | 0/2923 [00:00<?, ? examples/s]

In [19]:
# Make both splits return PyTorch tensors, restricted to the columns the model consumes.
_MODEL_COLUMNS = ["input_ids", "attention_mask", "label"]
for _split_name in ("train", "test"):
    tokenized_datasets[_split_name].set_format(type="torch", columns=_MODEL_COLUMNS)

In [20]:
# Load the pretrained encoder with a new 3-way classification head.
# The label names are stored in the model config so that the saved checkpoint
# reports "positive"/"negative"/"neutral" instead of generic LABEL_<n> names.
# The ids follow the scheme used to build the "label" column:
# 0 = positive, 1 = negative, 2 = neutral.
id2label = {0: "positive", 1: "negative", 2: "neutral"}
label2id = {name: idx for idx, name in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Collator that pads the examples of a batch using the tokenizer's padding
# settings and returns PyTorch tensors ("pt").
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [22]:
# Metric callback handed to the Trainer.
def compute_metrics(pred):
    """Score a set of predictions against the gold labels.

    `pred` must expose `label_ids` (gold class ids) and `predictions`
    (per-class scores). The predicted class is the arg-max over the last axis.

    Returns a dict with the keys "precision", "recall", "acc" and "f1";
    precision, recall and F1 are macro-averaged over the classes.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "precision": precision_score(y_true, y_pred, average="macro"),
        "recall": recall_score(y_true, y_pred, average="macro"),
        "acc": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="macro"),
    }

In [23]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
batch_size = 16
# Number of optimizer steps in one epoch. Only consulted by the Trainer when
# logging_strategy="steps"; with logging_strategy="no" it has no effect.
logging_steps = len(tokenized_datasets["train"]) // batch_size
# Checkpoint directory, derived from the base checkpoint name. It gets its own
# variable (instead of being written back to `model_name`) so that this cell is
# idempotent and `model_name` keeps pointing at the base checkpoint.
output_dir = f"{model_name}-finetuned-twitter_germeval_17"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=4,
    learning_rate=2e-5,
    logging_dir=None,
    logging_strategy="no",
    # The string "none" switches off every reporting integration. Python `None`
    # selects the library default, which enabled Weights & Biases in this run.
    report_to="none",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # No evaluation-strategy argument is passed: "no" (no evaluation during
    # training) is the default, and the argument's name differs between
    # transformers releases.
    disable_tqdm=False,
    logging_steps=logging_steps,
    log_level="info",
)



In [24]:
# Bundle model, hyper-parameters, training data, collator and metric callback.
# No evaluation dataset is registered here; the test split is passed explicitly
# to `trainer.evaluate(...)` later on.
# NOTE(review): the truncated warning in this cell's output presumably concerns
# the `tokenizer=` argument — confirm against the installed transformers version.
trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [25]:
# Initialise Weights & Biases in "disabled" mode before training starts.
# NOTE(review): presumably here so the Trainer's W&B integration does not ask
# for credentials during training — confirm it is still needed.
import wandb
wandb.init(mode="disabled")

In [26]:
# Run the fine-tuning loop with the settings from `training_args`.
# The returned TrainOutput (step count, training loss, runtime stats) is shown as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20,860
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5,216
  Number of trainable parameters = 109,929,987
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss


Saving model checkpoint to deepset/gbert-base-finetuned-twitter_germeval_17/checkpoint-500
Configuration saved in deepset/gbert-base-finetuned-twitter_germeval_17/checkpoint-500/config.json
Model weights saved in deepset/gbert-base-finetuned-twitter_germeval_17/checkpoint-500/model.safetensors
tokenizer config file saved in deepset/gbert-base-finetuned-twitter_germeval_17/checkpoint-500/tokenizer_config.json
Special tokens file saved in deepset/gbert-base-finetuned-twitter_germeval_17/checkpoint-500/special_tokens_map.json
Saving model checkpoint to deepset/gbert-base-finetuned-twitter_germeval_17/checkpoint-1000
Configuration saved in deepset/gbert-base-finetuned-twitter_germeval_17/checkpoint-1000/config.json
Model weights saved in deepset/gbert-base-finetuned-twitter_germeval_17/checkpoint-1000/model.safetensors
tokenizer config file saved in deepset/gbert-base-finetuned-twitter_germeval_17/checkpoint-1000/tokenizer_config.json
Special tokens file saved in deepset/gbert-base-finetun

TrainOutput(global_step=5216, training_loss=0.29757011741216927, metrics={'train_runtime': 1974.6759, 'train_samples_per_second': 42.255, 'train_steps_per_second': 2.641, 'total_flos': 5488545893806080.0, 'train_loss': 0.29757011741216927, 'epoch': 4.0})

In [27]:
# Evaluate the fine-tuned model on the combined test split.
# The returned dict holds the evaluation loss, the scores from `compute_metrics`
# (prefixed with "eval_") and runtime statistics.
eval_results = trainer.evaluate(tokenized_datasets["test"])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2923
  Batch size = 16


In [28]:
# Wrap the metrics dict in a one-row DataFrame (one column per metric) for tabular display.
eval_results_df = pd.DataFrame([eval_results])

In [29]:
# Print the evaluation metrics as plain text.
print(eval_results_df)

   eval_loss  eval_precision  eval_recall  eval_acc   eval_f1  eval_runtime  \
0    0.84022        0.724311      0.72776  0.799179  0.725919       20.7782   

   eval_samples_per_second  eval_steps_per_second  epoch  
0                  140.676                  8.807    4.0  


In [30]:
# Save the fine-tuned model weights/config and the tokenizer files into one
# directory, so the pair can be reloaded together from `model_path`.
# NOTE(review): the "&" in the directory name must be quoted in shell commands —
# consider a name without it.
model_path = "/content/gbert_finetuned_twitter&germeval17"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Configuration saved in /content/gbert_finetuned_twitter&germeval17/config.json
Model weights saved in /content/gbert_finetuned_twitter&germeval17/model.safetensors
tokenizer config file saved in /content/gbert_finetuned_twitter&germeval17/tokenizer_config.json
Special tokens file saved in /content/gbert_finetuned_twitter&germeval17/special_tokens_map.json


('/content/gbert_finetuned_twitter&germeval17/tokenizer_config.json',
 '/content/gbert_finetuned_twitter&germeval17/special_tokens_map.json',
 '/content/gbert_finetuned_twitter&germeval17/vocab.txt',
 '/content/gbert_finetuned_twitter&germeval17/added_tokens.json',
 '/content/gbert_finetuned_twitter&germeval17/tokenizer.json')

In [31]:
import shutil
from google.colab import files

In [32]:
# Zip the saved model directory; the archive is written to "<model_path>.zip"
# and its path is shown as the cell output.
# `model_path` is reused instead of repeating the literal path, so the saved
# directory and the archive cannot drift apart.
shutil.make_archive(model_path, "zip", model_path)

'/content/gbert_finetuned_twitter&germeval17.zip'

In [33]:
# Trigger a browser download of the zipped model from the Colab runtime.
# The archive path is derived from `model_path` rather than hard-coded a second time.
files.download(f"{model_path}.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>