In [None]:
# Standard-library imports needed for the environment setup below.
import importlib
import importlib.util
import os
import subprocess
import sys

# Allocator setting that was recommended during training. It is set before
# torch is imported so that it is in place before any CUDA memory is allocated.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from google.colab import drive
drive.mount('/content/drive')

# Adjust this path to wherever the data files are stored.
path = '/content/drive/MyDrive/techlabs/Github/Daten/'

# Install third-party packages that are missing from the runtime. Without this
# step a fresh kernel stops at `import datasets` with a ModuleNotFoundError.
# Maps importable module name -> pip requirement.
_pip_requirements = {
    "datasets": "datasets",
    "evaluate": "evaluate",
    "transformers": "transformers[sentencepiece]",
}
_missing_requirements = [
    requirement
    for module_name, requirement in _pip_requirements.items()
    if importlib.util.find_spec(module_name) is None
]
if _missing_requirements:
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "-q", *_missing_requirements]
    )
    # Make the freshly installed packages visible to the running interpreter.
    importlib.invalidate_caches()

## Load all required packages
import pandas as pd
import torch

import datasets
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, get_scheduler

# Optional and not used below: `pip install accelerate`, then
# `from accelerate import Accelerator`.


Mounted at /content/drive


ModuleNotFoundError: No module named 'datasets'

In [None]:
# Which variant should be trained: with ("yes") or without ("no") party info?
with_party_info = ["no", "yes"][1]

# Per variant: (training parquet, validation parquet, name of the saved model).
# The test splits ("df_test.parquet" / "df_test_np.parquet") are not loaded here.
# For sequential training, use "second_GottBERT_model" / "second_GottBERT_model_np"
# as the model name and set previous_model_name to "GottBERT_model" /
# "GottBERT_model_np" respectively.
variant_files = {
    "yes": ("df_train2.parquet", "df_val.parquet", "GottBERT_model"),
    "no": ("df_train2_np.parquet", "df_val_np.parquet", "GottBERT_model_np"),
}

# Anything other than "yes" selects the variant without party information.
variant_key = "yes" if with_party_info == "yes" else "no"
train_file, val_file, variant_model_name = variant_files[variant_key]

# Load the data sets and fix the name under which the model will be stored.
train_df = pd.read_parquet(path + train_file)
val_df = pd.read_parquet(path + val_file)
model_name = variant_model_name


In [None]:
# Sanity check that loading worked: first and last rows of the training frame,
# followed by the last rows of the validation frame.
print(train_df.head(), train_df.tail(), val_df.tail(), sep="\n")

In [None]:
# Convert the pandas frames into the dataset format used for training.
from datasets import Dataset, DatasetDict

# Only the text column and the label column are carried over.
model_columns = ['Rede', 'labels']

ds_training = Dataset.from_pandas(train_df[model_columns])
ds_val = Dataset.from_pandas(val_df[model_columns])
# A test split could be converted the same way from the test frame.

# Bundle both splits under the keys used by the following cells.
split_datasets = {'train': ds_training, 'validation': ds_val}
dataset = DatasetDict(split_datasets)

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint.
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

# Checkpoint identifier; "deepset/gbert-base" was noted as an alternative.
checkpoint = 'TUM/GottBERT_base_best'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [None]:
# Try the tokenizer on a single speech (second row of the training split).
test = dataset["train"][1]["Rede"]
inputs = tokenizer(test)

# Show the tokens behind the ids; this is the value the cell displays.
token_ids = inputs["input_ids"]
tokenizer.convert_ids_to_tokens(token_ids)

In [None]:
# Define the tokenization function and apply it to the whole dataset.

def tokenize_function(example):
    """Tokenize the "Rede" texts of a batch, truncated to at most 512 tokens.

    No padding is applied here: padding every example to 512 tokens up front
    stores mostly padding and makes the data collator below redundant.
    Padding is left to ``data_collator``, which pads each batch only to the
    length of its longest sequence.

    Args:
        example: a batch from ``dataset.map(..., batched=True)``; its "Rede"
            entry holds the texts to tokenize.

    Returns:
        The tokenizer output for the given texts.
    """
    return tokenizer(example["Rede"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Pads every batch dynamically when the DataLoader collates it.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# Remove the columns the model does not consume. '__index_level_0__' only
# exists when the source frame had a non-default index, and neither column
# exists any more when this cell is re-run, so only columns that are actually
# present in a split are dropped (removing a missing column raises).
columns_to_drop = ('Rede', '__index_level_0__')
for split_name in list(tokenized_dataset.keys()):
    split = tokenized_dataset[split_name]
    present_columns = [c for c in columns_to_drop if c in split.column_names]
    if present_columns:
        tokenized_dataset[split_name] = split.remove_columns(present_columns)

# Set the format of the datasets so they return PyTorch tensors instead of lists.
tokenized_dataset.set_format("torch")

# Wrap both splits in DataLoaders.
from torch.utils.data import DataLoader

# Options shared by both loaders: 40 examples per batch, collated by data_collator.
loader_options = dict(batch_size=40, collate_fn=data_collator)

# Only the training data is shuffled.
train_dataloader = DataLoader(tokenized_dataset["train"], shuffle=True, **loader_options)
eval_dataloader = DataLoader(tokenized_dataset["validation"], **loader_options)

In [None]:
# Important intermediate step: drop data objects that are no longer needed so
# that RAM is freed before training.
import gc

del train_df, val_df, ds_val, ds_training, dataset

# Trigger a collection; the return value is what the cell displays.
gc.collect()

62

In [None]:
# Check the format: fetch one training batch and show the shape of each tensor.
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([40]),
 'input_ids': torch.Size([40, 512]),
 'attention_mask': torch.Size([40, 512])}

In [None]:
# Load the GottBERT checkpoint with a sequence-classification head.
from transformers import AutoModelForSequenceClassification

# The classification head is created with six output labels.
classifier_options = {"num_labels": 6}
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, **classifier_options)

# Optional (sequential training): load the weights of the previously trained model
#model.load_state_dict(torch.load(path + previous_model_name + ".pth"))

# Optional quick check of a forward pass on one batch
#outputs = model(**batch)
#print(outputs.loss, outputs.logits.shape)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Choose the training hyper-parameters.
from torch.optim import AdamW
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

# Total number of optimizer steps: one per batch, for every epoch.
num_epochs = 3
steps_per_epoch = len(train_dataloader)
num_training_steps = num_epochs * steps_per_epoch

# Linear learning-rate schedule without warm-up, spanning all training steps.
scheduler_options = dict(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
lr_scheduler = get_scheduler("linear", **scheduler_options)

print(num_training_steps)


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
import torch

device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

model.to(device)
print(device)

cuda


In [None]:
# Train the model: num_epochs passes over train_dataloader, then save the weights.
from tqdm.auto import tqdm

# Progress bar sized to the total number of optimizer steps; advanced once per batch.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
       # move the batch tensors to the selected device
       batch = {k: v.to(device) for k, v in batch.items()}
       # forward pass; the batch is unpacked into keyword arguments
       outputs = model(**batch)
       # loss reported by the model for this batch
       loss = outputs.loss
       # compute gradients from the loss
       loss.backward()
       # apply one optimizer step
       optimizer.step()
       # advance the learning-rate schedule
       lr_scheduler.step()
       # reset the gradients so they do not accumulate into the next batch
       optimizer.zero_grad()
       progress_bar.update(1)

# Save the weights of the trained model to path + model_name + ".pth"
torch.save(model.state_dict(), path + model_name + ".pth")

# This is how the weights can be loaded again:
#model.load_state_dict(torch.load(path + model_name + ".pth"))

In [None]:
# Evaluate the model on the validation split: accuracy, weighted F1,
# per-class report and confusion matrix.

import pickle

import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# `load` is no longer used in this cell: the GLUE/MRPC metric that was created
# here targets a binary task and was filled but never computed. The import is
# retained only in case later cells rely on the name.
from evaluate import load

# Load the party mapping.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(path +'party_mapping.pkl', 'rb') as f:
    parties = pickle.load(f)
# Assumes iterating the mapping yields the party names in label-id order --
# TODO confirm against the code that wrote party_mapping.pkl.
fraktionen_labels = [str(k) for k in parties]

all_predictions = []
all_labels = []

model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    all_predictions.extend(predictions.cpu().tolist())
    all_labels.extend(batch["labels"].cpu().tolist())

print("Accuracy:",accuracy_score(all_labels, all_predictions))
print("f1-score:", f1_score(all_labels, all_predictions, average = "weighted"))

report = classification_report(all_labels, all_predictions)
print("\nClassification Report:")
print(report)

# Fix the class order explicitly so the matrix always has one row and column
# per model label, even when a class is absent from the validation data;
# otherwise the matrix can be smaller than the list of display labels.
class_ids = list(range(model.config.num_labels))
cm = confusion_matrix(all_labels, all_predictions, labels = class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = fraktionen_labels)
disp.plot()
plt.show()