In [None]:
# %%
import os
# Enable PyTorch's fallback for operators that the MPS backend does not implement.
# This is set in its own cell, ahead of the cell that imports torch.
os.environ.update(PYTORCH_ENABLE_MPS_FALLBACK="1")


In [None]:
# %%
import torch

print('Pytorch version', torch.__version__)


def _select_device():
    """Return ``(device, message)`` for the first available backend.

    Backends are probed in priority order MPS, then CUDA (device index 0),
    and finally CPU as the unconditional fallback.
    """
    if torch.backends.mps.is_available():
        return torch.device('mps'), "Using MPS device"
    if torch.cuda.is_available():
        return torch.device('cuda', 0), "Using CUDA device"
    return torch.device('cpu'), "Using CPU device"


# Device used by the rest of the notebook (e.g. for `model.to(...)`).
active_device, _device_message = _select_device()
print(_device_message)


In [None]:
# %%
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Number of rows read from every split. The value 1 keeps the notebook a quick
# smoke test; set it to None to load the complete CSV files.
SAMPLE_ROWS = 1

# `nrows` makes pandas stop parsing after SAMPLE_ROWS rows instead of reading a
# whole file and then discarding everything but its head.
train_data_df = pd.read_csv('../datasets/data_train.csv', nrows=SAMPLE_ROWS)
test_data_df = pd.read_csv('../datasets/data_test.csv', nrows=SAMPLE_ROWS)
valid_data_df = pd.read_csv('../datasets/data_valid.csv', nrows=SAMPLE_ROWS)

# Fit a single encoder on the labels of all three splits so that every split
# shares the same label -> integer id mapping.
all_labels = pd.concat([train_data_df["label"], test_data_df["label"], valid_data_df["label"]])
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Replace the raw labels with their integer ids in each frame.
for split_df in (train_data_df, test_data_df, valid_data_df):
    split_df["label"] = label_encoder.transform(split_df["label"])


In [None]:
# %%
from datasets import Dataset
# Wrap each pandas split in a `datasets.Dataset`, in train / test / valid order.
train_data, test_data, valid_data = (
    Dataset.from_pandas(frame)
    for frame in (train_data_df, test_data_df, valid_data_df)
)


In [None]:
# %%
# Show the first record of the training Dataset as this cell's output.
train_data[0]


In [None]:
# %%
from transformers import BertTokenizer

# Load the BERT tokenizer of the cased German checkpoint; `tokenize_function`
# below reads this module-level `tokenizer`.
tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-german-cased')

def tokenize_function(examples):
    """Tokenize the ``"data"`` field of ``examples`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 400 tokens.

    Args:
        examples: Mapping with a ``"data"`` entry holding the text(s) to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["data"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=400,
    )


In [None]:
# %%
# Tokenize every split in batched mode, in train / test / valid order.
train_dataset, test_dataset, valid_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, test_data, valid_data)
)


In [None]:
# %%
from transformers import BertForSequenceClassification

# The model has to come from the same checkpoint as the tokenizer
# ('google-bert/bert-base-german-cased'). Loading the English 'bert-base-cased'
# weights would pair the German vocabulary's token ids with embeddings that were
# trained for a different vocabulary.
MODEL_CHECKPOINT = "google-bert/bert-base-german-cased"

# One output logit per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(label_encoder.classes_),
)
model.to(active_device)


In [None]:
# %%
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy

# Training configuration. Evaluation and checkpointing share the same 50-step
# schedule so that the state that produced the best F1 is available on disk for
# `load_best_model_at_end` and for the early-stopping callback.
args = TrainingArguments(
    output_dir="training_with_callbacks",
    evaluation_strategy=IntervalStrategy.STEPS,  # "steps"
    eval_steps=50,  # evaluate every 50 steps
    save_strategy=IntervalStrategy.STEPS,
    save_steps=50,  # checkpoint every 50 steps; unset, the library default interval applies
    save_total_limit=5,  # keep only the 5 most recent checkpoints; older ones are deleted
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False,
    metric_for_best_model='f1',  # key returned by compute_metrics
    load_best_model_at_end=True,
    use_mps_device=torch.backends.mps.is_available(),
)


In [None]:
# %%
import numpy as np
import evaluate

# Load the "accuracy" metric from the `evaluate` library.
# NOTE(review): `metric` is not referenced by any other cell visible here
# (compute_metrics relies on scikit-learn) — confirm whether it is still needed.
metric = evaluate.load("accuracy")


In [None]:
# %%
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def compute_metrics(p):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        p: A pair that unpacks into ``(predictions, labels)``. ``predictions``
            holds per-class scores and is reduced with ``argmax`` over axis 1;
            ``labels`` holds the true class ids.

    Returns:
        dict: ``{"accuracy", "precision", "recall", "f1"}`` mapped to the
        corresponding scikit-learn scores.
    """
    scores, labels = p
    pred = np.argmax(scores, axis=1)

    # zero_division=0 pins the score of a class that has no predicted (or no
    # true) samples to 0. That is the value scikit-learn already uses in that
    # case, but stating it explicitly avoids an UndefinedMetricWarning on every
    # evaluation pass.
    macro = {"average": "macro", "zero_division": 0}

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=pred),
        "precision": precision_score(y_true=labels, y_pred=pred, **macro),
        "recall": recall_score(y_true=labels, y_pred=pred, **macro),
        "f1": f1_score(y_true=labels, y_pred=pred, **macro),
    }


In [None]:
# %%
from transformers import TrainingArguments, Trainer

# Early stopping is created with a patience of 3.
training_callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Assemble the Trainer: train on the training split, evaluate on the validation
# split, and score each evaluation with compute_metrics.
trainer = Trainer(
    args=args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    callbacks=training_callbacks,
)


In [None]:
# %%
# Start the training run; the value returned by `train()` is shown as this cell's output.
trainer.train()
