In [None]:
import pandas as pd
# Lift pandas' display limits: None disables the cap on the number of columns
# shown and on the width of a cell's text, so whole emails are visible.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# 1) Auswahl Datensatz für das Finetuning
- Quelle: 4 Datensätze aus Kaggle
- Exploration der Datensätze
- Auswahl des besten Datensatzes

In [None]:
# Load the four candidate datasets. Paths are relative, so the files must sit
# in the notebook's working directory.
emails = pd.read_csv('emails.csv')
spam = pd.read_excel('spam.xlsx')
spam_ham_dataset = pd.read_csv('spam_ham_dataset.csv')
spam_or_not_spam = pd.read_csv('spam_or_not_spam.csv')

#### 1) emails - nicht geeignet, da Aufbau wie eine Dokument-Term-Matrix

In [None]:
# info() prints the schema; head() is the cell's displayed value.
emails.info()
emails.head()

In [None]:
# Count the rows whose 'Prediction' column equals 1.
emails_spam = emails.loc[emails['Prediction'].eq(1)]
len(emails_spam)

#### 2) spam - nicht geeignet, da weitere Datentransformationen nötig und keine gute Datenqualität

In [None]:
# Preview the spreadsheet as imported, before any column selection.
spam.head()

In [None]:
# Keep only the column literally named 'v1,v2,'; the double brackets keep the
# result a DataFrame instead of a Series.
spam = spam[['v1,v2,']]
spam.info()
spam.head()

In [None]:
# Count the rows whose 'v1,v2,' value contains the search word, ignoring case.
word_to_search = 'ham'
rows_containing_word = spam['v1,v2,'].str.contains(word_to_search, case=False).sum()

print(f"Number of rows containing '{word_to_search}': {rows_containing_word}")

#### 3) spam_ham_dataset - Winner durch geeignete Struktur, Subject-Kennung und beste Datenqualität unter allen

In [None]:
# Drop the 'Unnamed: 0' column (presumably a saved row index - verify against
# the CSV). errors='ignore' keeps the cell re-runnable: the name is rebound to
# the reduced frame, so a second run would otherwise raise a KeyError.
spam_ham_dataset = spam_ham_dataset.drop(columns='Unnamed: 0', errors='ignore')
spam_ham_dataset.info()
spam_ham_dataset.head()

In [None]:
# Print one example email (index label 6) in full.
print(spam_ham_dataset['text'][6])

#### 4) spam_or_not_spam - Struktur ist geeignet aber Datenqualität nicht gut, oft Nachrichten anstatt E-Mails

In [None]:
# info() prints the schema; tail() shows the last rows as the cell's value.
spam_or_not_spam.info()
spam_or_not_spam.tail()

In [None]:
# Count the rows whose 'label' column equals 1.
test_spam = spam_or_not_spam.loc[spam_or_not_spam['label'].eq(1)]
len(test_spam)

In [None]:
# Print one example entry (index label 6) in full.
print(spam_or_not_spam['email'][6])

# 2) Ausgewählten Datensatz "spam_ham_dataset" preprocessen

In [None]:
# Drop the original 'label' column and expose 'label_num' under the name
# 'label' instead; built as one chain so no frame is modified in place.
df = (
    spam_ham_dataset
    .drop(columns='label')
    .rename(columns={'label_num': 'label'})
)
df.head()

#### zu entfernen
- \r
- \n
- Alle Sonderzeichen nach dem 8. Zeichen (die ersten 8 Zeichen sind „Subject:")

In [None]:
def cleaning(df):
    """Return a copy of ``df`` whose 'text' column has been normalised.

    Steps, applied in this order:
      1. Every run of carriage returns, newlines and tabs becomes a single
         space, so the words on both sides of a line break stay separate.
      2. The first 8 characters are kept verbatim (the 'Subject:' prefix per
         the notebook's notes - for texts without that prefix the first 8
         characters are still left untouched). In the remainder, every
         character that is neither a letter, a digit nor whitespace is removed.
      3. Every token starting with 'http' is removed. At this point links have
         already lost their punctuation in step 2.

    Args:
        df: DataFrame with a string column named 'text'.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    cleaned = df.copy()
    # Replace (not delete) line breaks, otherwise "foo\r\nbar" becomes "foobar".
    text = cleaned['text'].str.replace(r'[\r\n\t]+', ' ', regex=True)
    prefix = text.str.slice(0, 8)
    remainder = text.str.slice(8).str.replace(r'[^a-zA-Z0-9\s]+', '', regex=True)
    cleaned['text'] = (prefix + remainder).str.replace(r'\bhttp\S*\b', '', regex=True)
    return cleaned

In [None]:
# Clean the email texts of the selected dataset and preview the result.
df = cleaning(df)
df.head()

In [None]:
# Check column types and non-null counts after cleaning.
df.info()

# 3) Preprocessing der Evaluationsstichproben

#### Stichprobe von 50 Spam E-Mails aus dem Datensatz "spam_ham_dataset"

In [None]:
# Rows with label 1 form the pool the evaluation sample is drawn from; print
# the total row count and the size of that pool.
df_spam = df[df['label'] == 1]
print(len(df))
print(len(df_spam))

In [None]:
# Draw 50 texts from the label-1 rows with a fixed seed.
# NOTE(review): the sampled rows are not removed from df, and df is what gets
# tokenised for the train/validation split further down - confirm whether
# these emails were meant to be held out from fine-tuning.
sample_kaggle = df_spam['text'].sample(50, random_state=123)
sample_kaggle.info()

In [None]:
# Print every sampled email with its index label. The loop variable is not
# called `text`, because that notebook-level name holds the corpus array used
# by the tokenisation cells.
for sample_index, sample_text in sample_kaggle.items():
    print(f"Index: {sample_index} \nText: {sample_text}")

#### 40 von ChatGPT generierte Spam E-Mails

In [None]:
# Load the generated emails and keep only the 'email' column as a Series.
# NOTE(review): unlike the other two evaluation samples, this one is not
# passed through cleaning() - confirm that this is intended.
sample_chatgpt = pd.read_excel('spam_chatgpt_v2.xlsx')
sample_chatgpt = sample_chatgpt['email']
sample_chatgpt.info()

#### 14 eigene Spam E-Mails

In [None]:
# Load the hand-collected emails and preview them.
own_spam = pd.read_excel('own_spam.xlsx')
own_spam.head()

In [None]:
# Apply the same cleaning as for the training data; cleaning() reads and
# writes the 'text' column, so own_spam must contain one.
own_spam = cleaning(own_spam)
own_spam.head()

# 4) Preprocessing mit BERT Tokenizer

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import numpy as np
from tabulate import tabulate
from tqdm import trange
import random

#### BERT Tokenizer Bibliothek führt folgende Preprocessing-Schritte durch:
- Tokenisierung: Inputtext in einzelne Tokens runterbrechen
- Wordpiece Encoding: nutzt Wordpiece-Algorithmus zur Aufteilung von Tokens in Subwörter
- Spezielle Token: CLS (Anfang), SEP (Trennung zwischen Sätzen/Segmenten)
- Padding und Truncation: behandelt Eingabesequenz, sodass alle Sequenzen gleich lang sind
- Token IDs und Attention Mask: Zuweisung von einzigartigen Token IDs und zu beachtende Tokens

In [None]:
# Arrays of the cleaned email texts and their numeric labels, in row order.
# Both names are read as notebook-level globals by later cells.
text = df.text.values
labels = df.label.values

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint;
# do_lower_case lowercases the input before tokenisation.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

#### Eine zufällige, vom Tokenizer aufbereitete E-Mail ansehen

In [None]:
def print_rand_sentence():
  '''Prints a table of the tokens and token IDs of one randomly chosen email.

  Reads the notebook-level `text` array and `tokenizer`.
  '''
  index = random.randint(0, len(text)-1)
  # Tokenise once and derive the IDs from the same token list.
  tokens = tokenizer.tokenize(text[index])
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  table = np.array([tokens, token_ids]).T
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

#### Mit der Funktion encode_plus von Tokenizer Preprocessing durchführen

In [None]:
# Accumulators for the per-email encodings; the loop below appends to them and
# they are later rebound to stacked tensors.
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer, max_length = 32):
  '''
  Encodes one text with `tokenizer.encode_plus`, padded or truncated to a fixed length.

  Args:
    - input_text: the text to encode.
    - tokenizer: object providing `encode_plus` (here: a BertTokenizer).
    - max_length: sequence length after padding/truncation (default 32).

  Returns whatever `encode_plus` returns; for the Hugging Face tokenizer this is a
  <class transformers.tokenization_utils_base.BatchEncoding> with the fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `pad_to_max_length` is deprecated; request padding and
                        # truncation explicitly so every sequence has max_length tokens.
                        padding = 'max_length',
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


# Encode every email and collect its token IDs and attention mask.
for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids'])
  attention_masks.append(encoding_dict['attention_mask'])


# Rebind the lists to single tensors by concatenating the per-email tensors
# along dim 0, and convert the label array to a tensor as well.
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

In [None]:
# Show the encoded token IDs of one email (position 6).
token_id[6]

#### Kodierte Eingabesequenz anschauen

In [None]:
def print_rand_sentence_encoding():
  '''Prints tokens, token IDs and attention mask of one randomly chosen encoded email.

  Reads the notebook-level `text`, `token_id`, `attention_masks` and `tokenizer`.
  '''
  index = random.randint(0, len(text) - 1)
  token_ids = token_id[index].tolist()
  attention = attention_masks[index].tolist()
  # Map every ID straight back to its token. Decoding to a string and
  # tokenising again is not guaranteed to give one token per ID, and the three
  # table columns must have equal length.
  tokens = tokenizer.convert_ids_to_tokens(token_ids)

  table = np.array([tokens, token_ids, attention]).T
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

# 5) Finetuning von BertForSequenceClassification auf Spamerkennung

#### Durchgeführte Schritte:
- 80/20 Split
- Daten als torch.utils.data.DataLoader Objekt wrappen
- batch_size = Anzahl Stichproben je Batch im Training
- train_set and val_set = erstellt mit TensorDataset und kombiniert input token IDs, attention masks und labels
- train_dataloader and validation_dataloader = erstellt mit DataLoader von torch.utils.data > Iteration über Batches während Training und Validierung
- RandomSampler = zufällige Reihenfolge der Trainingsdaten; SequentialSampler = feste Reihenfolge der Validierungsdaten

In [None]:
val_ratio = 0.2

batch_size = 16 # batch sizes recommended by the BERT paper: 16, 32

split_seed = 123 # fixed seed so split and shuffling are the same on every run

# Stratified split over row positions: both parts keep the label proportions
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state = split_seed)

# Training and validation datasets: token IDs, attention masks and labels
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Wrap both sets in DataLoaders; training batches are shuffled with a seeded
# generator, validation batches keep their order
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set,
                                    generator = torch.Generator().manual_seed(split_seed)),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [None]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): number of samples predicted as class 1 whose label is 1'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds == 1) & (preds == labels)))

def b_fp(preds, labels):
  '''Returns False Positives (FP): number of samples predicted as class 1 whose label differs'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds == 1) & (preds != labels)))

def b_tn(preds, labels):
  '''Returns True Negatives (TN): number of samples predicted as class 0 whose label is 0'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds == 0) & (preds == labels)))

def b_fn(preds, labels):
  '''Returns False Negatives (FN): number of samples predicted as class 0 whose label differs'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds == 0) & (preds != labels)))

def b_metrics(preds, labels):
  '''
  Computes batch metrics from per-class scores and true labels.

  Args:
    - preds: 2-D array of scores, one row per sample; the predicted class is the argmax of each row.
    - labels: array of true class ids (0 or 1).

  Returns the tuple (accuracy, precision, recall, specificity) with
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  A metric whose denominator is 0 is returned as the string 'nan'; callers
  compare against that sentinel, so it is kept as is.
  '''
  preds = np.argmax(preds, axis = 1).flatten()
  labels = np.asarray(labels).flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

#### BertForSequenceClassification herunterladen mit empfohlenen Parametern

In [None]:
# Load BertForSequenceClassification without fine-tuning - the "base BERT" baseline.
# No optimizer is created here: this model is never trained. Binding the shared
# name `optimizer` to its parameters would also silently turn the fine-tuning
# loop into a no-op if this cell were re-run after the fine-tuned model's cell.
model_base = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)



In [None]:
# Print an overview of the base model's named parameters and their shapes
params = list(model_base.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

row_format = "{:<55} {:>12}"
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, group in sections:
    print(heading)
    for name, tensor in group:
        print(row_format.format(name, str(tuple(tensor.size()))))

In [None]:
# Report whether PyTorch can see a CUDA device.
print("CUDA is available. Running on GPU."
      if torch.cuda.is_available()
      else "CUDA is not available. Running on CPU.")

In [None]:
# Load a second BertForSequenceClassification instance; this one is fine-tuned
# by the training loop and is the "finetuned BERT" of the evaluation.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
# The optimizer is bound to the parameters of `model` and is the one the
# training loop steps.
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )



#### Finetuning sowie Ausgabe von Evaluationsmetriken

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The batches below are moved to `device`, so the model has to live there too.
# Without this a CUDA machine fails with a device mismatch, because the model
# would still be on the CPU.
model.to(device)
epochs = 4

train_losses = []       # average training loss of each epoch
val_losses = []         # average validation loss of each epoch

for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Training mode
    model.train()

    # Running totals for this epoch
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()
        # Forward pass; passing labels makes the model return the loss
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass and parameter update
        train_output.loss.backward()
        optimizer.step()
        # Update running totals
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # Average training loss = summed batch losses / number of batches
    train_losses.append(tr_loss / nb_tr_steps)

    # ========== Validation ==========

    # Evaluation mode
    model.eval()

    # Running totals for this epoch
    val_loss = 0
    nb_val_examples, nb_val_steps = 0, 0

    # Per-batch metric values; each list is averaged over the batches below
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            # Forward pass without gradient tracking
            eval_output = model(b_input_ids,
                                token_type_ids = None,
                                attention_mask = b_input_mask,
                                labels = b_labels)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Update running totals
        val_loss += eval_output.loss.item()
        nb_val_examples += b_input_ids.size(0)
        nb_val_steps += 1

        # Metrics of this batch; b_metrics returns the string 'nan' for a
        # metric whose denominator is 0, and those values are skipped
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        if b_precision != 'nan': val_precision.append(b_precision)
        if b_recall != 'nan': val_recall.append(b_recall)
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    # Average validation loss = summed batch losses / number of batches
    val_losses.append(val_loss / nb_val_steps)

    print('\n\t - Train loss: {:.8f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation loss: {:.4f}'.format(val_loss / nb_val_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')

#### Finetuningprozess evaluieren anhand training und validation loss je Epoche

In [None]:
import matplotlib.pyplot as plt

# Epoch numbers are derived from `epochs`, so ticks and curves stay aligned
# when the number of epochs is changed.
epoch_numbers = range(1, epochs + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
plt.subplots_adjust(wspace=0.4)

# Left panel: training loss, right panel: validation loss
for loss_ax, losses, kind in ((ax1, train_losses, 'Training'),
                              (ax2, val_losses, 'Validation')):
    loss_ax.plot(epoch_numbers, losses, marker='o', linestyle='-')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_xticks(epoch_numbers)
    loss_ax.set_ylabel(f'{kind} Loss')
    loss_ax.set_title(f'Average {kind} Loss per Epoch')

# 6) Spamvorhersage Evaluationsstichproben mit base und finetuned BERT

## Vorhersagen mit base BERT machen

In [None]:
def calculate_accuracy(spam, total):
    """Return ``spam`` divided by ``total``.

    Args:
        spam: number of samples predicted as spam.
        total: number of samples classified.

    Raises:
        ZeroDivisionError: if ``total`` is 0.
    """
    fraction = spam / total
    return fraction

In [None]:
def classify_email_base(new_sentence):
    '''Classifies one text as spam or ham with the base (not fine-tuned) BERT model.

    Reads the notebook-level `preprocessing`, `tokenizer`, `model_base` and `device`.

    Args:
        new_sentence: the email text.

    Returns:
        Tuple (new_sentence, prediction); prediction is 'Spam' when the class
        with the highest logit is 1, otherwise 'Ham'.
    '''
    # The encoding already holds tensors with a leading batch dimension of 1
    encoding = preprocessing(new_sentence, tokenizer)

    # Inputs are sent to `device`, so the model must be there as well; both
    # calls are no-ops once the model is already placed and in eval mode.
    model_base.to(device)
    model_base.eval()

    # Forward pass without gradient tracking
    with torch.no_grad():
        output = model_base(encoding['input_ids'].to(device),
                            token_type_ids = None,
                            attention_mask = encoding['attention_mask'].to(device))

    predicted_class = output.logits.argmax(dim = 1).item()
    prediction = 'Spam' if predicted_class == 1 else 'Ham'
    return new_sentence, prediction

#### Stichprobe von 50 Spam E-Mails aus dem Datensatz "spam_ham_dataset"

In [None]:
# Classify the Kaggle sample with base BERT. The loop variable and the count
# below deliberately avoid the names `text` and `spam`, which hold the corpus
# array and the `spam` DataFrame elsewhere in the notebook.
email_kaggle_base = []
prediction_kaggle_base = []
for _, email_text in sample_kaggle.items():
    email, prediction = classify_email_base(email_text)
    email_kaggle_base.append(email)
    prediction_kaggle_base.append(prediction)

results_kaggle_base = pd.DataFrame(list(zip(email_kaggle_base, prediction_kaggle_base)),
                                   columns = ['email', 'spam_prediction'])

# The sample was drawn from label-1 rows only, so the share of 'Spam'
# predictions is the accuracy (in percent).
n_predicted_spam = int((results_kaggle_base['spam_prediction'] == 'Spam').sum())
accuracy_kaggle_base = calculate_accuracy(n_predicted_spam, len(results_kaggle_base))*100
print(f"Accuracy base BERT auf Kaggle Datensatz: {accuracy_kaggle_base}")

#### 40 von ChatGPT generierte Spam E-Mails

In [None]:
# Classify the ChatGPT-generated sample with base BERT. The loop variable and
# the count below deliberately avoid the names `text` and `spam`, which hold
# the corpus array and the `spam` DataFrame elsewhere in the notebook.
email_chatgpt_base = []
prediction_chatgpt_base = []
for _, email_text in sample_chatgpt.items():
    email, prediction = classify_email_base(email_text)
    email_chatgpt_base.append(email)
    prediction_chatgpt_base.append(prediction)

results_chatgpt_base = pd.DataFrame(list(zip(email_chatgpt_base, prediction_chatgpt_base)),
                                    columns = ['email', 'spam_prediction'])

# Share of samples predicted as 'Spam', in percent (the sample is described as
# spam-only in the notebook).
n_predicted_spam = int((results_chatgpt_base['spam_prediction'] == 'Spam').sum())
accuracy_chatgpt_base = calculate_accuracy(n_predicted_spam, len(results_chatgpt_base))*100
print(f"Accuracy base BERT auf ChatGPT Datensatz: {accuracy_chatgpt_base}")

#### 14 eigene Spam E-Mails

In [None]:
# Rename 'text' to 'email' and squeeze the frame; the loops over
# sample_own.items() expect a Series of texts, which squeeze() only yields
# for a single-column frame - presumably own_spam has just that column, verify.
sample_own = own_spam.rename(columns={'text': 'email'}).squeeze()
sample_own.info()

In [None]:
# Classify the hand-collected sample with base BERT. The loop variable and the
# count below deliberately avoid the names `text` and `spam`, which hold the
# corpus array and the `spam` DataFrame elsewhere in the notebook.
email_own_base = []
prediction_own_base = []
for _, email_text in sample_own.items():
    email, prediction = classify_email_base(email_text)
    email_own_base.append(email)
    prediction_own_base.append(prediction)

results_own_base = pd.DataFrame(list(zip(email_own_base, prediction_own_base)),
                                columns = ['email', 'spam_prediction'])

# Share of samples predicted as 'Spam', in percent, rounded to a whole number
# (the sample is described as spam-only in the notebook).
n_predicted_spam = int((results_own_base['spam_prediction'] == 'Spam').sum())
accuracy_own_base = round(calculate_accuracy(n_predicted_spam, len(results_own_base))*100, 0)
print(f"Accuracy base BERT auf eigenen Datensatz: {accuracy_own_base}")

## Vorhersagen mit finetuned BERT machen

In [None]:
def classify_email_finetuned(new_sentence):
    '''Classifies one text as spam or ham with the fine-tuned BERT model.

    Reads the notebook-level `preprocessing`, `tokenizer`, `model` and `device`.

    Args:
        new_sentence: the email text.

    Returns:
        Tuple (new_sentence, prediction); prediction is 'Spam' when the class
        with the highest logit is 1, otherwise 'Ham'.
    '''
    # The encoding already holds tensors with a leading batch dimension of 1
    encoding = preprocessing(new_sentence, tokenizer)

    # Inputs are sent to `device`, so the model must be there as well. eval()
    # switches off dropout in case the model was left in training mode.
    model.to(device)
    model.eval()

    # Forward pass without gradient tracking
    with torch.no_grad():
        output = model(encoding['input_ids'].to(device),
                       token_type_ids = None,
                       attention_mask = encoding['attention_mask'].to(device))

    predicted_class = output.logits.argmax(dim = 1).item()
    prediction = 'Spam' if predicted_class == 1 else 'Ham'
    return new_sentence, prediction

#### Stichprobe von 50 Spam E-Mails aus dem Datensatz "spam_ham_dataset"

In [None]:
# Classify the Kaggle sample with fine-tuned BERT. The loop variable and the
# count below deliberately avoid the names `text` and `spam`, which hold the
# corpus array and the `spam` DataFrame elsewhere in the notebook.
email_kaggle_finetuned = []
prediction_kaggle_finetuned = []
for _, email_text in sample_kaggle.items():
    email, prediction = classify_email_finetuned(email_text)
    email_kaggle_finetuned.append(email)
    prediction_kaggle_finetuned.append(prediction)

results_kaggle_finetuned = pd.DataFrame(list(zip(email_kaggle_finetuned, prediction_kaggle_finetuned)),
                                        columns = ['email', 'spam_prediction'])

# The sample was drawn from label-1 rows only, so the share of 'Spam'
# predictions is the accuracy (in percent).
n_predicted_spam = int((results_kaggle_finetuned['spam_prediction'] == 'Spam').sum())
accuracy_kaggle_finetuned = calculate_accuracy(n_predicted_spam, len(results_kaggle_finetuned))*100
print(f"Accuracy finetuned BERT auf Kaggle Datensatz: {accuracy_kaggle_finetuned}")

#### 40 von ChatGPT generierte Spam E-Mails

In [None]:
# Classify the ChatGPT-generated sample with fine-tuned BERT. The loop variable
# and the count below deliberately avoid the names `text` and `spam`, which
# hold the corpus array and the `spam` DataFrame elsewhere in the notebook.
email_chatgpt_finetuned = []
prediction_chatgpt_finetuned = []
for _, email_text in sample_chatgpt.items():
    email, prediction = classify_email_finetuned(email_text)
    email_chatgpt_finetuned.append(email)
    prediction_chatgpt_finetuned.append(prediction)

results_chatgpt_finetuned = pd.DataFrame(list(zip(email_chatgpt_finetuned, prediction_chatgpt_finetuned)),
                                         columns = ['email', 'spam_prediction'])

# Share of samples predicted as 'Spam', in percent (the sample is described as
# spam-only in the notebook).
n_predicted_spam = int((results_chatgpt_finetuned['spam_prediction'] == 'Spam').sum())
accuracy_chatgpt_finetuned = calculate_accuracy(n_predicted_spam, len(results_chatgpt_finetuned))*100
print(f"Accuracy finetuned BERT auf ChatGPT Datensatz: {accuracy_chatgpt_finetuned}")

In [None]:
# Show up to 50 rows of the per-email predictions of the fine-tuned model.
results_chatgpt_finetuned.head(50)

#### 14 eigene Spam E-Mails

In [None]:
# Classify the hand-collected sample with fine-tuned BERT. The loop variable
# and the count below deliberately avoid the names `text` and `spam`, which
# hold the corpus array and the `spam` DataFrame elsewhere in the notebook.
email_own_finetuned = []
prediction_own_finetuned = []
for _, email_text in sample_own.items():
    email, prediction = classify_email_finetuned(email_text)
    email_own_finetuned.append(email)
    prediction_own_finetuned.append(prediction)

results_own_finetuned = pd.DataFrame(list(zip(email_own_finetuned, prediction_own_finetuned)),
                                     columns = ['email', 'spam_prediction'])

# Share of samples predicted as 'Spam', in percent. Rounded to a whole number
# like the base-model figure for the same sample, so the two are comparable.
n_predicted_spam = int((results_own_finetuned['spam_prediction'] == 'Spam').sum())
accuracy_own_finetuned = round(calculate_accuracy(n_predicted_spam, len(results_own_finetuned))*100, 0)
print(f"Accuracy finetuned BERT auf eigenen Datensatz: {accuracy_own_finetuned}")

## Performance Vergleich zwischen Vorhersagen von base und finetuned BERT

In [None]:
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10,4))
ax = fig.add_axes([0,0,1,1,])
datasets = ['Base Kaggle', 'Finetuned Kaggle', 'Base ChatGPT', 'Finetuned ChatGPT', 'Base Own', 'Finetuned Own']

# One colour per evaluation sample; the sample name is the last word of the label
color_by_sample = {'Kaggle': 'red', 'ChatGPT': 'blue', 'Own': 'green'}
colors = [color_by_sample[name.split()[-1]] for name in datasets]

# Bar heights, in the same order as `datasets`
accuracies = [accuracy_kaggle_base, accuracy_kaggle_finetuned,
              accuracy_chatgpt_base, accuracy_chatgpt_finetuned,
              accuracy_own_base, accuracy_own_finetuned]

# Draw the bars once
ax.bar(datasets, accuracies, color=colors)

# Value label above each bar; one decimal avoids long float representations
for i, v in enumerate(accuracies):
    ax.text(i, v, f'{v:.1f}', ha='center', va='bottom')

ax.set_ylabel('Accuracy in %')
ax.set_title('Accuracy je Validierungsdatensatz mit base und finetuned BERT')
plt.show()