In [1]:
import pandas as pd
import numpy as np
import datasets
import torch
import os
from datasets import load_dataset, load_from_disk, concatenate_datasets
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification
from transformers import Trainer
from sklearn.model_selection import train_test_split
import evaluate

In [2]:
dataset = load_dataset('../data/ReviewPrediction', data_files={'train': 'train_binary.csv', 'test': 'test_binary.csv', 'validation': 'validation_binary.csv'})
dataset

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', '_id', 'review', 'score', 'upvotes', 'downvotes', 'sum'],
        num_rows: 6826
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', '_id', 'review', 'score', 'upvotes', 'downvotes', 'sum'],
        num_rows: 1897
    })
    validation: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', '_id', 'review', 'score', 'upvotes', 'downvotes', 'sum'],
        num_rows: 759
    })
})

In [3]:
checkpoint = 'bert-base-german-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example['review'], truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset["train"][0]

Map:   0%|          | 0/6826 [00:00<?, ? examples/s]

Map:   0%|          | 0/1897 [00:00<?, ? examples/s]

Map:   0%|          | 0/759 [00:00<?, ? examples/s]

{'Unnamed: 0.1': 1893,
 'Unnamed: 0': 1893,
 '_id': '5c34e1c593ac7c001ca22f47',
 'review': 'Sehr gute Vorlesung - sehr unfaire Prüfung....',
 'score': 3,
 'upvotes': 11.0,
 'downvotes': 2.0,
 'sum': 1,
 'input_ids': [3,
  19386,
  4493,
  15428,
  27,
  26935,
  1120,
  174,
  8716,
  942,
  4185,
  26914,
  26914,
  26914,
  26914,
  4],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [4]:
tokenized_dataset = tokenized_dataset.remove_columns(["upvotes", "downvotes", "score", "Unnamed: 0", '_id', 'review'])
tokenized_dataset = tokenized_dataset.rename_column("sum", "labels")
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6826
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1897
    })
    validation: Dataset({
        features: ['Unnamed: 0.1', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 759
    })
})

In [5]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
torch.cuda.is_available()

True

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from sklearn.metrics import classification_report, accuracy_score

def compute_metrics(eval_preds):
    results = {}
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    class_report= classification_report(y_pred=predictions, y_true=labels)
    acc = accuracy_score(y_pred=predictions, y_true=labels)
    print("############ Classification report ############")
    print(class_report)
    print("############      Accuracy       ##############")
    print(acc)
    results.update({'classification report' : class_report})
    results.update({'accuracy':acc})
    return results

training_args = TrainingArguments(
    'test-trainer', 
    auto_find_batch_size=True,
    #gradient_accumulation_steps=4,
    evaluation_strategy="epoch", 
    num_train_epochs=3,
    #per_device_train_batch_size=4,  
    #per_device_eval_batch_size=1,
    #eval_accumulation_steps=1,
    learning_rate=1e-5,
    #save_strategy="epoch",
    #load_best_model_at_end=True
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [8]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Classification report,Accuracy
1,0.7424,0.6853,precision recall f1-score support  0 0.69 0.93 0.79 499  1 0.58 0.18 0.28 260  accuracy 0.67 759  macro avg 0.63 0.56 0.53 759 weighted avg 0.65 0.67 0.61 759,0.674572
2,0.8229,1.228326,precision recall f1-score support  0 0.68 0.92 0.78 499  1 0.52 0.17 0.25 260  accuracy 0.66 759  macro avg 0.60 0.54 0.52 759 weighted avg 0.63 0.66 0.60 759,0.662714
3,0.6843,1.732968,precision recall f1-score support  0 0.69 0.81 0.74 499  1 0.44 0.29 0.35 260  accuracy 0.63 759  macro avg 0.56 0.55 0.55 759 weighted avg 0.60 0.63 0.61 759,0.629776


############ Classification report ############
              precision    recall  f1-score   support

           0       0.69      0.93      0.79       499
           1       0.58      0.18      0.28       260

    accuracy                           0.67       759
   macro avg       0.63      0.56      0.53       759
weighted avg       0.65      0.67      0.61       759

############      Accuracy       ##############
0.6745718050065876
############ Classification report ############
              precision    recall  f1-score   support

           0       0.68      0.92      0.78       499
           1       0.52      0.17      0.25       260

    accuracy                           0.66       759
   macro avg       0.60      0.54      0.52       759
weighted avg       0.63      0.66      0.60       759

############      Accuracy       ##############
0.6627140974967062
############ Classification report ############
              precision    recall  f1-score   support

           0 

TrainOutput(global_step=10239, training_loss=0.7433729763200523, metrics={'train_runtime': 1989.188, 'train_samples_per_second': 10.295, 'train_steps_per_second': 5.147, 'total_flos': 692637797680080.0, 'train_loss': 0.7433729763200523, 'epoch': 3.0})

In [13]:
predictions = trainer.predict(tokenized_dataset["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

preds = np.argmax(predictions.predictions, axis=-1)

############ Classification report ############
              precision    recall  f1-score   support

           0       0.69      0.83      0.75      1247
           1       0.47      0.28      0.35       650

    accuracy                           0.64      1897
   macro avg       0.58      0.56      0.55      1897
weighted avg       0.61      0.64      0.62      1897

############      Accuracy       ##############
0.6441750131787032
(1897, 2) (1897,)


In [14]:
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

print(accuracy_metric.compute(predictions=preds, references=predictions.label_ids))
print(precision_metric.compute(predictions=preds, references=predictions.label_ids, average='macro'))
print(recall_metric.compute(predictions=preds, references=predictions.label_ids, average='macro'))
print(f1_metric.compute(predictions=preds, references=predictions.label_ids, average='macro'))

{'accuracy': 0.6441750131787032}
{'precision': 0.5793836066679027}
{'recall': 0.5581056073036827}
{'f1': 0.5542615757622171}


In [15]:
preds

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)