In [28]:
from datasets import load_dataset

imdb = load_dataset('../data/ReviewPrediction', data_files={'train': 'train.csv', 'test': 'test.csv', 'validation': 'validation.csv'})

In [29]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-german-cased")

In [30]:
def preprocess_function(examples):
    return tokenizer(examples["review"], truncation=True)   

In [31]:
tokenized_imdb = imdb.map(preprocess_function, batched=True)

Map:  26%|██▌       | 2000/7656 [00:00<00:00, 15504.31 examples/s]

Map: 100%|██████████| 7656/7656 [00:00<00:00, 13995.37 examples/s]
Map: 100%|██████████| 1423/1423 [00:00<00:00, 13373.37 examples/s]
Map: 100%|██████████| 403/403 [00:00<00:00, 12136.10 examples/s]


In [32]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [33]:
import evaluate

accuracy = evaluate.load("accuracy")

In [34]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [35]:
id2label = {1: "NEGATIVE", 2: "NEUTRAL", 3: "USEFUL", 4: "SUPERUSEFUL"}
label2id = {"NEGATIVE": 1, "NEUTRAL": 2, "USEFUL": 3, "SUPERUSEFUL": 4}

In [36]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=4, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
training_args = TrainingArguments(
    output_dir="models/",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


: 