In [1]:
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline,
)

In [2]:
def preprocess_function(data, tokenizer):
    return tokenizer(data["text"], truncation=True)


def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [3]:
model_id = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



In [5]:
training_dataset = load_dataset("csv", data_files="train.csv")
training_dataset = training_dataset["train"].train_test_split(test_size=0.1, seed=42)
tokenized_dataset = training_dataset.map(
    preprocess_function, fn_kwargs={"tokenizer": tokenizer}, batched=True
)
tokenized_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/8588 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/955 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8588
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 955
    })
})

In [6]:
id2label = {0: "NEGATIVE", 1: "POSITIVE", 2: "NEUTRAL"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1, "NEUTRAL": 2}

model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=3, id2label=id2label, label2id=label2id
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
training_args = TrainingArguments(
    output_dir="deberta-v3-large-ft",
    learning_rate=3.2413268041956336e-05,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=100,
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy
100,0.6951,0.535861,0.771728
200,0.3795,0.282134,0.899476
300,0.2244,0.298254,0.900524
400,0.135,0.287766,0.921466


TrainOutput(global_step=405, training_loss=0.3565331353081597, metrics={'train_runtime': 264.0896, 'train_samples_per_second': 97.558, 'train_steps_per_second': 1.534, 'total_flos': 2738078484867432.0, 'train_loss': 0.3565331353081597, 'epoch': 3.0})

In [8]:
repository_id = f"{model_id.split('/')[1]}-twitter-text-classification"
trainer.save_model(repository_id)

In [9]:
classifier = pipeline("sentiment-analysis", model=f"./{repository_id}")

In [10]:
total = 0
correct = 0
evaluation_dataset = load_dataset("csv", data_files={"test": "test.csv"})
for i in evaluation_dataset["test"]:
    label = id2label[i["label"]]
    text = i["text"]
    prediction = classifier(text)[0]["label"]
    total += 1
    correct += label == prediction
print(f"total: {total} correct: {correct} accuracy: { round((correct/total)*100, 3)}%")

Generating test split: 0 examples [00:00, ? examples/s]

total: 2388 correct: 2187 accuracy: 91.583%
