In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = 'Data/spam.csv'
data = pd.read_csv(data_path, encoding='Windows-1252')
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
data.columns = ['label', 'email']

In [3]:
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
encoding = {"ham" : 0, "spam" : 1}
labels = data["label"].map(encoding)
X_main, X_test, y_main, y_test = train_test_split(data['email'], labels, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_main, y_main, test_size=0.25, random_state=0)

In [6]:
len(X_test)

1115

In [5]:
train_encodings = tokenizer(list(X_train), truncation=True, padding=True)
val_encodings = tokenizer(list(X_val),   truncation=True, padding=True)
test_encodings = tokenizer(list(X_test),  truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
class SpamDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = list(labels)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, index):
        item = {k: torch.tensor(v[index]) for k, v in self.encodings.items()} 
        item['labels'] = torch.tensor(self.labels[index], dtype = torch.long)
        return item  

In [7]:
train_dataset = SpamDataset(train_encodings, y_train)
test_dataset = SpamDataset(test_encodings, y_test)
val_dataset = SpamDataset(val_encodings, y_val)

In [8]:
training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        eval_strategy="epoch",
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy"
    )

In [9]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
#Must run on Rosie. Will take 6 hours otherwise
trainer.train()

In [15]:
model_dir = './results'
loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir, fix_mistral_regex=True)
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_dir)

In [23]:
loaded_model.eval()
loaded_trainer = Trainer(model=loaded_model, tokenizer=loaded_tokenizer)
pred_output = loaded_trainer.predict(test_dataset)
print(compute_metrics(pred_output))


  loaded_trainer = Trainer(model=loaded_model, tokenizer=loaded_tokenizer)


{'accuracy': 0.9964125560538116, 'f1': 0.9879518072289156, 'precision': 0.9879518072289156, 'recall': 0.9879518072289156}
