In [11]:
import transformers
import sklearn
import pandas as pd
from tqdm import tqdm 
tqdm.pandas()

In [12]:
# Read the e-mail dataset from a CSV path relative to the working directory.
data = pd.read_csv(filepath_or_buffer='EmailsLlm.csv')

Drop unneeded columns and rows with null values

In [13]:
# Keep only the columns used downstream and drop every row that has a missing
# value in any of them. The selection and the dropna are chained instead of
# calling `dropna(inplace=True)` on the column-selected frame: mutating a
# derived frame in place is the pattern that can raise pandas'
# SettingWithCopyWarning, and the chained form is safe to re-run.
keptFeatures = ["Subject", "Date", "Body", "Label"]
data = data[keptFeatures].dropna(how='any')
data

Unnamed: 0,Subject,Date,Body,Label
0,[ilug] stop the mlm insanity,"Fri, 02 Aug 2002 23:37:59 0530",greetings! you are receiving this letter becau...,spam
6,new product announcement,"Fri, 3 Jan 1997 17:24:47 -0700",new product announcement from: outsource eng.&...,spam
7,fw:,"Fri, 02 Jan 1998 04:30:44 -0400",thank you for your interest! judgment courses ...,spam
8,[sa] urgent help..............,"Mon, 5 Apr 1999 20:38:02 +0100",----------------------------------------------...,spam
10,finally collecct your judgment (71733),"Wed, 16 Aug 2000 17:38:13 -0400 (EDT)",yes we do purchase uncollected judicial judgem...,spam
...,...,...,...,...
4193,geocaching.com weekly cache notification,"Thu, 12 Sep 2002 19:24:11 -0700",greetings from geocaching.com - recent caches ...,ham
4194,securing multiple virtual hosts,"Fri, 13 Sep 2002 16:07:47 -0700",i am trying to secure three of four virtual ho...,ham
4195,javaserver pages updated,"Tue, 17 Sep 2002 17:07:02 -0700","filled with useful examples and the depth, cla...",ham
4196,linux-announce digest #180,"Sat, 7 Sep 2002 22:13:03 EDT","linux-announce digest #180, volume #4 sat, 7 s...",ham


In [14]:
from datetime import datetime
import re

def Label_to_int(label):
    """Encode an e-mail label as an integer class id.

    Args:
        label: The raw label string ("ham" or anything else), or a value
            that has already been encoded as 0 / 1.

    Returns:
        0 for "ham" (or an already-encoded 0), 1 for everything else.
    """
    # Already-encoded values pass through unchanged. Without this guard,
    # applying the encoding a second time (e.g. re-running the cell) would
    # compare 0 against "ham" and silently turn every ham row into 1.
    if not isinstance(label, str) and label in (0, 1):
        return int(label)
    return 0 if label == "ham" else 1

# Replace the textual labels with their integer class ids, element by element.
data["Label"] = data["Label"].map(Label_to_int)

In [15]:
# Display the frame after label encoding.
# NOTE(review): the output recorded for this cell shows `Date` as a small
# integer, but no visible cell transforms that column — presumably a
# since-removed cell did; confirm before relying on `Date`.
data

Unnamed: 0,Subject,Date,Body,Label
0,[ilug] stop the mlm insanity,23,greetings! you are receiving this letter becau...,1
6,new product announcement,17,new product announcement from: outsource eng.&...,1
7,fw:,4,thank you for your interest! judgment courses ...,1
8,[sa] urgent help..............,20,----------------------------------------------...,1
10,finally collecct your judgment (71733),17,yes we do purchase uncollected judicial judgem...,1
...,...,...,...,...
4193,geocaching.com weekly cache notification,19,greetings from geocaching.com - recent caches ...,0
4194,securing multiple virtual hosts,16,i am trying to secure three of four virtual ho...,0
4195,javaserver pages updated,17,"filled with useful examples and the depth, cla...",0
4196,linux-announce digest #180,22,"linux-announce digest #180, volume #4 sat, 7 s...",0


In [30]:
model_name = 'distilbert/distilbert-base-uncased'

# The Auto* classes select the concrete architecture from the checkpoint name.
# The sequence-classification head is not part of this checkpoint, so it is
# newly initialised on load and has to be fine-tuned before use.
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name, device_map='cuda')

# A tokenizer takes no `device_map`, and `truncate_long_texts` is not a
# tokenizer option; truncation is requested where the tokenizer is called
# (`truncation=True`), so neither keyword is passed here.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
from datasets import Dataset

# Convert the pandas DataFrame to a Hugging Face Dataset. The frame's index is
# no longer contiguous after rows were dropped, so `preserve_index=False`
# keeps it from being stored as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(data, preserve_index=False)

In [41]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of e-mail bodies and attach their labels.

    Args:
        examples: A batch from the dataset; its 'Body' entry is passed to the
            tokenizer and its 'Label' entry is copied to the output.

    Returns:
        The tokenizer output with an additional 'labels' entry.
    """
    # Every body is truncated and padded to the tokenizer's maximum length.
    encoded = tokenizer(examples['Body'], truncation=True, padding='max_length')
    encoded['labels'] = examples['Label']
    return encoded

# Run the tokenizer over the whole dataset, one batch of rows per call.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/3415 [00:00<?, ? examples/s]

In [42]:
# Hold out 20% of the examples for evaluation. A fixed seed makes the shuffle
# (and therefore every metric reported on the test split) reproducible across
# kernel restarts.
train_test_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_dataset['train']
test_dataset = train_test_dataset['test']

In [43]:
# Hyper-parameters and output locations for the fine-tuning run.
training_args = transformers.TrainingArguments(
    # Where checkpoints and logs are written, and how often to log.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Length of the run and optimiser schedule.
    num_train_epochs=2,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
)

In [44]:
# Wire the model, the run configuration and both splits into a Trainer.
trainer = transformers.Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Fine-tune; kept as the last expression so the returned TrainOutput is shown.
trainer.train()

  0%|          | 0/1366 [00:00<?, ?it/s]

{'loss': 0.7026, 'grad_norm': 2.180034637451172, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 0.7081, 'grad_norm': 2.9429116249084473, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.03}
{'loss': 0.6744, 'grad_norm': 2.742413282394409, 'learning_rate': 3e-06, 'epoch': 0.04}
{'loss': 0.6374, 'grad_norm': 2.693326711654663, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.06}
{'loss': 0.6077, 'grad_norm': 2.1169025897979736, 'learning_rate': 5e-06, 'epoch': 0.07}
{'loss': 0.636, 'grad_norm': 2.579866886138916, 'learning_rate': 6e-06, 'epoch': 0.09}
{'loss': 0.5548, 'grad_norm': 3.270555257797241, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.1}
{'loss': 0.52, 'grad_norm': 4.142918586730957, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.12}
{'loss': 0.4627, 'grad_norm': 3.5206491947174072, 'learning_rate': 9e-06, 'epoch': 0.13}
{'loss': 0.4869, 'grad_norm': 2.9806301593780518, 'learning_rate': 1e-05, 'epoch': 0.15}
{'loss': 0.4084, 'grad_norm': 3.182164

TrainOutput(global_step=1366, training_loss=0.14540701415553406, metrics={'train_runtime': 2283.4982, 'train_samples_per_second': 2.393, 'train_steps_per_second': 0.598, 'total_flos': 723801866256384.0, 'train_loss': 0.14540701415553406, 'epoch': 2.0})

In [46]:
# Evaluate on the held-out split. This Trainer was built without
# `compute_metrics`, so only the loss and timing figures are reported.
trainer.evaluate()

  0%|          | 0/86 [00:00<?, ?it/s]

{'eval_loss': 0.07932686060667038,
 'eval_runtime': 60.3278,
 'eval_samples_per_second': 11.321,
 'eval_steps_per_second': 1.426,
 'epoch': 2.0}

In [47]:
# Persist the tokenizer files together with the fine-tuned weights so the
# "email_model" directory can be reloaded on its own. The model is saved last
# so the cell still ends on an expression that returns nothing to display.
tokenizer.save_pretrained("email_model")
model.save_pretrained("email_model")

In [51]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for a binary classifier.

    Args:
        p: Evaluation output exposing `predictions` (per-class scores; the
            arg-max over the last axis is taken as the predicted class) and
            `label_ids` (the reference labels).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    preds = p.predictions.argmax(axis=-1)
    labels = p.label_ids
    accuracy = accuracy_score(labels, preds)
    # zero_division=0 keeps precision/recall/F1 defined (as 0.0) and avoids
    # sklearn's undefined-metric warning when a class is never predicted or
    # never present, which can happen with an untrained model.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Evaluation-only Trainer for the fine-tuned model; unlike a Trainer built
# without `compute_metrics`, it also reports the classification metrics.
trainer = transformers.Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    model=model,
)

tunedresults = trainer.evaluate()


  0%|          | 0/86 [00:00<?, ?it/s]

In [52]:
# Load the checkpoint again, without any fine-tuning, to serve as a baseline.
nonTrainedModel = transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map='cuda',
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for a binary classifier.

    Args:
        p: Evaluation output exposing `predictions` (per-class scores; the
            arg-max over the last axis is taken as the predicted class) and
            `label_ids` (the reference labels).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    preds = p.predictions.argmax(axis=-1)
    labels = p.label_ids
    accuracy = accuracy_score(labels, preds)
    # zero_division=0 keeps precision/recall/F1 defined (as 0.0) and avoids
    # sklearn's undefined-metric warning when a class is never predicted or
    # never present, which can happen with an untrained model.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Evaluation-only Trainer for the baseline (not fine-tuned) model, scored on
# the same test split with the same metrics.
trainer = transformers.Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    model=nonTrainedModel,
)

results = trainer.evaluate()


  0%|          | 0/86 [00:00<?, ?it/s]

The base model was not trained for text classification: its classification head is newly initialized when the checkpoint is loaded, which is why the baseline results are so low.

In [54]:
# Show both metric dictionaries, each under its heading, one item per line.
print(
    "Results of the non-tuned model:",
    results,
    "Results of the tuned model:",
    tunedresults,
    sep="\n",
)

Results of the non-tuned model:
{'eval_loss': 0.7070561051368713, 'eval_model_preparation_time': 0.001, 'eval_accuracy': 0.2342606149341142, 'eval_precision': 0.08478260869565217, 'eval_recall': 0.2765957446808511, 'eval_f1': 0.129783693843594, 'eval_runtime': 86.0916, 'eval_samples_per_second': 7.933, 'eval_steps_per_second': 0.999}
Results of the tuned model:
{'eval_loss': 0.07932686060667038, 'eval_model_preparation_time': 0.002, 'eval_accuracy': 0.9824304538799414, 'eval_precision': 0.9387755102040817, 'eval_recall': 0.9787234042553191, 'eval_f1': 0.9583333333333334, 'eval_runtime': 100.3388, 'eval_samples_per_second': 6.807, 'eval_steps_per_second': 0.857}
