# LLM Model for Phishing E-mail Detection

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

### Step 1: Importing Data and First Look

In [2]:
phishing_df = pd.read_csv('phishing_email.csv')
phishing_df

Unnamed: 0,text_combined,label
0,hpl nom may 25 2001 see attached file hplno 52...,0
1,nom actual vols 24 th forwarded sabrae zajac h...,0
2,enron actuals march 30 april 1 201 estimated a...,0
3,hpl nom may 30 2001 see attached file hplno 53...,0
4,hpl nom june 1 2001 see attached file hplno 60...,0
...,...,...
82481,info advantageapartmentscom infoadvantageapart...,1
82482,monkeyorg helpdeskmonkeyorg monkeyorg hi josep...,1
82483,help center infohelpcentercoza_infohelpcenterc...,1
82484,metamask infosofamekarcom verify metamask wall...,1


In [3]:
# Checking if the gpu is available
print(torch.cuda.is_available())  # Should Return True

True


### Step 2: Splitting Data

In [None]:
# Split into train and test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    phishing_df['text_combined'].tolist(), phishing_df['label'].tolist(), test_size=0.2, random_state=42)

### Step 3: Tokenization and Dataset Creation with Hugging Face

In [None]:
# BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)


In [None]:
# Create Hugging Face datasets
dataset_train = Dataset.from_dict({'text': train_texts, 'label': train_labels}).map(lambda e: tokenize_function(e['text']), batched=True)
dataset_test = Dataset.from_dict({'text': test_texts, 'label': test_labels}).map(lambda e: tokenize_function(e['text']), batched=True)

Map: 100%|██████████| 65988/65988 [10:57<00:00, 100.41 examples/s]
Map: 100%|██████████| 16498/16498 [02:13<00:00, 123.79 examples/s]


### Step 4: BERT Model for Classification and Training Setup

In [None]:
# BERT model for classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training setup
training_args = TrainingArguments(
    output_dir='/results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='/logs',
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch import cuda

# Define if the GPU is available
device = "cuda" if cuda.is_available() else "cpu"

# Move the model to the GPU (if available)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
print(model.device)

cuda:0


The output cuda:0 indicates that the model is running on the first available GPU. This means that the model has been successfully moved to the GPU and is using the cuda device instead of the cpu.

### Step 5: Model Training with Hugging Face Trainer

In [None]:
# Initialize the Trainer with model, training arguments, and datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
)

In [None]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.029,0.035074


TrainOutput(global_step=8249, training_loss=0.0812879099471885, metrics={'train_runtime': 56756.6513, 'train_samples_per_second': 1.163, 'train_steps_per_second': 0.145, 'total_flos': 1.736217232109568e+16, 'train_loss': 0.0812879099471885, 'epoch': 1.0})

### Step 6: Evaluation of Results

In [None]:
# Evaluation
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.0350743792951107, 'eval_runtime': 1510.7908, 'eval_samples_per_second': 10.92, 'eval_steps_per_second': 1.366, 'epoch': 1.0}


In [None]:
import numpy as np

# Predictions on the test set
predictions = trainer.predict(dataset_test)
pred_labels = np.argmax(predictions.predictions, axis=1)

# True labels
true_labels = predictions.label_ids

In [16]:
from sklearn.metrics import classification_report

# Classification report
report = classification_report(true_labels, pred_labels, target_names=['Classe 0', 'Classe 1'])
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

    Classe 0       1.00      0.99      0.99      7935
    Classe 1       0.99      1.00      0.99      8563

    accuracy                           0.99     16498
   macro avg       0.99      0.99      0.99     16498
weighted avg       0.99      0.99      0.99     16498

