# Load Data: Phishing Mail
link https://huggingface.co/datasets/zefang-liu/phishing-email-dataset


In [5]:
import pandas as pd

df = pd.read_csv("hf://datasets/zefang-liu/phishing-email-dataset/Phishing_Email.csv")

In [6]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18650 entries, 0 to 18649
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  18650 non-null  int64 
 1   Email Text  18634 non-null  object
 2   Email Type  18650 non-null  object
dtypes: int64(1), object(2)
memory usage: 437.2+ KB


# Preprocesing data

In [7]:
df['Email Type'] = df['Email Type'].map({'Safe Email' :1 ,'Phishing Email' : 0})
df.head(5)

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",1
1,1,the other side of * galicismos * * galicismo *...,1
2,2,re : equistar deal tickets are you still avail...,1
3,3,\nHello I am your hot lil horny toy.\n I am...,0
4,4,software at incredibly low prices ( 86 % lower...,0


In [8]:
# Rename Columns
df.rename(columns= {'Email Type':'target','Email Text':'text'}, inplace=True)

# Drop first column
df.drop(columns= 'Unnamed: 0' , inplace=True,axis=1)

# Check phishing email
df.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,11322
0,7328


In [9]:
# Check for Missing (NaN) Values
missing_count = df['text'].isna().sum()
print(f"Missing (NaN) values: {missing_count}")


Missing (NaN) values: 16


In [10]:
# Print NaN values
nan_rows = df[df['text'].isna()]
print(nan_rows)

      text  target
31     NaN       0
387    NaN       0
1883   NaN       0
2049   NaN       0
2451   NaN       0
2972   NaN       0
3627   NaN       0
3806   NaN       0
5763   NaN       0
6299   NaN       0
6821   NaN       0
8594   NaN       0
9999   NaN       0
11069  NaN       0
11320  NaN       0
13843  NaN       0


In [11]:
# Drop NaN
df = df.dropna(subset=['text'])

In [12]:
# Check for Truly Empty Strings
empty_string_count = (df['text'] == '').sum()
print(f"Empty strings (''): {empty_string_count}")

Empty strings (''): 0


In [13]:
# Check for Strings with Only Whitespace
whitespace_count = df['text'].str.strip().eq('').sum()
print(f"Whitespace-only strings: {whitespace_count}")


Whitespace-only strings: 3


In [14]:
# Check for Placeholder Text (like 'empty')
placeholder_count = (df['text'].str.lower() == 'empty').sum()
print(f"Rows with 'empty' as placeholder: {placeholder_count}")


Rows with 'empty' as placeholder: 533


In [15]:
empty_placeholder_rows = df[df['text'].str.lower() == 'empty']
print(empty_placeholder_rows)


        text  target
54     empty       0
106    empty       1
130    empty       0
244    empty       0
305    empty       0
...      ...     ...
18607  empty       1
18626  empty       1
18637  empty       0
18643  empty       1
18649  empty       0

[533 rows x 2 columns]


In [16]:
# Drop empty values
df = df[df['text'].str.lower() != 'empty']


In [17]:
import re

def clean_email_text(text):
    if isinstance(text, str):
        # Normalize whitespace characters
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        text = re.sub(r'\s+', ' ', text)

        # Mask email addresses
        text = re.sub(r'\b[\w.-]+?@\w+?\.\w+?\b', ' [EMAIL] ', text)

        # Mask URLs
        text = re.sub(r'https?://\S+|www\.\S+', ' [URL] ', text)

        # Replace long numbers (IDs or phone numbers)
        text = re.sub(r'\d{5,}', ' [NUM] ', text)

        # Remove repeated punctuation like "!!!" or "??"
        text = re.sub(r'([!?.]){2,}', r'\1', text)

        # Remove unwanted characters but keep certain punctuations
        allowed_chars = r'[^\w\s\[\].,!?@-]'
        text = re.sub(allowed_chars, '', text)

        # Convert to lowercase and trim
        return text.strip().lower()

    return ""

# Apply the function
df['clean_text'] = df['text'].map(clean_email_text)


In [18]:
# df['text'] = df['text'].astype(str).str.slice(0, 512)

# BERT Evaluation

In [19]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from tqdm import tqdm

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
model.eval()

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")
# Batched embedding function
def get_embeddings_batched(texts, batch_size=32):
    all_embeddings = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]

        # Tokenize with padding and truncation
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Mean pool across tokens for each sequence
        last_hidden = outputs.last_hidden_state  # shape: [batch_size, seq_len, hidden_size]
        batch_embeddings = last_hidden.mean(dim=1).cpu().numpy()  # shape: [batch_size, hidden_size]

        all_embeddings.append(batch_embeddings)

    return np.vstack(all_embeddings)

# Apply to your DataFrame
texts = df['clean_text'].tolist()
embeddings = get_embeddings_batched(texts, batch_size=32)


Using device: cuda


100%|██████████| 566/566 [09:06<00:00,  1.04it/s]


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


X_train, X_test, y_train, y_test = train_test_split(embeddings, df['target'], test_size=0.3, stratify=df['target'], random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.93      0.95      2093
           1       0.96      0.98      0.97      3338

    accuracy                           0.96      5431
   macro avg       0.96      0.96      0.96      5431
weighted avg       0.96      0.96      0.96      5431



# Transformer all-MiniLM-L6-v2

In [21]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # fast + good quality
embeddings = model.encode(df['clean_text'].tolist(), show_progress_bar=True)


Batches:   0%|          | 0/566 [00:00<?, ?it/s]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(embeddings, df['target'], test_size=0.3, stratify=df['target'], random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.87      0.92      2093
           1       0.92      0.99      0.95      3338

    accuracy                           0.94      5431
   macro avg       0.95      0.93      0.94      5431
weighted avg       0.94      0.94      0.94      5431



Questions:


1.   I have 512 max lenght characters
2.   I have random forest, should I use other one?





