# 03 - Feature Engineering

This notebook covers:
- TF-IDF Vectorization
- BERT Embeddings
- Doc2Vec Embeddings
- Train/test splitting for each method
- Saving the resulting feature sets to disk

## Import Libraries

In [1]:
!pip install nltk



In [2]:
!pip install gensim




In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
import warnings

# Suppress every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')
# Fetch the NLTK 'punkt' tokenizer data; word_tokenize is used later in the
# Doc2Vec section. As the last expression, its return value is displayed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load Preprocessed Data

In [4]:
# Location of the preprocessed phishing-email dataset (Colab sample_data dir).
preprocessed_csv = "/content/sample_data/Phishing_Email_Preprocessed.csv"

# Load the full table and preview the first rows.
df = pd.read_csv(preprocessed_csv)
df.head()

Unnamed: 0,Email Text,Email Type,Email Text processed
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",1,re 6 1100 disc uniformitarianism re 1086 sex l...
1,the other side of * galicismos * * galicismo *...,1,the other side of galicismos galicismo is a sp...
2,re : equistar deal tickets are you still avail...,1,re equistar deal tickets are you still availab...
3,\nHello I am your hot lil horny toy.\n I am...,0,hello i am your hot lil horny toy i am the one...
4,software at incredibly low prices ( 86 % lower...,0,software at incredibly low prices 86 lower dra...


## Feature Engineering Method 1: TF-IDF Vectorizer

In [5]:
# Keep only the rows that actually have processed text; every feature
# extractor below reads the "Email Text processed" column.
df = df[df["Email Text processed"].notna()]


In [6]:
# TF-IDF configuration: drop English stop words, ignore terms that appear in
# fewer than 5 documents, and cap the vocabulary at 5000 features.
tf = TfidfVectorizer(
    stop_words="english",
    max_features=5000,
    min_df=5,
)

# Raw text column; it is vectorized only after the train/test split.
feature_x = df["Email Text processed"]

In [7]:
# Label vector for the TF-IDF pipeline, copied out of the frame as an ndarray.
y_tf = df['Email Type'].to_numpy(copy=True)

### Splitting into Train and Test (TF-IDF)

In [8]:
# Split the raw text (not yet vectorized) 80/20. The settings deliberately
# mirror the Doc2Vec split further down (test_size=0.2, random_state=42,
# stratified on the label) so that the feature sets are evaluated on
# comparable, class-balanced partitions instead of unrelated random splits.
x_full_train, x_full_test, y_full_train, y_full_test = train_test_split(
    feature_x,
    y_tf,
    test_size=0.2,
    random_state=42,
    stratify=y_tf,
)

In [9]:
# Learn the vocabulary and IDF weights from the training texts only, then
# apply that fitted vectorizer to the test texts, so no test-set statistics
# leak into the features.
# NOTE: both names are overwritten in place (text -> TF-IDF matrix), so this
# cell cannot be re-run without first re-running the split cell above.
x_full_train = tf.fit_transform(x_full_train)
x_full_test = tf.transform(x_full_test)

## Feature Engineering Method 2: BERT Embeddings

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import numpy as np
import os

# Inputs for the BERT pipeline: every processed email as a plain Python string,
# plus the matching label array.
emails = [str(text) for text in df["Email Text processed"]]
labels = np.array(df['Email Type'])

def get_bert_cls_embeddings(emails, batch_size=32, device=None):
    """Embed texts with ``bert-base-uncased`` using the first-token ([CLS]) state.

    Args:
        emails: Iterable of strings to embed. A single string is treated as a
            one-element batch rather than being sliced character by character.
        batch_size: Number of texts tokenized and pushed through the model per
            forward pass. Must be at least 1.
        device: ``torch.device`` (or device string) to run on. When ``None``,
            CUDA is used if available, otherwise the CPU.

    Returns:
        ``numpy.ndarray`` with one row per input text, taken from position 0 of
        the model's ``last_hidden_state``. For empty input an array of shape
        ``(0, 768)`` is returned without loading the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Hidden-state width of bert-base-uncased; only needed to shape the empty
    # result, since no model is loaded in that case.
    hidden_size = 768

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Normalize the input so slicing below always yields a list of strings.
    if isinstance(emails, str):
        emails = [emails]
    else:
        emails = list(emails)

    # np.vstack([]) raises, so short-circuit before the expensive model load.
    if not emails:
        return np.empty((0, hidden_size), dtype=np.float32)

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
    bert_model.eval()  # inference mode: disables dropout

    embeddings = []

    for i in tqdm(range(0, len(emails), batch_size), desc="Generating BERT embeddings"):
        batch_texts = emails[i:i + batch_size]
        # Pad to the longest text in the batch and truncate anything longer
        # than 512 tokens.
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = bert_model(**inputs)
            # Position 0 of every sequence is the [CLS] token's hidden state.
            cls_tokens = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_tokens.cpu().numpy())

    return np.vstack(embeddings)

def get_bert_cls_embedding(email, device=None):
    """Embed a single email and return its vector (one row, not a 2-D array).

    Thin wrapper that sends a one-element batch through
    ``get_bert_cls_embeddings`` and unwraps the only row of the result.
    """
    single_batch = get_bert_cls_embeddings([email], batch_size=1, device=device)
    return single_batch[0]

# On-disk cache for the (expensive) BERT embeddings.
embedding_path = "email_embeddings.npy"

email_embeddings = None
if os.path.exists(embedding_path):
    print("✅ Loading precomputed email embeddings...")
    email_embeddings = np.load(embedding_path)
    # A cache written for a different version of the dataset would silently
    # misalign features and labels, so discard it if the row count differs.
    if len(email_embeddings) != len(emails):
        print("Cached embeddings do not match the current dataset; regenerating.")
        email_embeddings = None

if email_embeddings is None:
    print("⚙️  Generating BERT embeddings...")
    email_embeddings = get_bert_cls_embeddings(emails, batch_size=32)
    np.save(embedding_path, email_embeddings)
    print("✅ Saved embeddings to disk.")

# NOTE: the generic names X / y are reassigned by the Doc2Vec section below;
# run the BERT split cell before that section.
X = np.array(email_embeddings)
y = np.array(labels)

### Splitting into Train and Test (BERT)

In [None]:
# 80/20 split of the BERT embeddings. Stratifying on the label matches the
# Doc2Vec split (same test_size, random_state and stratification), so both
# embedding methods are evaluated on class-balanced, comparable partitions.
# NOTE: X and y must still hold the BERT arrays here; the Doc2Vec section
# reuses the same names.
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Sanity check: display (n_train_samples, embedding_dim) for the BERT features.
X_train_bert.shape

## Feature Engineering Method 3: Doc2Vec

In [11]:
import nltk
# Tokenizer data used by word_tokenize in the Doc2Vec cell below.
# 'punkt' was already requested in the import cell, so this call is a no-op
# when that cell has run.
nltk.download('punkt')
# NOTE(review): 'punkt_tab' is presumably what recent NLTK releases look up
# for word_tokenize - confirm against the installed NLTK version.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [12]:
import re
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import nltk

nltk.download('punkt')

# Documents and labels for the Doc2Vec pipeline.
emails = df["Email Text processed"].astype(str).tolist()
labels = np.array(df['Email Type'])

# Tokenize every document exactly once; the same token lists feed both
# training and inference below.
tokenized_emails = [word_tokenize(doc) for doc in emails]

# Each document is tagged with its positional index (as a string).
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(i)])
    for i, tokens in enumerate(tokenized_emails)
]

# NOTE(review): the model is trained on every email, including those that end
# up in the test split later; unlike the TF-IDF vectorizer, it is not fitted
# on training data only. Consider splitting first if strict isolation matters.
# A fixed seed makes runs more repeatable; per the gensim docs, fully
# deterministic training additionally requires workers=1.
model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=40, seed=42)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Infer one 100-dimensional vector per document, in the original row order.
X_vectors = [model.infer_vector(tokens) for tokens in tokenized_emails]
X = np.array(X_vectors)
y = labels

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Splitting into Train and Test (Doc2Vec)

In [13]:
# Stratified 80/20 split of the Doc2Vec vectors (X, y come from the cell above).
doc2vec_split = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train_doc, X_test_doc, y_train_doc, y_test_doc = doc2vec_split

# Report the resulting matrix sizes.
print("Train set shape :", X_train_doc.shape)
print("Test set shape :", X_test_doc.shape)

Train set shape : (14028, 100)
Test set shape : (3508, 100)


## Save Feature Sets

In [15]:
import pickle

# TF-IDF: the sparse train/test matrices, their labels and the fitted
# vectorizer are stored together in a single pickle.
tfidf_bundle = (x_full_train, x_full_test, y_full_train, y_full_test, tf)
with open('tfidf_features.pkl', 'wb') as f:
    pickle.dump(tfidf_bundle, f)

# Doc2Vec: one .npy file per array, written in the listed order.
doc2vec_outputs = {
    'doc2vec_train.npy': X_train_doc,
    'doc2vec_test.npy': X_test_doc,
    'doc2vec_train_labels.npy': y_train_doc,
    'doc2vec_test_labels.npy': y_test_doc,
}
for filename, array in doc2vec_outputs.items():
    np.save(filename, array)

print("All feature sets saved successfully!")

All feature sets saved successfully!


In [None]:
import pickle

# Superset of the previous save cell: rewrites the TF-IDF and Doc2Vec files
# and additionally stores the BERT splits, so the BERT cells must have run.

# TF-IDF: matrices, labels and the fitted vectorizer in a single pickle.
tfidf_bundle = (x_full_train, x_full_test, y_full_train, y_full_test, tf)
with open('tfidf_features.pkl', 'wb') as f:
    pickle.dump(tfidf_bundle, f)

# Dense embedding splits: one .npy file per array, BERT first, then Doc2Vec.
dense_outputs = {
    'bert_train.npy': X_train_bert,
    'bert_test.npy': X_test_bert,
    'bert_train_labels.npy': y_train_bert,
    'bert_test_labels.npy': y_test_bert,
    'doc2vec_train.npy': X_train_doc,
    'doc2vec_test.npy': X_test_doc,
    'doc2vec_train_labels.npy': y_train_doc,
    'doc2vec_test_labels.npy': y_test_doc,
}
for filename, array in dense_outputs.items():
    np.save(filename, array)

print("All feature sets saved successfully!")