<a href="https://colab.research.google.com/github/gptchat12370-ai/DLI/blob/main/2_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git config --global user.email "almnaryb139@gmail.com"
!git config --global user.name "shihab1"

!git clone https://github.com/shihab1/DLI.git
%cd DLI

Cloning into 'DLI'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (6/6), 8.30 KiB | 1.19 MiB/s, done.
/content/DLI


In [1]:
# --- Commit 1: Text cleaning and preprocessing ---

# 1. Setup
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')  # Optional: kept for future use

# 2. Load cleaned dataset from previous step
df = pd.read_csv("data/cleaned_data.csv")

# 3. Merge subject and body into one text column
df['text'] = df['subject'].fillna('') + ' ' + df['body'].fillna('')

# 4. Phishing-aware text cleaner
def clean_text(text):
    text = str(text).lower()

    # Replace URLs with placeholder
    text = re.sub(r'https?://\S+|www\.\S+', ' httpurl ', text)

    # Replace email addresses
    text = re.sub(r'\S+@\S+', ' emailaddr ', text)

    # Replace money patterns
    text = re.sub(r'\$\d+|\d+%', ' moneytoken ', text)

    # Keep useful symbols (emails, links) and remove others
    text = re.sub(r'[^a-zA-Z0-9@:\./\s]', '', text)

    # Normalize extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# 5. Apply cleaning
df['clean_text'] = df['text'].apply(clean_text)

# Adding real-world safe emails to improve model generalization
extra_emails = [
     "Hey Alex, are you free for coffee next week?",
     "Thanks again for your help!",
     "Just checking in. Hope you're doing well!",
     "Can we reschedule our meeting?",
     "Hope everything is going great with you!"
 ]
extra_labels = [0] * len(extra_emails)
df_extra = pd.DataFrame({'subject': '', 'body': extra_emails, 'label': extra_labels})
df_extra['text'] = df_extra['subject'] + ' ' + df_extra['body']
df_extra['clean_text'] = df_extra['text'].apply(clean_text)
df = pd.concat([df, df_extra], ignore_index=True)

print("Cleaned text sample:")
print(df[['text', 'clean_text']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Cleaned text sample:
                                                text  \
0  FW: June 29 -- BNA, Inc. Daily Labor Report Us...   
1  NGX failover plan. Hi Chris, Tonight we are ro...   
2  RE: Intranet Site Rika r these new? -----Origi...   
3  FW: ENA Upstream Company information John/Gera...   
4  New Master Physical Gerald and Stacy - Attache...   

                                          clean_text  
0  fw: june 29 bna inc. daily labor report user i...  
1  ngx failover plan. hi chris tonight we are rol...  
2  re: intranet site rika r these new original me...  
3  fw: ena upstream company information john/gera...  
4  new master physical gerald and stacy attached ...  


In [3]:
# --- Commit 2: Tokenization and sequence preparation ---

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# 6. Fit tokenizer on clean text
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_text'])

# 7. Convert to sequences and pad
sequences = tokenizer.texts_to_sequences(df['clean_text'])
X = pad_sequences(sequences, maxlen=200)

# Save tokenizer
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Sample tokenized sequence:", sequences[0])
print("Padded shape:", X.shape)

Sample tokenized sequence: [162, 754, 447, 4463, 97, 407, 2176, 111, 867, 522, 1, 1, 1, 56, 40, 15, 4463, 3071, 47, 188, 754, 399, 34, 80, 60, 93, 3, 4463, 3071, 46, 754, 447, 4463, 97, 407, 2176, 111, 407, 2176, 111, 3071, 1164, 5, 1675, 754, 447, 34, 8340, 1, 792, 375, 1255, 52, 292, 2, 454, 1202, 5, 90, 2464, 28, 536, 2, 2129, 550, 5659, 65, 71, 3028, 6, 3193, 19, 3045, 69, 8, 6, 122, 375, 2201, 9, 196, 25, 32, 19, 137, 4463, 443, 1955, 25, 1, 2074, 1966, 148, 180, 68, 143, 85, 93, 1499, 3071, 305, 184, 4326, 1, 8833, 845, 12, 9429, 2851, 3039, 6, 345, 1107, 150, 9429, 3039, 1, 1, 4, 9241, 1347, 4, 1, 1554, 4, 612, 298, 29, 3349, 7513, 2176, 2122, 2, 2057, 5, 2961, 4326, 956, 1, 6, 539, 2176, 1955, 489, 845, 1, 1, 1, 1, 517, 6, 97, 715, 8833, 305, 184, 6545, 54, 1, 152, 257, 63, 2, 489, 742, 2, 9429, 2851, 3039, 5, 1, 1, 1, 1452, 517, 6, 97, 1, 17, 468, 3209, 5, 88, 368, 218, 2, 539, 2176, 1955, 414, 2, 489, 3582, 13, 6, 586, 4669, 16, 742, 3, 1690, 622, 1, 468, 7, 1, 2910, 896, 3, 

In [4]:
# --- Commit 3: Label preparation and saving features ---

# 8. Labels
y = df['label'].astype(int).values

# 9. Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 10. Save for model training
np.save("data/X_train.npy", X_train)
np.save("data/X_test.npy", X_test)
np.save("data/y_train.npy", y_train)
np.save("data/y_test.npy", y_test)

print("✅ Tokenization & features saved:")
print("X_train shape:", X_train.shape)
print("y_train distribution:", np.bincount(y_train))

✅ Tokenization & features saved:
X_train shape: (41015, 200)
y_train distribution: [24407 16608]
