In [None]:
# Step 1: Install Required Libraries
!pip install transformers datasets scikit-learn

# Step 2: Upload the Dataset
from google.colab import files
import pandas as pd
import io

# Upload the dataset
uploaded = files.upload()

# Load the dataset
file_name = list(uploaded.keys())[0]  # Get the name of the uploaded file
try:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))
except UnicodeDecodeError:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]), encoding='latin1') #if you have encoding issues

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

# Display the column names
print("\nColumn names in the dataset:")
print(df.columns)

# Step 3: Preprocess the Data
# Adjust the column names based on the actual dataset
# Replace "Email Text" and "Email Type" with the correct column names from your dataset
# Example:
if "Email Text" in df.columns and "Email Type" in df.columns:
    df = df.rename(columns={"Email Text": "text", "Email Type": "label"})
elif "text" in df.columns and "label" in df.columns:
    pass #columns already named correctly
else:
    print("Error: Please make sure your dataframe has columns named 'Email Text' and 'Email Type' or 'text' and 'label'")
    print("or change the column names in the code to match your data.")
    exit()

# Convert labels to binary format (0 for "Safe Email", 1 for "Phishing Email")
if isinstance(df['label'].iloc[0], str): # check if label is string
    df["label"] = df["label"].map({"Safe Email": 0, "Phishing Email": 1})
elif isinstance(df['label'].iloc[0], int):
    pass #labels are already integers.
else:
    print("Error: Label column should contain strings or integers")
    exit()

# Keep only relevant columns
df = df[["text", "label"]]

# Display the first few rows of the preprocessed dataset
print("\nFirst few rows of the preprocessed dataset:")
print(df.head())

# Step 4: Split the Dataset
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=42)

# Convert pandas DataFrames to Hugging Face Dataset objects
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Step 5: Tokenize the Data
from transformers import AutoTokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    # Ensure the input is a list of strings
    texts = [str(text) for text in examples["text"]]  # Convert to strings
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

# Apply tokenization to the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=16)
val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=16)
test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=16)

# Step 6: Load a Pre-trained LLM for Sequence Classification
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Step 7: Define Training Arguments
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="f1", #add this line
    load_best_model_at_end=True #add this line
)

# Step 8: Define Metrics for Evaluation
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    try:
        auc = roc_auc_score(labels, logits[:, 1])  # Use the probability of the positive class
    except ValueError:
        auc = 0.5 # if only one label is predicted, auc is undefined
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc": auc,
    }

# Step 9: Train the Model
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Step 10: Evaluate the Model on the Test Set
test_results = trainer.evaluate(test_dataset)
print("Test Results:", test_results)

# Step 11: Save the Model and Tokenizer
model.save_pretrained("./phishing_detection_model")
tokenizer.save_pretrained("./phishing_detection_tokenizer")

# Step 12: Load the Model for Inference
from transformers import AutoModelForSequenceClassification, AutoTokenizer
loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_detection_model")
loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_detection_tokenizer")

# Example Inference
def predict_phishing(email_text):
    inputs = loaded_tokenizer(email_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = loaded_model(**inputs)
    probs = outputs.logits.softmax(dim=-1)
    return "Phishing" if probs.argmax().item() == 1 else "Legitimate"

# Test the function with a sample email
sample_email = "Congratulations! You've won a $1000 gift card. Click here to claim your prize."
print("Prediction:", predict_phishing(sample_email))


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

Saving Phishing_Email 4.csv to Phishing_Email 4.csv
First few rows of the dataset:
    sn                                         Email Text      Email Type
0  0.0  re : 6 . 1100 , disc : uniformitarianism , re ...      Safe Email
1  1.0  the other side of * galicismos * * galicismo *...      Safe Email
2  2.0  re : equistar deal tickets are you still avail...      Safe Email
3  3.0  \nHello I am your hot lil horny toy.\n    I am...  Phishing Email
4  4.0  software at incredibly low prices ( 86 % lower...  Phishing Email

Column names in the dataset:
Index(['sn', 'Email Text', 'Email Type'], dtype='object')

First few rows of the preprocessed dataset:
                                                text  label
0  re : 6 . 1100 , disc : uniformitarianism , re ...    0.0
1  the other side of * galicismos * * galicismo *...    0.0
2  re : equistar deal tickets are you still avail...    0.0
3  \nHello I am your hot lil horny toy.\n    I am...    1.0
4  software at incredibly low prices ( 8

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2501 [00:00<?, ? examples/s]

Map:   0%|          | 0/442 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mteamnagid[0m ([33mteamnagid-teamscribe-ng[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


ValueError: Target size (torch.Size([16])) must be the same as input size (torch.Size([16, 2]))

In [None]:
# Step 1: Install Required Libraries
!pip install transformers datasets scikit-learn

# Step 2: Upload the Dataset
from google.colab import files
import pandas as pd
import io
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

# Upload the dataset
uploaded = files.upload()

# Load the dataset
file_name = list(uploaded.keys())[0]
try:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))
except UnicodeDecodeError:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]), encoding='latin1')

# Preprocessing
if "Email Text" in df.columns and "Email Type" in df.columns:
    df = df.rename(columns={"Email Text": "text", "Email Type": "label"})
elif "text" in df.columns and "label" in df.columns:
    pass
else:
    print("Error: Please check column names.")
    exit()

if isinstance(df['label'].iloc[0], str):
    df["label"] = df["label"].map({"Safe Email": 0, "Phishing Email": 1})
elif isinstance(df['label'].iloc[0], int):
    pass
elif isinstance(df['label'].iloc[0], float):
    df["label"] = df["label"].astype(int)
else:
    print("Error: Label column should contain strings or integers")
    exit()

df = df[["text", "label"]]

# Split the Dataset
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize the Data
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    texts = [str(text) for text in examples["text"]]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=16)
val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=16)
test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=16)

# Load Pre-trained LLM
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    try:
        auc = roc_auc_score(labels, logits[:, 1])
    except ValueError:
        auc = 0.5
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc": auc,
    }

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate on Test Set
test_results = trainer.evaluate(test_dataset)
print("Test Results:", test_results)

# Save Model
model.save_pretrained("./phishing_detection_model")
tokenizer.save_pretrained("./phishing_detection_tokenizer")

# Load Model for Inference
loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_detection_model")
loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_detection_tokenizer")

def predict_phishing(email_text):
    inputs = loaded_tokenizer(email_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = loaded_model(**inputs)
    probs = outputs.logits.softmax(dim=-1)
    return "Phishing" if probs.argmax().item() == 1 else "Legitimate"

sample_email = "Congratulations! You've won a $1000 gift card. Click here to claim your prize."
print("Prediction:", predict_phishing(sample_email))

# Extract and Print Tables of Results
print("\nTraining Results:")
training_metrics = trainer.state.log_history
for entry in training_metrics:
    if "eval_loss" in entry:
        print(entry)

print("\nTest Results Table:")
test_table = pd.DataFrame([test_results])
print(test_table)

# Create a sample table from the training history.
eval_results = []
for entry in training_metrics:
  if 'eval_loss' in entry:
    eval_results.append(entry)

if eval_results:
  eval_df = pd.DataFrame(eval_results)
  print("\nEvaluation Results During Training:")
  print(eval_df)

else:
  print("\nNo Evaluation Results During Training to display as a table.")



Saving Phishing_Email 4.csv to Phishing_Email 4 (2).csv


Map:   0%|          | 0/2501 [00:00<?, ? examples/s]

Map:   0%|          | 0/442 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 1: Install Required Libraries
!pip install transformers datasets scikit-learn

# Step 2: Upload the Dataset
from google.colab import files
import pandas as pd
import io
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

# Upload the dataset
uploaded = files.upload()

# Load the dataset
file_name = list(uploaded.keys())[0]
try:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))
except UnicodeDecodeError:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]), encoding='latin1')

# Preprocessing
if "Email Text" in df.columns and "Email Type" in df.columns:
    df = df.rename(columns={"Email Text": "text", "Email Type": "label"})
elif "text" in df.columns and "label" in df.columns:
    pass
else:
    print("Error: Please check column names.")
    exit()

if isinstance(df['label'].iloc[0], str):
    df["label"] = df["label"].map({"Safe Email": 0, "Phishing Email": 1})
elif isinstance(df['label'].iloc[0], int):
    pass
elif isinstance(df['label'].iloc[0], float):
    df["label"] = df["label"].astype(int)
else:
    print("Error: Label column should contain strings or integers")
    exit()

df = df[["text", "label"]]

# Split the Dataset
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize the Data
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    texts = [str(text) for text in examples["text"]]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=16)
val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=16)
test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=16)

# Load Pre-trained LLM
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    try:
        auc = roc_auc_score(labels, logits[:, 1])
    except ValueError:
        auc = 0.5
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc": auc,
    }

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate on Test Set
test_results = trainer.evaluate(test_dataset)
print("Test Results:", test_results)

# Save Model
model.save_pretrained("./phishing_detection_model")
tokenizer.save_pretrained("./phishing_detection_tokenizer")

# Load Model for Inference
loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_detection_model")
loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_detection_tokenizer")

def predict_phishing(email_text):
    inputs = loaded_tokenizer(email_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = loaded_model(**inputs)
    probs = outputs.logits.softmax(dim=-1)
    return "Phishing" if probs.argmax().item() == 1 else "Legitimate"

sample_email = "Congratulations! You've won a $1000 gift card. Click here to claim your prize."
print("Prediction:", predict_phishing(sample_email))

# Extract and Print Tables of Results
print("\nTraining Results:")
training_metrics = trainer.state.log_history
for entry in training_metrics:
    if "eval_loss" in entry:
        print(entry)

print("\nTest Results Table:")
test_table = pd.DataFrame([test_results])
print(test_table)

# Create a sample table from the training history.
eval_results = []
for entry in training_metrics:
  if 'eval_loss' in entry:
    eval_results.append(entry)

if eval_results:
  eval_df = pd.DataFrame(eval_results)
  print("\nEvaluation Results During Training:")
  print(eval_df)

else:
  print("\nNo Evaluation Results During Training to display as a table.")



Saving Phishing_Email 4.csv to Phishing_Email 4 (1).csv


Map:   0%|          | 0/2501 [00:00<?, ? examples/s]

Map:   0%|          | 0/442 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Target size (torch.Size([16])) must be the same as input size (torch.Size([16, 2]))

In [None]:
# Step 1: Install Required Libraries
!pip install transformers datasets scikit-learn

# Step 2: Upload the Dataset
from google.colab import files
import pandas as pd
import io
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

# Upload the dataset
uploaded = files.upload()

# Load the dataset
file_name = list(uploaded.keys())[0]
try:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))
except UnicodeDecodeError:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]), encoding='latin1')

# Preprocessing
if "Email Text" in df.columns and "Email Type" in df.columns:
    df = df.rename(columns={"Email Text": "text", "Email Type": "label"})
elif "text" in df.columns and "label" in df.columns:
    pass
else:
    print("Error: Please check column names.")
    exit()

if isinstance(df['label'].iloc[0], str):
    df["label"] = df["label"].map({"Safe Email": 0, "Phishing Email": 1})
elif isinstance(df['label'].iloc[0], int):
    pass
elif isinstance(df['label'].iloc[0], float):
    df["label"] = df["label"].astype(int)
else:
    print("Error: Label column should contain strings or integers")
    exit()

df = df[["text", "label"]]

# Split the Dataset
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize the Data
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    texts = [str(text) for text in examples["text"]]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=16)
val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=16)
test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=16)

# Load Pre-trained LLM
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    eval_strategy="epoch" #Replaced evaluation_strategy with eval_strategy
)

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    try:
        auc = roc_auc_score(labels, logits[:, 1])
    except ValueError:
        auc = 0.5
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc": auc,
    }

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate on Test Set
test_results = trainer.evaluate(test_dataset)
print("Test Results:", test_results)

# Save Model
model.save_pretrained("./phishing_detection_model")
tokenizer.save_pretrained("./phishing_detection_tokenizer")

# Load Model for Inference
loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_detection_model")
loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_detection_tokenizer")

def predict_phishing(email_text):
    inputs = loaded_tokenizer(email_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = loaded_model(**inputs)
    probs = outputs.logits.softmax(dim=-1)
    return "Phishing" if probs.argmax().item() == 1 else "Legitimate"

sample_email = "Congratulations! You've won a $1000 gift card. Click here to claim your prize."
print("Prediction:", predict_phishing(sample_email))

# Extract and Print Tables of Results
print("\nTraining Results:")
training_metrics = trainer.state.log_history
for entry in training_metrics:
    if "eval_loss" in entry:
        print(entry)

print("\nTest Results Table:")
test_table = pd.DataFrame([test_results])
print(test_table)

# Create a sample table from the training history.
eval_results = []
for entry in training_metrics:
  if 'eval_loss' in entry:
    eval_results.append(entry)

if eval_results:
  eval_df = pd.DataFrame(eval_results)
  print("\nEvaluation Results During Training:")
  print(eval_df)

else:
  print("\nNo Evaluation Results During Training to display as a table.")

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

Saving Phishing_Email 4.csv to Phishing_Email 4.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2501 [00:00<?, ? examples/s]

Map:   0%|          | 0/442 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mteamnagid[0m ([33mteamnagid-teamscribe-ng[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


ValueError: Target size (torch.Size([16])) must be the same as input size (torch.Size([16, 2]))

In [None]:
# prompt: GIVE FULL VIEW OF DATASET

print(df.head())
print(df.info())
print(df.describe())


                                                text  label
0  re : 6 . 1100 , disc : uniformitarianism , re ...    0.0
1  the other side of * galicismos * * galicismo *...    0.0
2  re : equistar deal tickets are you still avail...    0.0
3  \nHello I am your hot lil horny toy.\n    I am...    1.0
4  software at incredibly low prices ( 86 % lower...    1.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3463 entries, 0 to 3462
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    3457 non-null   object 
 1   label   3455 non-null   float64
dtypes: float64(1), object(1)
memory usage: 54.2+ KB
None
             label
count  3455.000000
mean      0.402026
std       0.490378
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000


In [3]:
# Step 1: Install Required Libraries
!pip install transformers datasets scikit-learn

# Step 2: Upload the Dataset
from google.colab import files
import pandas as pd
import io
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

# Upload the dataset
uploaded = files.upload()

# Load the dataset
file_name = list(uploaded.keys())[0]
try:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))
except UnicodeDecodeError:
    df = pd.read_csv(io.BytesIO(uploaded[file_name]), encoding='latin1')

# Preprocessing
# Drop rows with missing values
df.dropna(inplace=True)

# Convert labels to integers
df['label'] = df['label'].astype(int)

# Rename Columns if needed.
if "text" not in df.columns or "label" not in df.columns:
  print("Error: DataFrame must have 'text' and 'label' columns.")
  exit()

# Split the Dataset
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize the Data
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    texts = [str(text) for text in examples["text"]]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=16)
val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=16)
test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=16)

# Load Pre-trained LLM
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    eval_strategy="epoch"
)

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    try:
        auc = roc_auc_score(labels, logits[:, 1])
    except ValueError:
        auc = 0.5
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc": auc,
    }

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate on Test Set
test_results = trainer.evaluate(test_dataset)
print("Test Results:", test_results)

# Save Model
model.save_pretrained("./phishing_detection_model")
tokenizer.save_pretrained("./phishing_detection_tokenizer")

# Load Model for Inference
loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_detection_model")
loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_detection_tokenizer")

def predict_phishing(email_text):
    inputs = loaded_tokenizer(email_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = loaded_model(**inputs)
    probs = outputs.logits.softmax(dim=-1)
    return "Phishing" if probs.argmax().item() == 1 else "Legitimate"

sample_email = "Congratulations! You've won a $1000 gift card. Click here to claim your prize."
print("Prediction:", predict_phishing(sample_email))

# Extract and Print Tables of Results
print("\nTraining Results:")
training_metrics = trainer.state.log_history
for entry in training_metrics:
    if "eval_loss" in entry:
        print(entry)

print("\nTest Results Table:")
test_table = pd.DataFrame([test_results])
print(test_table)

# Create a sample table from the training history.
eval_results = []
for entry in training_metrics:
  if 'eval_loss' in entry:
    eval_results.append(entry)

if eval_results:
  eval_df = pd.DataFrame(eval_results)
  print("\nEvaluation Results During Training:")
  print(eval_df)

else:
  print("\nNo Evaluation Results During Training to display as a table.")

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

Saving Phishing_Email 4.csv to Phishing_Email 4.csv


KeyError: 'label'

In [4]:
print(df.columns)

Index(['sn', 'Email Text', 'Email Type'], dtype='object')


In [5]:
# Rename 'Email Type' to 'label'
df = df.rename(columns={'Email Type': 'label', 'Email Text': 'text'})

# Drop the 'sn' column, as it's likely an index column.
df = df.drop('sn', axis=1)

# Convert labels to integers (if needed)
if df['label'].dtype == 'object':
    df["label"] = df["label"].map({"Safe Email": 0, "Phishing Email": 1})
elif df['label'].dtype == 'float64':
    df['label'] = df['label'].astype(int)

# Inspect column names after renaming
print("Columns after renaming:", df.columns)

# Preprocessing
# Drop rows with missing values
df.dropna(inplace=True)

# Convert labels to integers
df['label'] = df['label'].astype(int)

# Rename Columns if needed.
if "text" not in df.columns or "label" not in df.columns:
  print("Error: DataFrame must have 'text' and 'label' columns.")
  exit()

# Split the Dataset
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize the Data
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    texts = [str(text) for text in examples["text"]]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=16)
val_dataset = val_dataset.map(tokenize_function, batched=True, batch_size=16)
test_dataset = test_dataset.map(tokenize_function, batched=True, batch_size=16)

# Load Pre-trained LLM
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    eval_strategy="epoch"
)

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    try:
        auc = roc_auc_score(labels, logits[:, 1])
    except ValueError:
        auc = 0.5
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc": auc,
    }

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate on Test Set
test_results = trainer.evaluate(test_dataset)
print("Test Results:", test_results)

# Save Model
model.save_pretrained("./phishing_detection_model")
tokenizer.save_pretrained("./phishing_detection_tokenizer")

# Load Model for Inference
loaded_model = AutoModelForSequenceClassification.from_pretrained("./phishing_detection_model")
loaded_tokenizer = AutoTokenizer.from_pretrained("./phishing_detection_tokenizer")

def predict_phishing(email_text):
    inputs = loaded_tokenizer(email_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = loaded_model(**inputs)
    probs = outputs.logits.softmax(dim=-1)
    return "Phishing" if probs.argmax().item() == 1 else "Legitimate"

sample_email = "Congratulations! You've won a $1000 gift card. Click here to claim your prize."
print("Prediction:", predict_phishing(sample_email))

# Extract and Print Tables of Results
print("\nTraining Results:")
training_metrics = trainer.state.log_history
for entry in training_metrics:
    if "eval_loss" in entry:
        print(entry)

print("\nTest Results Table:")
test_table = pd.DataFrame([test_results])
print(test_table)

# Create a sample table from the training history.
eval_results = []
for entry in training_metrics:
  if 'eval_loss' in entry:
    eval_results.append(entry)

if eval_results:
  eval_df = pd.DataFrame(eval_results)
  print("\nEvaluation Results During Training:")
  print(eval_df)

else:
  print("\nNo Evaluation Results During Training to display as a table.")

Columns after renaming: Index(['text', 'label'], dtype='object')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2495 [00:00<?, ? examples/s]

Map:   0%|          | 0/441 [00:00<?, ? examples/s]

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mteamnagid[0m ([33mteamnagid-teamscribe-ng[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.2692,0.142378,0.93424,0.981818,0.861702,0.917847,0.991779
2,0.0245,0.145253,0.950113,0.951087,0.930851,0.94086,0.993503
3,0.0078,0.109405,0.954649,0.946809,0.946809,0.946809,0.995206


Test Results: {'eval_loss': 0.1341036707162857, 'eval_accuracy': 0.9595375722543352, 'eval_precision': 0.9439252336448598, 'eval_recall': 0.957345971563981, 'eval_f1': 0.9505882352941176, 'eval_auc': 0.993537268418785, 'eval_runtime': 557.137, 'eval_samples_per_second': 0.932, 'eval_steps_per_second': 0.059, 'epoch': 3.0}
Prediction: Phishing

Training Results:
{'eval_loss': 0.14237776398658752, 'eval_accuracy': 0.9342403628117913, 'eval_precision': 0.9818181818181818, 'eval_recall': 0.8617021276595744, 'eval_f1': 0.9178470254957507, 'eval_auc': 0.991779497098646, 'eval_runtime': 479.1691, 'eval_samples_per_second': 0.92, 'eval_steps_per_second': 0.058, 'epoch': 1.0, 'step': 156}
{'eval_loss': 0.1452527791261673, 'eval_accuracy': 0.9501133786848073, 'eval_precision': 0.9510869565217391, 'eval_recall': 0.9308510638297872, 'eval_f1': 0.9408602150537635, 'eval_auc': 0.9935034900344799, 'eval_runtime': 475.2998, 'eval_samples_per_second': 0.928, 'eval_steps_per_second': 0.059, 'epoch': 2.0