In [1]:
pip install -q transformers datasets evaluate torch

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

In [3]:
# Load the dataset from Hugging Face
dataset = load_dataset("imanoop7/phishing_url_classification")

In [4]:
# Check available splits and create train/validation/test splits if necessary
if "train" not in dataset or "validation" not in dataset or "test" not in dataset:
    # Combine all available data
    combined_data = dataset["train"] if "train" in dataset else dataset[list(dataset.keys())[0]]

    # Create train/validation/test splits
    splits = combined_data.train_test_split(test_size=0.2, seed=42)
    train_valid = splits["train"]
    test = splits["test"]
    splits = train_valid.train_test_split(test_size=0.1, seed=42)
    train = splits["train"]
    validation = splits["test"]

    dataset_dict = {
        "train": train,
        "validation": validation,
        "test": test
    }
else:
    dataset_dict = dataset

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 100000
    })
})

In [6]:
dataset_dict

{'train': Dataset({
     features: ['text', 'label'],
     num_rows: 72000
 }),
 'validation': Dataset({
     features: ['text', 'label'],
     num_rows: 8000
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 20000
 })}

In [7]:
dataset_dict['train'][0]

{'text': 'http://twitterfacebook.net/profile', 'label': 1}

In [8]:
# Define pre-trained model path
model_path = "bert-base-uncased"

# Load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Load model with binary classification head
id2label = {0: "Safe", 1: "Not Safe"}
label2id = {"Safe": 0, "Not Safe": 1}
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Freeze base model parameters and unfreeze pooling layers
for name, param in model.base_model.named_parameters():
    param.requires_grad = False
    if "pooler" in name:
        param.requires_grad = True

In [10]:
# Define text preprocessing function
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [12]:
# Preprocess all datasets
tokenized_data = {}
for split, dataset in dataset_dict.items():
    tokenized_data[split] = dataset.map(preprocess_function, batched=True)

# Create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

Map:   0%|          | 0/72000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [13]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)
    positive_class_probs = probabilities[:, 1]
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'], 3)
    predicted_classes = np.argmax(predictions, axis=1)
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'], 3)
    return {"Accuracy": acc, "AUC": auc}

In [14]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="bert-phishing-classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [15]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [16]:

# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Auc
1,0.0033,0.00013,1.0,1.0
2,0.0009,9e-06,1.0,1.0
3,0.0005,2e-06,1.0,1.0
4,0.0003,2e-06,1.0,1.0
5,0.0003,1e-06,1.0,1.0


TrainOutput(global_step=22500, training_loss=0.011802440856562721, metrics={'train_runtime': 622.6083, 'train_samples_per_second': 578.213, 'train_steps_per_second': 36.138, 'total_flos': 2758694748788160.0, 'train_loss': 0.011802440856562721, 'epoch': 5.0})

In [17]:
# Evaluate on test dataset
print("Evaluating on test dataset...")
test_results = trainer.evaluate(tokenized_data["test"])
print("Test results:")
print(test_results)

Evaluating on test dataset...


Test results:
{'eval_loss': 1.0388200735178543e-06, 'eval_Accuracy': 1.0, 'eval_AUC': 1.0, 'eval_runtime': 25.0431, 'eval_samples_per_second': 798.625, 'eval_steps_per_second': 49.914, 'epoch': 5.0}
