In [112]:
import math

import numpy as np
import pandas as pd
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from datasets import Dataset

In [113]:
# import torch.cuda

# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device

## Reading in Data

In [114]:
# Read the full labelled URL dataset from the working directory and preview it.
original_df = pd.read_csv(filepath_or_buffer="malicious_phish.csv")
original_df.head(n=5)

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [115]:
# Number of rows in the full dataset.
original_df.shape[0]

651191

In [116]:
# First five raw URL strings (positional slice of the `url` column).
original_df["url"].iloc[:5]

0                                     br-icloud.com.br
1                  mp3raid.com/music/krizz_kaliko.html
2                      bopsecrets.org/rexroth/cr/1.htm
3    http://www.garage-pirenne.be/index.php?option=...
4    http://adventure-nicaragua.net/index.php?optio...
Name: url, dtype: object

## Sampling the Data

In [117]:
# PERCENTAGE = 0.005
# df = original_df.sample(frac=PERCENTAGE, replace=False, random_state=42)

In [118]:
# Draw NUM_SAMPLES rows without replacement; the fixed random_state makes the
# draw repeatable across runs.
NUM_SAMPLES = 1000
df = original_df.sample(
    n=NUM_SAMPLES,
    random_state=42,
    replace=False,
)

In [119]:
# (rows, columns) of the sampled frame.
df.shape

(1000, 2)

## BERT Setup

In [120]:
# Local checkpoint directory; passed to from_pretrained for both the tokenizer
# and the classification model.
MODEL = "./bert-base-uncased/"

In [121]:
# Metric name intended for evaluate.load().
# NOTE(review): currently unused, because the evaluate.load(METRIC) call is commented out.
METRIC = "accuracy"

In [122]:
# Load the tokenizer that matches the checkpoint in MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [123]:
# Size the classification head from the sampled data rather than a hard-coded 4,
# so the number of logits always matches the label ids produced when the `type`
# column is class-encoded (one id per distinct value in the sample).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=int(df["type"].nunique()),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert-base-uncased/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Loading Dataset

In [124]:
# Convert the sampled frame into a Hugging Face Dataset. The frame's pandas index
# comes along as an extra "__index_level_0__" column, which is dropped afterwards.
dataset = Dataset.from_pandas(df)

In [125]:
# Drop the pandas index column carried over by from_pandas. Guard on its presence:
# remove_columns raises for a missing column, which would happen if the frame had a
# default index or if this cell is simply run a second time.
if "__index_level_0__" in dataset.column_names:
    dataset = dataset.remove_columns(["__index_level_0__"])

In [126]:
# Cast the `type` column to class labels so each category becomes an integer id.
dataset = dataset.class_encode_column("type")

Casting to class labels: 100%|███| 1000/1000 [00:00<00:00, 206891.14 examples/s]


In [127]:
# Rename both columns in one pass: the URL becomes `text`, its class becomes `label`.
dataset = dataset.rename_columns({"url": "text", "type": "label"})

In [128]:
# Show the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})

## BERT Tokenization

In [129]:
def tokenize(data):
    """Tokenize a batch of URL strings for the model.

    Args:
        data: A batch as passed by ``Dataset.map(..., batched=True)``; its
            ``"text"`` entry holds the URL strings.

    Returns:
        The tokenizer output for the batch, with every sequence padded *and*
        truncated to the tokenizer's maximum length.
    """
    # truncation=True is required alongside padding="max_length": without it a
    # URL longer than the model's limit yields a longer row than the others, so
    # the tensor conversion fails and the sequence would exceed the model's
    # position embeddings.
    return tokenizer(
        data["text"],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

In [130]:
# Apply tokenize to the dataset batch by batch; the tokenizer's outputs are added
# as new columns next to `text` and `label`.
tokenized_dataset = dataset.map(tokenize, batched=True)

Map: 100%|█████████████████████████| 1000/1000 [00:00<00:00, 3632.19 examples/s]


In [131]:
# Show the tokenized dataset summary to confirm the new feature columns.
tokenized_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

## Dataset Split

In [132]:
# Hold out 20% of the rows for evaluation. The split shuffles the data, so fix the
# seed; otherwise train/eval membership (and every reported metric) changes on
# each run even though the sampling step itself is seeded.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

In [133]:
# Show the resulting train/test partitions and their row counts.
split_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [134]:
# Training partition of the split.
train_dataset = split_dataset["train"]

In [135]:
# Held-out partition of the split; passed to the Trainer as its evaluation set.
test_dataset = split_dataset["test"]

In [136]:
# Show both partitions side by side.
train_dataset, test_dataset

(Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 800
 }),
 Dataset({
     features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 200
 }))

## BERT Fine-Tuning

In [137]:
# Fine-tuning configuration: write run artefacts to "results", train for 10 epochs,
# log every optimisation step, and evaluate once at the end of each epoch.
training_args = TrainingArguments(
    output_dir="results",
    evaluation_strategy="epoch",
    logging_steps=1,
    num_train_epochs=10,
)

In [138]:
# metric = evaluate.load(METRIC)

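Cross-Entropy Loss: the evaluation metric is the mean negative log-probability that the softmax of the logits assigns to the true class, $-\frac{1}{N}\sum_{i=1}^{N}\log \operatorname{softmax}(z_i)_{y_i}$.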

In [139]:
def compute_metrics(eval_pred):
    """Compute the mean cross-entropy of the predictions on the evaluation set.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is indexed as
            ``[example, class]`` and ``labels`` holds one integer class id per
            example.

    Returns:
        ``{"cross_entropy": value}`` where ``value`` is the mean negative
        log-softmax probability of the true class, as a Python float. It is
        ``nan`` when there are no examples.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels).astype(np.int64, copy=False)
    if labels.size == 0:
        # The mean over zero examples is undefined; report that explicitly
        # instead of failing with a division by zero.
        return {"cross_entropy": float("nan")}

    logits = np.asarray(logits, dtype=np.float64)
    # Log-sum-exp trick: subtracting each row's maximum before exponentiating
    # keeps np.exp from overflowing to inf (which would turn the loss into nan)
    # and leaves the softmax unchanged.
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    # Select the log-probability of the true class for every example at once.
    true_class_log_probs = log_probs[np.arange(labels.shape[0]), labels]
    return {"cross_entropy": float(-true_class_log_probs.mean())}

In [140]:
# Wire everything together: the model and its training configuration, the train
# split for optimisation, the held-out split for per-epoch evaluation, and the
# custom metric function applied to the evaluation predictions.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [141]:
# Run fine-tuning and keep the returned summary (step count, training loss, runtime metrics).
trainer_output = trainer.train()

Epoch,Training Loss,Validation Loss,Cross Entropy
1,0.5511,0.492621,0.492621
2,0.2043,0.501091,0.501091
3,0.6657,0.524949,0.524949
4,0.0043,0.605397,0.605397
5,0.0025,0.585057,0.585057
6,0.0021,0.66997,0.66997
7,0.001,0.670499,0.670499
8,0.001,0.656261,0.656261
9,0.0012,0.691514,0.691514
10,0.0005,0.687948,0.687948


In [142]:
# Show the training summary returned by trainer.train().
trainer_output

TrainOutput(global_step=1000, training_loss=0.2060786658008583, metrics={'train_runtime': 525.8251, 'train_samples_per_second': 15.214, 'train_steps_per_second': 1.902, 'total_flos': 2104926240768000.0, 'train_loss': 0.2060786658008583, 'epoch': 10.0})