# **Phishing URL Detection**

phishing url detection with machine learning

In [None]:
import os
import pandas as pd
import numpy as np

import keras
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

from models.phishing import simple_bilstm

PHISHING_DATA_DIR_PATH = "./data/phishing_url"
PHISHING_CACHE_DIR_PATH = "./cache/phishing_url"

model_file_path = os.path.join(PHISHING_CACHE_DIR_PATH, "simple_bilstm.h5")

: 

## read data

In [16]:
def read_data():
    blacklist = os.path.join(PHISHING_DATA_DIR_PATH, 'phishing_database.csv')
    whitelist = os.path.join(PHISHING_DATA_DIR_PATH, 'whitelist.txt')

    urls = {}

    blacklist = pd.read_csv(blacklist)

    # Assign 0 for non-malicious and 1 as malicious for supervised learning.
    for url in blacklist['url']:
        urls[url] = 1

    with open(whitelist, 'r') as f:
        lines = f.read().splitlines()
        for url in lines:
            urls[url] = 0

    return urls

urls = read_data()

In [17]:
samples = []
labels = []
for k, v in urls.items():
    samples.append(k)
    labels.append(v)
    #print(k, v)

print("label == 1: ", labels.count(1))
print("label == 0: ", labels.count(0))

max_chars, maxlen, num_chars, embedding_vector_length, sequences = simple_bilstm.build_tokenizer(
        samples)

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

label == 1:  29769
label == 0:  38228
Found 69 unique tokens.
Shape of data tensor: (67997, 128)
Shape of label tensor: (67997,)


In [18]:
# Divide data between training, cross-validation, and test data.
training_samples = int(len(samples) * 0.95)
validation_samples = int(len(labels) * 0.05)
print("training_samples: ", training_samples)
print("validation_samples: ", validation_samples)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

'''
x = data
y = labels
'''
x = data[:training_samples]
y = labels[:training_samples]
x_test = data[training_samples: training_samples + validation_samples]
y_test = labels[training_samples: training_samples + validation_samples]


training_samples:  64597
validation_samples:  3399


# Train model

In [None]:
# Define callbacks for Keras.
callbacks_list = [
    keras.callbacks.ModelCheckpoint(
        filepath=model_file_path,
        monitor='val_loss',
        save_best_only=True
    ),
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=2,
        mode='auto',
        baseline=None,
    )
]

model = simple_bilstm.build_model(
    num_chars, embedding_vector_length, maxlen)

# Train.
model.fit(x, y,
            epochs=10,
            batch_size=32,
            callbacks=callbacks_list,
            validation_split=0.20,
            shuffle=True
            )

# Eval Model

In [21]:
# Evaluate model on test data.
model = load_model(model_file_path)
score, acc = model.evaluate(x_test, y_test, verbose=1, batch_size=1024)

print("Model Accuracy: {:0.2f}%".format(acc * 100))

Model Accuracy: 99.82%


In [23]:
from sklearn.metrics import classification_report
pred = model.predict_classes(x_test)
print(classification_report(y_test, pred, digits=5))



              precision    recall  f1-score   support

           0    0.99790   0.99895   0.99843      1904
           1    0.99866   0.99732   0.99799      1495

    accuracy                        0.99823      3399
   macro avg    0.99828   0.99814   0.99821      3399
weighted avg    0.99824   0.99823   0.99823      3399

