In [1]:
import pandas as pd
import numpy as np
from string import printable
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf  
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, LSTM, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import plot_model
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Load the labelled URL dataset; the cells below read its `url` and `isMalicious` columns.
df = pd.read_csv('dataset/dataset.csv')

# Map every printable character to a 1-based integer token. One dict lookup per
# character replaces the original `x in printable` + `printable.index(x)` pair,
# which scanned the 100-character string twice for each character of each URL.
# The resulting tokens are identical; characters outside `printable` are dropped.
char_to_token = {char: position + 1 for position, char in enumerate(printable)}
url_int_tokens = [[char_to_token[x] for x in url if x in char_to_token] for url in df.url]

# Bring every token list to exactly `max_len` entries (pad_sequences pads or
# truncates as needed) so the URLs form one 2-D integer matrix.
max_len = 75
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)

# Labels, one per row of X.
target = np.array(df.isMalicious)


In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, target_train, target_test = train_test_split(
    X,
    target,
    test_size=0.2,
    random_state=42,
)


In [5]:
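# New tuned model. NOTE: the "old model" cell defines a function with the same
# name (lstm_conv); whichever of the two cells is run last is the one used for training.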


def lstm_conv(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=64, W_reg=tf.keras.regularizers.l2(1e-4)):
    """Build and compile the tuned Conv1D + LSTM binary classifier.

    Args:
        max_len: Length of each integer token sequence fed to the model.
        emb_dim: Size of each embedding vector.
        max_vocab_len: Largest token id the model must accept. Id 0 is accepted
            as well, so the embedding table has ``max_vocab_len + 1`` rows.
        lstm_output_size: Number of units in the LSTM layer.
        W_reg: Regularizer applied to the embedding weights. The default object
            is created once, when the function is defined, and is shared by
            every call that does not pass its own.

    Returns:
        A compiled ``Model`` with a single sigmoid output, using Adam
        (learning rate 1e-4), binary cross-entropy loss and an accuracy metric.
    """
    main_input = Input(shape=(max_len,), dtype=tf.int32, name='main_input')

    # Keras defines input_dim as "largest index + 1". The notebook's tokeniser
    # emits printable.index(x) + 1, i.e. ids 1..100, so a table of exactly
    # max_vocab_len (100) rows left the highest id out of range.
    emb = Embedding(input_dim=max_vocab_len + 1, output_dim=emb_dim, embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.25)(emb)

    # Three stacked stages, each: Conv1D -> ELU -> MaxPooling1D(4) -> Dropout(0.5).
    # Only the kernel size differs between stages.
    conv = emb
    for kernel_size in (5, 6, 7):
        conv = Conv1D(filters=512, kernel_size=kernel_size, padding='same')(conv)
        conv = tf.keras.layers.ELU()(conv)
        conv = MaxPooling1D(pool_size=4)(conv)
        conv = Dropout(0.5)(conv)

    # Summarise the pooled feature sequence with an LSTM.
    lstm = LSTM(lstm_output_size)(conv)
    lstm = Dropout(0.5)(lstm)

    dense = Dense(64, activation='relu')(lstm)
    dense = Dropout(0.5)(dense)

    # Single sigmoid unit: one value in (0, 1) per input sequence.
    output = Dense(1, activation='sigmoid', name='output')(dense)

    model = Model(inputs=[main_input], outputs=[output])
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [4]:
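# Old model. NOTE: this function has the same name (lstm_conv) as the tuned model's,
# so running this cell after the tuned-model cell replaces the tuned definition.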

def lstm_conv(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, W_reg=tf.keras.regularizers.l2(1e-4)):
    """Build and compile the earlier (pre-tuning) Conv1D + LSTM binary classifier.

    Args:
        max_len: Length of each integer token sequence fed to the model. The
            three pooling stages each divide the sequence length by 4, so it
            must be at least 64 for the last stage to produce any output.
        emb_dim: Size of each embedding vector.
        max_vocab_len: Largest token id the model must accept. Id 0 is accepted
            as well, so the embedding table has ``max_vocab_len + 1`` rows.
        lstm_output_size: Number of units in the LSTM layer.
        W_reg: Regularizer applied to the embedding weights. The default object
            is created once, when the function is defined, and is shared by
            every call that does not pass its own.

    Returns:
        A compiled ``Model`` with a single sigmoid output, using Adam
        (learning rate 1e-4), binary cross-entropy loss and an accuracy metric.
    """
    main_input = Input(shape=(max_len,), dtype=tf.int32, name='main_input')

    # Keras defines input_dim as "largest index + 1". The notebook's tokeniser
    # emits printable.index(x) + 1, i.e. ids 1..100, so a table of exactly
    # max_vocab_len (100) rows left the highest id out of range.
    emb = Embedding(input_dim=max_vocab_len + 1, output_dim=emb_dim, embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.25)(emb)

    # Stage 1: kernel size 5.
    conv = Conv1D(filters=256, kernel_size=5, padding='same')(emb)
    conv = tf.keras.layers.ELU()(conv)
    conv = MaxPooling1D(pool_size=4)(conv)
    conv = Dropout(0.5)(conv)

    # Stage 2: kernel size 6. It now consumes stage 1's output; the original
    # passed `emb` here, which threw the whole of stage 1 away.
    conv = Conv1D(filters=256, kernel_size=6, padding='same')(conv)
    conv = tf.keras.layers.ELU()(conv)
    conv = MaxPooling1D(pool_size=4)(conv)
    conv = Dropout(0.5)(conv)

    # Stage 3: kernel size 7. Same fix: it is fed stage 2's output rather than `emb`.
    conv = Conv1D(filters=256, kernel_size=7, padding='same')(conv)
    conv = tf.keras.layers.ELU()(conv)
    conv = MaxPooling1D(pool_size=4)(conv)
    conv = Dropout(0.5)(conv)

    # Summarise the pooled feature sequence with an LSTM.
    lstm = LSTM(lstm_output_size)(conv)
    lstm = Dropout(0.5)(lstm)

    # Single sigmoid unit: one value in (0, 1) per input sequence.
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    model = Model(inputs=[main_input], outputs=[output])
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model


# 20 Epochs =================================================================

In [None]:
# Build a fresh model and train it for 20 passes over the training split,
# 32 samples per gradient step.
epochs, batch_size = 20, 32
model = lstm_conv()
model.fit(
    X_train,
    target_train,
    epochs=epochs,
    batch_size=batch_size,
)


In [None]:
# Score the trained model on the held-out split (progress output silenced)
# and report both metrics it was compiled with.
loss, accuracy = model.evaluate(X_test, target_test, verbose=0)
for metric_label, metric_value in (('Test Loss:', loss), ('Test Accuracy:', accuracy)):
    print(metric_label, metric_value)


In [17]:
# Save the 20-epoch model in the .keras format.
# Saving stays best-effort (a failure does not stop the notebook), but only
# ordinary errors are caught -- a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit -- and the cause is printed instead of hidden.
try:
    model.save('models/model_20.keras')
    print("Model saved")
except Exception as exc:
    print("Saving failed:", exc)

Model saved


# 40 Epochs ==============================================================

In [6]:
# Build a fresh model and train it for 40 passes over the training split,
# 32 samples per gradient step.
epochs, batch_size = 40, 32
model = lstm_conv()
model.fit(
    X_train,
    target_train,
    epochs=epochs,
    batch_size=batch_size,
)


2024-01-29 19:20:29.078832: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-01-29 19:20:29.078856: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-01-29 19:20:29.078862: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-01-29 19:20:29.078903: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-29 19:20:29.078927: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/40


2024-01-29 19:20:30.241801: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m4871/4871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m449s[0m 92ms/step - accuracy: 0.7297 - loss: 0.5011
Epoch 2/40
[1m4871/4871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m433s[0m 89ms/step - accuracy: 0.8882 - loss: 0.2815
Epoch 3/40
[1m4871/4871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m395s[0m 81ms/step - accuracy: 0.9041 - loss: 0.2430
Epoch 4/40
[1m4871/4871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m309s[0m 63ms/step - accuracy: 0.9138 - loss: 0.2188
Epoch 5/40
[1m4871/4871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m309s[0m 64ms/step - accuracy: 0.9184 - loss: 0.2071
Epoch 6/40
[1m4871/4871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m309s[0m 63ms/step - accuracy: 0.9249 - loss: 0.1944
Epoch 7/40
[1m4871/4871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 63ms/step - accuracy: 0.9270 - loss: 0.1868
Epoch 8/40
[1m4871/4871[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m310s[0m 64ms/step - accuracy: 0.9288 - loss: 0.1806
Epoch 9/40


<keras.src.callbacks.history.History at 0x2cd054490>

In [7]:
# Score the 40-epoch model on the held-out split; evaluate() returns the loss
# followed by the compiled accuracy metric.
test_metrics = model.evaluate(X_test, target_test, verbose=0)
loss, accuracy = test_metrics
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.11938583105802536
Test Accuracy: 0.9585443139076233


In [8]:
# Save the 40-epoch model in the .keras format.
# Saving stays best-effort (a failure does not stop the notebook), but only
# ordinary errors are caught -- a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit -- and the cause is printed instead of hidden.
try:
    model.save('models/model_40.keras')
    print("Model saved")
except Exception as exc:
    print("Saving failed:", exc)

Model saved


In [18]:
# Restore the model that was saved after 40 epochs of training.
model_40_path = 'models/model_40.keras'
loaded_model = tf.keras.models.load_model(model_40_path)

In [4]:
# Restore the model that was saved after 40 epochs of training.
loaded_model = tf.keras.models.load_model('models/model_40.keras')

# Score the restored model on the test split. X_test / target_test are created
# by the train/test split cell, which must already have run in this kernel.
evaluation = loaded_model.evaluate(X_test, target_test, verbose=1)
loss, accuracy = evaluation
print('\nFinal Evaluation Accuracy:', accuracy, '\n')

# Raw sigmoid outputs for every test row.
probabilities = loaded_model.predict(X_test)

# Show only the first five rows rather than the whole array.
print('Probabilities of Target Predictions:')
first_five = probabilities[:5]
print(first_five)


NameError: name 'X_test' is not defined

In [3]:
def print_result(proba):
    """Turn a predicted probability into a label string.

    Returns "malicious" when ``proba`` is strictly greater than 0.5 and
    "safe" otherwise. Despite the name, nothing is printed; the label is
    returned to the caller.
    """
    return "malicious" if proba > 0.5 else "safe"

In [2]:
# Classify one URL typed in by the user.
url=input("Enter URL")

# Encode the URL exactly like the training data: 1-based position of each
# character in `printable`, other characters dropped. The results go into their
# own names so the training tokens (`url_int_tokens`) and training matrix (`X`)
# built by the data-preparation cell are not overwritten by this one-row input.
single_url_tokens = [[printable.index(x) + 1 for x in url if x in printable]]

max_len=75
X_single = sequence.pad_sequences(single_url_tokens, maxlen=max_len)
probab=loaded_model.predict(X_single,batch_size=1)

# predict() returns one row per input with one sigmoid value in it; hand the
# scalar, not the 2-D array, to print_result.
print(print_result(probab[0][0]))

NameError: name 'printable' is not defined

In [1]:
import tkinter as tk
from tkinter import messagebox
from tensorflow.keras.preprocessing import sequence
import tensorflow as tf
import string  

# Restore the 20-epoch model; check_url() reads it through the global `loaded_model`.
model_20_path = 'models/model_20.keras'
loaded_model = tf.keras.models.load_model(model_20_path)

def check_url():
    """Classify the URL typed into the entry box and show the verdict in a dialog.

    Reads the module-level ``entry`` widget and ``loaded_model``. The text is
    encoded as 1-based positions in ``string.printable``, padded/truncated to
    75 tokens and passed to the model; a probability above 0.5 is reported as
    "Malicious", anything else as "Safe". Empty input is rejected with a
    warning instead of being classified.
    """
    # Surrounding whitespace is assumed not to be part of the URL (typical for
    # pasted text); without this, blanks would be encoded as tokens.
    url = entry.get().strip()

    # With nothing typed the model would be asked to score an all-padding
    # sequence and the dialog would show a verdict for no URL at all.
    if not url:
        messagebox.showwarning("URL Checker", "Please enter a URL to check.")
        return

    # Define printable here
    printable = string.printable

    # Preprocess the URL: characters outside `printable` are dropped.
    url_int_tokens = [[printable.index(x) + 1 for x in url if x in printable]]
    X_input = sequence.pad_sequences(url_int_tokens, maxlen=75)

    # predict() returns one row per input with one sigmoid value in it.
    probability = loaded_model.predict(X_input)[0][0]

    # Display the result
    result = "Malicious" if probability > 0.5 else "Safe"

    messagebox.showinfo("Result", f"The URL is predicted as: {result}\nProbability: {probability:.2f}")

# Top-level window of the URL checker.
root = tk.Tk()
root.title("URL Checker")

# Widgets, created in display order: prompt, text box, action button.
# The button calls check_url() each time it is clicked.
label = tk.Label(root, text="Enter URL:")
entry = tk.Entry(root, width=40)
button = tk.Button(root, text="Check URL", command=check_url)

# Stack them top to bottom with vertical padding.
label.pack(pady=10)
entry.pack(pady=10)
button.pack(pady=20)

# Hand control to Tk's event loop; the cell keeps running while the window is open.
root.mainloop()


2024-01-30 13:36:11.115569: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-01-30 13:36:11.115597: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-01-30 13:36:11.115602: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-01-30 13:36:11.115638: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-30 13:36:11.115659: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
  trackable.load_own_variables(weights_store.get(inner_path))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step


2024-01-30 13:36:21.038472: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
