In [None]:
# extract the domain name from a URL (Extract the top level domain (TLD) from the URL given).

!pip install tld

<a id="1"></a> <br>
# 📥 Importing Libraries

In [None]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree

from colorama import Fore  #Colorama is a module to color the python outputs

from urllib.parse import urlparse
# This module defines a standard interface to break Uniform Resource Locator (URL) 
# strings up in components (addressing scheme, network location, path etc.), 
# to combine the components back into a URL string, 
# and to convert a “relative URL” to an absolute URL given a “base URL.”

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from tld import get_tld, is_tld

<a id="2"></a> <br>
# 🗃️ Load Dataset

In [None]:
data = pd.read_csv('../input/malicious-urls-dataset/malicious_phish.csv')
data.head(20)

<a id="3"></a> <br>
# 📝 Meta information of Dataframe

In [None]:
data.info()

<a id="4"></a>
# 🔎 Checking for NaN values

In [None]:
data.isnull().sum()

In [None]:
count = data.type.value_counts()
count

In [None]:
x=count.index
x

In [None]:
sns.barplot(x=count.index, y=count)
plt.xlabel('Types')
plt.ylabel('Count');

In [None]:
# Remove every literal "www." from the URLs. The dot is escaped so that it only
# matches a real "."; unescaped it matches any character and would also delete
# substrings such as "wwwa" or "www-".
data['url'] = data['url'].replace(r'www\.', '', regex=True)
data


In [None]:
data.head(10)

In [None]:
# Integer class label for each of the four URL types.
rem = {"Category": {"benign": 0, "defacement": 1, "phishing":2, "malware":3}}

# Map the labels explicitly instead of relying on `DataFrame.replace`, whose
# silent object -> int downcast is deprecated in recent pandas releases.
data['Category'] = data['type'].map(rem['Category'])

# An unmapped label would become NaN and only fail much later inside a model.
assert data['Category'].notna().all(), "unexpected value in the 'type' column"
data['Category'] = data['Category'].astype(int)
data.head(20)

In [None]:
# Length of each URL in characters (values are stringified first).
data['url_len'] = data['url'].astype(str).str.len()

In [None]:
data.head()

In [None]:
def process_tld(url):
    """Return the network location (netloc) of ``url``, or ``None`` on failure.

    ``get_tld`` is called with ``fix_protocol=True`` and
    ``fail_silently=False``, so any URL it cannot resolve raises and is
    reported here as ``None``.

    Parameters
    ----------
    url : str
        The URL to analyse.

    Returns
    -------
    str or None
        ``parsed_url.netloc`` of the result object returned by ``get_tld``,
        or ``None`` if the lookup raised.
    """
    try:
        # Extract the top level domain (TLD) from the URL given
        res = get_tld(url, as_object=True, fail_silently=False, fix_protocol=True)
        pri_domain = res.parsed_url.netloc
    except Exception:
        # Best effort: a URL that cannot be resolved simply has no domain.
        # `Exception` (not a bare `except`) keeps Ctrl-C / SystemExit working.
        pri_domain = None
    return pri_domain



# netloc : Contains the network location - which includes the domain itself (and subdomain if present), 
# the port number, along with an optional credentials in form of username:password . Together it may take
# form of username:password@domain.com:80

In [None]:
# Network location of every URL (None where it could not be determined).
data['domain'] = data['url'].apply(process_tld)

In [None]:
data.head()

In [None]:
# Count how often each special symbol occurs in every URL. Each count is
# stored in a column that is named after the symbol itself.
feature = ['@','?','-','=','.','#','%','+','$','!','*',',','//']
for a in feature:
    data[a] = data['url'].map(lambda url, symbol=a: url.count(symbol))

In [None]:
data.head()

In [None]:
def abnormal_url(url):
    """Return 1 if the hostname parsed from ``url`` occurs verbatim in ``url``, else 0.

    The hostname is compared as plain text. It used to be passed to
    ``re.search`` as a pattern, so regex metacharacters in a hostname
    ("+", "(", ...) gave wrong answers or raised ``re.error``, and a missing
    hostname was searched for as the literal text ``"None"``.

    Parameters
    ----------
    url : str
        The URL to inspect.

    Returns
    -------
    int
        1 when a hostname was parsed and is contained in ``url``; 0 when no
        hostname could be parsed (for example, the URL has no ``//`` part)
        or when it is not contained in ``url``.
    """
    try:
        hostname = urlparse(url).hostname
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. unbalanced "[" in the host).
        return 0
    if hostname is None:
        return 0
    # NOTE: urlparse lower-cases the hostname, so a URL whose host contains
    # upper-case letters still yields 0, exactly as before.
    return 1 if hostname in url else 0
    
    

    

# urlparse():This function parses a URL into six components, returning a 6-tuple. 
# This corresponds to the general structure of a URL. Each tuple item is a string. 
# The components are not broken up in smaller parts 
#(for example, the network location is a single string), and % escapes are not expanded.

In [None]:
# Flag (0/1) computed by abnormal_url for every URL.
data['abnormal_url'] = data['url'].apply(abnormal_url)

In [None]:
data.head(10)

In [None]:
sns.countplot(x='abnormal_url', data=data);

In [None]:
def httpSecure(url):
    """Return 1 when the scheme of ``url`` is ``https``, otherwise 0.

    The scheme is taken from ``urllib.parse.urlparse``; a URL without a
    scheme, or with any other scheme such as ``http`` or ``ftp``, yields 0.
    """
    scheme = str(urlparse(url).scheme)
    return int(scheme == 'https')

In [None]:
# 1 for URLs whose scheme is https, 0 otherwise.
data['https'] = data['url'].apply(httpSecure)

In [None]:
data.head(20)

In [None]:
sns.countplot(x='https', data=data);

<h4>Counts the number of digit characters in a URL</h4>

In [None]:
def digit_count(url):
    """Return how many characters of ``url`` satisfy ``str.isnumeric()``."""
    return sum(1 for char in url if char.isnumeric())

In [None]:
# Number of numeric characters in every URL.
data['digits'] = data['url'].apply(digit_count)

<h4>Counts the number of letter characters in a URL</h4>

In [None]:
data.head()

In [None]:
def letter_count(url):
    """Return how many characters of ``url`` satisfy ``str.isalpha()``."""
    return sum(map(str.isalpha, url))

# str.isalpha() returns True when every character of the string is a letter. This covers all
# Unicode letters (e.g. "é"), not only a-z. Digits, spaces and punctuation such as "!" are not letters.

In [None]:
# Number of alphabetic characters in every URL.
data['letters'] = data['url'].apply(letter_count)

<h4>Checks to see whether URL contains a shortening service</h4>

In [None]:
# Known URL-shortening services (duplicates of the original list removed).
_SHORTENER_DOMAINS = (
    'bit.ly', 'goo.gl', 'shorte.st', 'go2l.ink', 'x.co', 'ow.ly', 't.co',
    'tinyurl', 'tr.im', 'is.gd', 'cli.gs', 'yfrog.com', 'migre.me', 'ff.im',
    'tiny.cc', 'url4.eu', 'twit.ac', 'su.pr', 'twurl.nl', 'snipurl.com',
    'short.to', 'BudURL.com', 'ping.fm', 'post.ly', 'Just.as', 'bkite.com',
    'snipr.com', 'fic.kr', 'loopt.us', 'doiop.com', 'short.ie', 'kl.am',
    'wp.me', 'rubyurl.com', 'om.ly', 'to.ly', 'bit.do', 'lnkd.in', 'db.tt',
    'qr.ae', 'adf.ly', 'bitly.com', 'cur.lv', 'ity.im', 'q.gs', 'po.st',
    'bc.vc', 'twitthis.com', 'u.to', 'j.mp', 'buzurl.com', 'cutt.us', 'u.bb',
    'yourls.org', 'prettylinkpro.com', 'scrnch.me', 'filoops.info',
    'vzturl.com', 'qr.net', '1url.com', 'tweez.me', 'v.gd', 'link.zip.net',
)

# A service name only counts when it is not glued to other host-name
# characters. Plain substring matching flagged e.g. "microsoft.com" (contains
# "t.co") or "box.com" (contains "x.co"). Host names are case-insensitive.
_SHORTENER_PATTERN = re.compile(
    r'(?<![A-Za-z0-9-])(?:'
    + '|'.join(re.escape(domain) for domain in _SHORTENER_DOMAINS)
    + r')(?![A-Za-z0-9-])',
    re.IGNORECASE,
)


def Shortining_Service(url):
    """Return 1 if ``url`` mentions a known URL-shortening service, else 0.

    Parameters
    ----------
    url : str
        The URL to inspect.

    Returns
    -------
    int
        1 when one of ``_SHORTENER_DOMAINS`` occurs in ``url`` as a
        stand-alone host-name fragment (not preceded or followed by a letter,
        digit or "-"), otherwise 0.
    """
    return 1 if _SHORTENER_PATTERN.search(url) else 0
    

# A URL shortening service is a third-party website that converts that long URL to a short, 
# case-sensitive alphanumeric code. Simply put, this means that a URL shortening service takes 
# ridiculously long URLs (web addresses) and makes them short.

In [None]:
# 1 for URLs that reference a known shortening service, 0 otherwise.
data['Shortining_Service'] = data['url'].apply(Shortining_Service)

In [None]:
data.head(20)

In [None]:
sns.countplot(x='Shortining_Service', data=data);

In [None]:
# Alternatives are tried left to right. The hexadecimal alternative used to
# lack its trailing "|" and was therefore fused with the IPv6 alternative, so
# neither a hexadecimal IPv4 address nor an IPv6 address could match alone.
_IP_ADDRESS_PATTERN = re.compile(
    # Dotted-decimal IPv4 followed by "/"
    r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
    r'([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])/)|'
    # IPv4 written as four hexadecimal octets followed by "/"
    r'((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})/)|'
    # IPv6: eight colon-separated hexadecimal groups
    r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}|'
    # IPv4 followed by a port number
    r'([0-9]+(?:\.[0-9]+){3}:[0-9]+)|'
    # IPv4, optionally followed by "/<one or two digits>"
    r'((?:(?:\d|[01]?\d\d|2[0-4]\d|25[0-5])\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d|\d)(?:/\d{1,2})?)'
)


def having_ip_address(url):
    """Return 1 if ``url`` contains something that looks like an IP address, else 0.

    Recognised forms are dotted-decimal IPv4 (optionally with a port or a
    trailing ``/NN``), IPv4 written as hexadecimal octets, and full
    eight-group IPv6.

    Parameters
    ----------
    url : str
        The URL to inspect.

    Returns
    -------
    int
        1 when any of the patterns matches anywhere in ``url``, otherwise 0.
    """
    return 1 if _IP_ADDRESS_PATTERN.search(url) else 0

In [None]:
# 1 for URLs that contain an IP address, 0 otherwise.
data['having_ip_address'] = data['url'].apply(having_ip_address)

In [None]:
data.head()

In [None]:
data['having_ip_address'].value_counts()

In [None]:
plt.figure(figsize=(15, 15))
# Correlate the numeric columns only. `data` still holds text columns
# ('url', 'type', 'domain'), on which `DataFrame.corr()` raises in
# pandas >= 2.0 (older versions silently dropped them).
sns.heatmap(data.select_dtypes(include='number').corr(), linewidths=.5)

In [None]:
X = data.drop(['url','type','Category','domain'],axis=1)#,'type_code'
y = data['Category']

In [None]:
X

In [None]:
y

<a id="6"></a> <br>
# ✂️ Train & Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [None]:
X_train

In [None]:
X_test

In [None]:
y_train

In [None]:
y_test

<a id="7"></a> <br>
# <img src="https://cdn-icons-png.flaticon.com/32/4149/4149680.png"/> Training models

In [None]:
# `plot_confusion_matrix` / `plot_roc_curve` were removed in scikit-learn 1.2.
# Fall back to their replacements so that this cell does not abort "Run All"
# on current releases.
try:
    from sklearn.metrics import plot_confusion_matrix
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Classifier classes to benchmark; each one is instantiated with its defaults.
models = [
    AdaBoostClassifier,
    KNeighborsClassifier,
    SGDClassifier,
    GaussianNB,
    MLPClassifier,
    RidgeClassifier,
    XGBClassifier,
]

# Test-set accuracy of every classifier, in the same order as `models`.
accuracy_test = []

for model_cls in models:
    print('#############################################')
    print(f'######-Model =>\033[07m {model_cls.__name__} \033[0m')

    # Fit a fresh instance on the training split and score it on the test split.
    model_ = model_cls()
    model_.fit(X_train, y_train)
    pred = model_.predict(X_test)
    acc = accuracy_score(y_test, pred)
    accuracy_test.append(acc)

    print(f'Test Accuracy :\033[32m \033[01m {acc * 100:.2f}% \033[30m \033[0m')
    print('\033[01m              Classification_report \033[0m')
    print(classification_report(y_test, pred))

    # Confusion matrix shown as a share of all test samples.
    print('\033[01m             Confusion_matrix \033[0m')
    cf_matrix = confusion_matrix(y_test, pred)
    sns.heatmap(cf_matrix / cf_matrix.sum(), annot=True, fmt='0.2%')
    plt.show()
    print('\033[31m###################- End -###################\033[0m')


In [None]:
output = pd.DataFrame({
    "Model": [
        'AdaBoost Classifier',
        'KNeighbors Classifier',
        'SGD Classifier',
        'Gaussian NB',
        'MLP Classifier',
        'Ridge Classifier',
        'XGBoost Classifier'
    ],
    "Accuracy": accuracy_test
})


In [None]:
X_test

In [None]:
plt.figure(figsize=(10, 5))
plots = sns.barplot(x='Model', y='Accuracy', data=output)
for bar in plots.patches:
    plots.annotate(format(bar.get_height(), '.2f'),
                   (bar.get_x() + bar.get_width() / 2,
                    bar.get_height()), ha='center', va='center',
                   size=15, xytext=(0, 8),
                   textcoords='offset points')

plt.xlabel("Models", size=14)
plt.xticks(rotation=20);
plt.ylabel("Accuracy", size=14)
plt.show()

In [None]:
def URL_Converter(urls):
    """Build the model feature table for a collection of raw URL strings.

    The same feature extractors that were applied to the training data are
    applied here, in the same order. The resulting column index is printed,
    and the helper columns ``url`` and ``domain`` are dropped before
    returning.

    Parameters
    ----------
    urls : sequence of str
        Raw URLs to convert.

    Returns
    -------
    pandas.DataFrame
        One row per URL with the engineered feature columns.
    """
    frame = pd.DataFrame()
    frame['url'] = pd.Series(urls)

    frame['url_len'] = frame['url'].apply(lambda raw: len(str(raw)))
    frame['domain'] = frame['url'].apply(process_tld)

    # One count column per special symbol, named after the symbol.
    for symbol in ('@','?','-','=','.','#','%','+','$','!','*',',','//'):
        frame[symbol] = frame['url'].apply(lambda raw, s=symbol: raw.count(s))

    # Remaining single-function features, in training-column order.
    extractors = (
        ('abnormal_url', abnormal_url),
        ('https', httpSecure),
        ('digits', digit_count),
        ('letters', letter_count),
        ('Shortining_Service', Shortining_Service),
        ('having_ip_address', having_ip_address),
    )
    for column, extractor in extractors:
        frame[column] = frame['url'].apply(extractor)

    print(frame.columns)
    return frame.drop(['url','domain'], axis=1)
    

In [None]:
urls= ['diaryofagameaddict.com',
'espdesign.com.au',
'iamagameaddict.com',
'kalantzis.net',
'slightlyoffcenter.net',
'toddscarwash.com',
'tubemoviez.com',
'ipl.hk',
'crackspider.us/toolbar/install.php?pack=exe',
'pos-kupang.com/',
'rupor.info',
'svision-online.de/mgfi/administrator/components/com_babackup/classes/fx29id1.txt',
'officeon.ch.ma/office.js?google_ad_format=728x90_as',
'sn-gzzx.com',
'sunlux.net/company/about.html',
'outporn.com',
'timothycopus.aimoo.com',
'xindalawyer.com',
'freeserials.spb.ru/key/68703.htm',
'deletespyware-adware.com',
'orbowlada.strefa.pl/text396.htm',
'ruiyangcn.com',
'zkic.com',
'adserving.favorit-network.com/eas?camp=19320;cre=mu&grpid=1738&tag_id=618&nums=FGApbjFAAA',
'cracks.vg/d1.php',
'juicypussyclips.com',
'nuptialimages.com',
'andysgame.com',
'bezproudoff.cz',
'ceskarepublika.net',
'hotspot.cz',
'gmcjjh.org/DHL',
'nerez-schodiste-zabradli.com',
'nordiccountry.cz',
'nowina.info',
'obada-konstruktiwa.org',
'otylkaaotesanek.cz',
'pb-webdesign.net',
'pension-helene.cz',
'podzemi.myotis.info',
'smrcek.com',
'spekband.com',
'm2132.ehgaugysd.net/zyso.cgi?18',
'webcom-software.ws/links/?153646e8b0a88',
'worldgymperu.com',
'zgsysz.com',
'oknarai.ru',
'realinnovation.com/css/menu.js']

In [None]:
test_data= URL_Converter(urls)

In [None]:
models

In [None]:
# Refit every classifier on the training split and print the classes it
# predicts for the hand-picked URLs in `test_data`.
for model_cls in models:
    print('#############################################')
    print(f'######-Model =>\033[07m {model_cls} \033[0m')
    model_ = model_cls()
    model_.fit(X_train, y_train)
    pred = model_.predict(test_data)
    print(pred)

**8 - Final Report**

In [None]:
output = pd.DataFrame({
    "Model": [
        'AdaBoost Classifier',
        'KNeighbors Classifier',
        'SGD Classifier',
        'Gaussian NB',
        'MLP Classifier',
        'Ridge Classifier',
        'XGBoost Classifier'
    ],
    "Accuracy": accuracy_test
})

print(output)


In [None]:
import joblib
import os

# Create directory to save models
os.makedirs("saved_models", exist_ok=True)

# Save each model. The instances fitted in the benchmark loop were not kept,
# so every classifier is refitted on the training split before it is pickled.
for m in models:
    model_name = m.__name__
    model_instance = m()
    model_instance.fit(X_train, y_train)
    joblib.dump(model_instance, f"saved_models/{model_name}.pkl")
    print(f"‚úÖ Saved: saved_models/{model_name}.pkl")
    print(f"‚úÖ Saved: saved_models/{model_name}.pkl")


In [None]:
import os
import shutil

# Zip the folder the models were actually written to. It was created relative
# to the working directory, so resolve it the same way instead of hard-coding
# '/kaggle/working' (on Kaggle both give the same path).
folder_to_zip = os.path.abspath('saved_models')
zip_filename = folder_to_zip + '.zip'

# make_archive appends ".zip" itself, so it receives the name without extension.
shutil.make_archive(base_name=folder_to_zip, format='zip', root_dir=folder_to_zip)

print(f"‚úÖ Folder '{folder_to_zip}' zipped as '{zip_filename}'")


In [None]:
# `plot_confusion_matrix` / `plot_roc_curve` were removed in scikit-learn 1.2.
# Fall back to their replacements so that this cell does not abort "Run All"
# on current releases.
try:
    from sklearn.metrics import plot_confusion_matrix
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import StackingClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# === Clean column names to remove special characters ===
# Several feature columns are named after the symbol they count ('@', '?',
# '//', ...). Replacing every symbol with the same "_" collapses twelve of
# them into one duplicate name, so each symbol is encoded by its code point
# instead ('@' -> '_x40'). The names are always derived from the untouched `X`,
# which keeps this cell idempotent when it is re-run.
clean_columns = [
    re.sub(r'[^\w\s]', lambda match: f'_x{ord(match.group()):02x}', str(col))
    for col in X.columns
]
X_train.columns = clean_columns
X_test.columns = clean_columns

# === DL setup for raw URL sequences ===
MAX_LEN = 100
VOCAB_SIZE = 1000
NUM_CLASSES = int(y_train.max()) + 1  # class labels are the integers 0..k-1

# The character-level network needs the raw URL text, not the engineered
# feature table (iterating a DataFrame only yields its column names). The row
# labels of X_train / X_test select the matching URLs from `data`.
url_train = data.loc[X_train.index, 'url'].astype(str)
url_test = data.loc[X_test.index, 'url'].astype(str)

tokenizer = Tokenizer(char_level=True, lower=True, num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(url_train)  # vocabulary is learned from training URLs only
X_seq_train = pad_sequences(tokenizer.texts_to_sequences(url_train), maxlen=MAX_LEN)
X_seq_test = pad_sequences(tokenizer.texts_to_sequences(url_test), maxlen=MAX_LEN)

# === DL Model ===
def build_dl_model(num_classes=None):
    """Build and compile a character-level 1-D CNN URL classifier.

    Parameters
    ----------
    num_classes : int, optional
        Number of output classes; defaults to the module-level ``NUM_CLASSES``.

    Returns
    -------
    Sequential
        Compiled model with one softmax probability per class. It is trained
        on integer labels (``sparse_categorical_crossentropy``).
    """
    if num_classes is None:
        num_classes = NUM_CLASSES
    model = Sequential([
        Embedding(VOCAB_SIZE, 32, input_length=MAX_LEN),
        Conv1D(64, kernel_size=3, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        # One output per class: the target has more than two classes, so a
        # single sigmoid unit with binary cross-entropy cannot represent it.
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# === ML Model Evaluation ===
accuracy_test = []
model_names = []

ml_models = [
    ('Random Forest Classifier', RandomForestClassifier()),
    ('Gradient Boosting Classifier', GradientBoostingClassifier()),
    ('Extra Trees Classifier', ExtraTreesClassifier()),
    ('Logistic Regression', LogisticRegression(solver='liblinear')),
    ('LGBM Classifier', LGBMClassifier()),
    ('CatBoost Classifier', CatBoostClassifier(verbose=0))
]

for name, model in ml_models:
    print('#############################################')
    print(f'######-Model => \033[07m {name} \033[0m')
    try:
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        acc = accuracy_score(y_test, pred)
        accuracy_test.append(acc)
        model_names.append(name)
        print(f'Test Accuracy :\033[32m \033[01m {acc * 100:.2f}% \033[0m')
        print('\033[01mClassification Report:\033[0m')
        print(classification_report(y_test, pred))
        print('\033[01mConfusion Matrix:\033[0m')
        cf_matrix = confusion_matrix(y_test, pred)
        sns.heatmap(cf_matrix / np.sum(cf_matrix), annot=True, fmt='0.2%')
        plt.show()
    except Exception as e:
        # Deliberate best effort: one failing model must not stop the comparison.
        print(f"Error with model {name}: {e}")
    print('\033[31m###################- End -###################\033[0m')

# === DL Model Training ===
print('\n\033[34m############### Training Deep Learning Model ###############\033[0m')
dl_model = build_dl_model()
dl_model.fit(X_seq_train, y_train.to_numpy(), epochs=5, batch_size=32, validation_split=0.1, verbose=1)

dl_pred_probs = dl_model.predict(X_seq_test)  # one probability per class and test URL
dl_preds = dl_pred_probs.argmax(axis=1)       # most likely class per test URL

acc_dl = accuracy_score(y_test, dl_preds)
print('\033[32mDL Test Accuracy: {:.2f}%\033[0m'.format(acc_dl * 100))
print(classification_report(y_test, dl_preds))
dl_cf_matrix = confusion_matrix(y_test, dl_preds)
sns.heatmap(dl_cf_matrix / np.sum(dl_cf_matrix), annot=True, fmt='0.2%')
plt.show()

# Add DL model to output
model_names.append("Deep Learning Model (CNN)")
accuracy_test.append(acc_dl)

# === Hybrid Model (Stack ML + DL) ===
print('\n\033[36m############### Hybrid Model (Stacked) ###############\033[0m')

base_learners = [
    ('lgbm', LGBMClassifier()),
    ('rf', RandomForestClassifier())
]
meta_model = LogisticRegression()

stack_model = StackingClassifier(estimators=base_learners, final_estimator=meta_model)
stack_model.fit(X_train, y_train)
stack_pred = stack_model.predict(X_test)

# Soft voting: average the class probabilities of the stacked model and the
# CNN, then pick the most likely class. Adding predicted labels and comparing
# the sum with 1 is only meaningful for binary 0/1 labels.
# Assumes `stack_model.classes_` is 0..NUM_CLASSES-1, i.e. every class occurs
# in the training split, so both probability matrices share column order.
stack_probs = stack_model.predict_proba(X_test)
combined_probs = (stack_probs + dl_pred_probs) / 2
combined_preds = stack_model.classes_[combined_probs.argmax(axis=1)]

acc_combined = accuracy_score(y_test, combined_preds)
print('\033[35mHybrid Accuracy (DL + ML): {:.2f}%\033[0m'.format(acc_combined * 100))
print(classification_report(y_test, combined_preds))
combined_cf_matrix = confusion_matrix(y_test, combined_preds)
sns.heatmap(combined_cf_matrix / np.sum(combined_cf_matrix), annot=True, fmt='0.2%')
plt.show()

# Add Hybrid model to output
model_names.append("Hybrid Model (Stacked + DL)")
accuracy_test.append(acc_combined)

# === Final Output Table ===
output = pd.DataFrame({"Model": model_names, "Accuracy": accuracy_test})
print("\n\033[01mFinal Accuracy Summary:\033[0m")
print(output.sort_values(by="Accuracy", ascending=False))


In [None]:
# === Imports ===
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# === Feature names ===
# LightGBM rejects some special characters in feature names and rejects
# duplicate names, while several columns are named after the symbol they count
# ('@', ',', '//', ...). Encode each symbol by its code point ('@' -> '_x40').
# The names are derived from the untouched `X`, so this is idempotent and does
# not depend on any earlier renaming of X_train / X_test.
clean_columns = [
    re.sub(r'[^\w\s]', lambda match: f'_x{ord(match.group()):02x}', str(col))
    for col in X.columns
]
X_train.columns = clean_columns
X_test.columns = clean_columns

# === Deep Learning Tokenization ===
MAX_LEN = 100
VOCAB_SIZE = 1000
NUM_CLASSES = int(y_train.max()) + 1  # class labels are the integers 0..k-1

# Raw URL strings of the training / test rows. The engineered feature table
# cannot be tokenized: iterating a DataFrame only yields its column names.
url_train = data.loc[X_train.index, 'url'].astype(str)
url_test = data.loc[X_test.index, 'url'].astype(str)

tokenizer = Tokenizer(char_level=True, lower=True, num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(url_train)

X_seq_train = pad_sequences(tokenizer.texts_to_sequences(url_train), maxlen=MAX_LEN)
X_seq_test = pad_sequences(tokenizer.texts_to_sequences(url_test), maxlen=MAX_LEN)

# === Deep Learning Model ===
def build_dl_model(num_classes=None):
    """Build and compile a character-level 1-D CNN URL classifier.

    Parameters
    ----------
    num_classes : int, optional
        Number of output classes; defaults to the module-level ``NUM_CLASSES``.

    Returns
    -------
    Sequential
        Compiled model with one softmax probability per class. It is trained
        on integer labels (``sparse_categorical_crossentropy``).
    """
    if num_classes is None:
        num_classes = NUM_CLASSES
    model = Sequential([
        Embedding(VOCAB_SIZE, 32, input_length=MAX_LEN),
        Conv1D(64, kernel_size=3, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        # One output per class: a single sigmoid unit cannot represent a
        # target with more than two classes.
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# === ML Model List ===
models = [
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    LogisticRegression,
    LGBMClassifier,
    CatBoostClassifier
]

# === Train and Evaluate ML Models ===
accuracy_test = []

for m in models:
    print('#############################################')
    print(f'######-Model =>\033[07m {m.__name__} \033[0m')
    try:
        model_ = m()
        model_.fit(X_train, y_train)
        pred = model_.predict(X_test)
        acc = accuracy_score(y_test, pred)
        accuracy_test.append(acc)
        print(f'Test Accuracy :\033[32m \033[01m {acc*100:.2f}% \033[0m')
        print('\033[01mClassification Report:\033[0m')
        print(classification_report(y_test, pred))
        print('\033[01mConfusion Matrix:\033[0m')
        cf_matrix = confusion_matrix(y_test, pred)
        sns.heatmap(cf_matrix / np.sum(cf_matrix), annot=True, fmt='0.2%')
        plt.show()
    except Exception as e:
        # Deliberate best effort: one failing model must not stop the comparison.
        print(f"Error with model {m.__name__}: {e}")
    print('\033[31m###################- End -###################\033[0m')

# === Train and Evaluate DL Model ===
print('\n\033[34m############### Training Deep Learning Model ###############\033[0m')
dl_model = build_dl_model()
dl_model.fit(X_seq_train, y_train.to_numpy(), epochs=5, batch_size=32, validation_split=0.1, verbose=1)

dl_pred_probs = dl_model.predict(X_seq_test)  # one probability per class and test URL
dl_preds = dl_pred_probs.argmax(axis=1)       # most likely class per test URL

acc_dl = accuracy_score(y_test, dl_preds)
print('\033[32mDL Test Accuracy: {:.2f}%\033[0m'.format(acc_dl * 100))
print(classification_report(y_test, dl_preds))
dl_cf_matrix = confusion_matrix(y_test, dl_preds)
sns.heatmap(dl_cf_matrix / np.sum(dl_cf_matrix), annot=True, fmt='0.2%')
plt.show()

# === Hybrid Stacked Model ===
print('\n\033[36m############### Hybrid Model (Stacked ML + DL) ###############\033[0m')

# Base ML models
base_learners = [
    ('lgbm', LGBMClassifier()),
    ('rf', RandomForestClassifier())
]
meta_model = LogisticRegression()

stack_model = StackingClassifier(estimators=base_learners, final_estimator=meta_model)
stack_model.fit(X_train, y_train)
stack_pred = stack_model.predict(X_test)

# Combine DL + Stacking by soft voting: average both models' class
# probabilities and pick the most likely class. Adding predicted labels and
# comparing the sum with 1 is only meaningful for binary 0/1 labels.
# Assumes `stack_model.classes_` is 0..NUM_CLASSES-1, i.e. every class occurs
# in the training split, so both probability matrices share column order.
stack_probs = stack_model.predict_proba(X_test)
combined_probs = (stack_probs + dl_pred_probs) / 2
combined_preds = stack_model.classes_[combined_probs.argmax(axis=1)]

acc_combined = accuracy_score(y_test, combined_preds)
print('\033[35mHybrid Accuracy (DL + ML): {:.2f}%\033[0m'.format(acc_combined * 100))
print(classification_report(y_test, combined_preds))
combined_cf_matrix = confusion_matrix(y_test, combined_preds)
sns.heatmap(combined_cf_matrix / np.sum(combined_cf_matrix), annot=True, fmt='0.2%')
plt.show()


Test

In [None]:
!pip install lightgbm catboost xgboost tensorflow keras

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, Embedding, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from xgboost import XGBClassifier

# Re-import DL tools if needed
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Scale X for DL models
# Fit the scaler on the training rows only, so that min/max statistics of the
# test rows do not leak into training, then scale every row with it. Plain
# arrays are used so the result does not depend on the current column names of
# X_train. Test-row values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(X.loc[X_train.index].to_numpy())
X_scaled = scaler.transform(X.to_numpy())


In [None]:
# Convert to tensors
X_dl = np.array(X_scaled)
y_dl = to_categorical(y)

In [None]:
# Splits again for DL usage
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_dl, y_dl, test_size=0.2, random_state=2)


In [None]:
# Machine Learning models
ml_models = {
    'LogisticRegression': LogisticRegression(solver='liblinear', max_iter=1000),
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    'CatBoost': CatBoostClassifier(verbose=0)
}


In [None]:
accuracy_test = []

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Convergence warnings are silenced for the whole evaluation below.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Fit every configured estimator on the training split, then report its
# test-set accuracy, classification report and normalised confusion matrix.
for name, model in ml_models.items():
    print('#############################################')
    print(f'######-Model =>\033[07m {name} \033[0m')

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    accuracy_test.append(acc)

    print(f'Test Accuracy :\033[32m \033[01m {acc * 100:.2f}% \033[30m \033[0m')
    print('\033[01m              Classification_report \033[0m')
    print(classification_report(y_test, pred))

    print('\033[01m             Confusion_matrix \033[0m')
    cf_matrix = confusion_matrix(y_test, pred)
    sns.heatmap(cf_matrix / cf_matrix.sum(), annot=True, fmt='0.2%')
    plt.show()
    print('\033[31m###################- End -###################\033[0m')


In [None]:
# Feed Forward Neural Network (MLP)
def build_mlp(input_dim, num_classes):
    """Build and compile a small feed-forward classifier.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    num_classes : int
        Number of softmax outputs.

    Returns
    -------
    Sequential
        Dense(128) -> Dropout(0.3) -> Dense(64) -> softmax, compiled with
        Adam and categorical cross-entropy (expects one-hot targets).
    """
    network = Sequential([
        Dense(128, activation='relu', input_dim=input_dim),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return network

print("Training MLP Neural Network...")
mlp_model = build_mlp(X_train_dl.shape[1], y_dl.shape[1])
mlp_model.fit(X_train_dl, y_train_dl, epochs=10, batch_size=32, verbose=1, validation_data=(X_test_dl, y_test_dl))
loss, acc = mlp_model.evaluate(X_test_dl, y_test_dl, verbose=0)
print(f"MLP Accuracy: \033[32m{acc*100:.2f}%\033[0m")


In [None]:
# CNN model
X_cnn = X_dl.reshape(X_dl.shape[0], X_dl.shape[1], 1)
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(X_cnn, y_dl, test_size=0.2, random_state=2)

def build_cnn(input_shape, num_classes):
    """Build and compile a small 1-D convolutional classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, passed to the first ``Conv1D`` layer.
    num_classes : int
        Number of softmax outputs.

    Returns
    -------
    Sequential
        Conv1D(64, 3) -> MaxPooling1D(2) -> Flatten -> Dense(50) -> softmax,
        compiled with Adam and categorical cross-entropy (expects one-hot
        targets).
    """
    network = Sequential([
        Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(50, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return network

print("Training CNN model...")
cnn_model = build_cnn((X_cnn.shape[1], 1), y_dl.shape[1])
cnn_model.fit(X_train_cnn, y_train_cnn, epochs=10, batch_size=32, validation_data=(X_test_cnn, y_test_cnn), verbose=1)
loss, acc = cnn_model.evaluate(X_test_cnn, y_test_cnn, verbose=0)
print(f"CNN Accuracy: \033[32m{acc*100:.2f}%\033[0m")

In [None]:
# ------------------------ HYBRID (DL Features + ML Classifier) ------------------------ #
# Example: Use last hidden layer of MLP as input to XGBoost
# `Model` comes from the same `tensorflow.keras` namespace that built
# `mlp_model`; mixing it with the standalone `keras` package can fail when the
# two installed versions differ.
from tensorflow.keras.models import Model

# Sub-model that stops at the last hidden layer, i.e. the layer before the
# softmax output; its activations are the learned features.
extract_model = Model(inputs=mlp_model.input, outputs=mlp_model.layers[-2].output)
X_train_feats = extract_model.predict(X_train_dl)
X_test_feats = extract_model.predict(X_test_dl)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
# XGBoost expects integer class labels, so undo the one-hot encoding.
y_train_mapped = np.argmax(y_train_dl, axis=1)
y_test_mapped = np.argmax(y_test_dl, axis=1)
xgb.fit(X_train_feats, y_train_mapped)
pred = xgb.predict(X_test_feats)

acc = accuracy_score(y_test_mapped, pred)
print(f"Hybrid DL+ML (MLP + XGBoost) Accuracy: \033[35m{acc*100:.2f}%\033[0m")

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Use the correct DataFrame and column
text_column = 'url'  # character-tokenize the 'url' column

# Character-level tokenization
char_tokenizer = Tokenizer(char_level=True)
char_tokenizer.fit_on_texts(data[text_column].astype(str))

# Convert to sequences and pad
sequences = char_tokenizer.texts_to_sequences(data[text_column].astype(str))
MAX_LEN = 300  # You can adjust based on URL length distribution
X_char = pad_sequences(sequences, maxlen=MAX_LEN, padding='post')

# Target variable
y_char = to_categorical(data['Category'])  # assuming 'Category' is the label

# Train-test split
X_train_char, X_test_char, y_train_char, y_test_char = train_test_split(
    X_char, y_char, test_size=0.2, random_state=2
)

# Vocabulary size
vocab_size = len(char_tokenizer.word_index) + 1

In [None]:
# LSTM Model
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=MAX_LEN))
lstm_model.add(LSTM(128, return_sequences=False))
lstm_model.add(Dropout(0.3))
lstm_model.add(Dense(64, activation='relu'))
lstm_model.add(Dense(y_char.shape[1], activation='softmax'))

lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print("Training LSTM model...")
lstm_model.fit(X_train_char, y_train_char, epochs=10, batch_size=32, validation_data=(X_test_char, y_test_char), verbose=1)

# Evaluate
loss, acc = lstm_model.evaluate(X_test_char, y_test_char, verbose=0)
print(f"\033[36mLSTM Accuracy (Char-level): {acc*100:.2f}%\033[0m")

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Predict class labels from the model
y_pred_probs = lstm_model.predict(X_test_char)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test_char, axis=1)

# Confusion Matrix
cf_matrix = confusion_matrix(y_true, y_pred)

# Plot
plt.figure(figsize=(6, 4))
sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix - LSTM (Char-level)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Optional: Classification report
print("\033[01mClassification Report:\033[0m")
print(classification_report(y_true, y_pred))


In [None]:
# Save the trained model
lstm_model.save('cnn_lstm_model.h5')  # You can change the name as needed
print("Model saved to 'cnn_lstm_model.h5'")


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Predict class labels from the LSTM model
y_pred_probs = lstm_model.predict(X_test_char)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test_char, axis=1)

# Confusion Matrix
print('\033[01m             Confusion_matrix \033[0m')
cf_matrix = confusion_matrix(y_true, y_pred)

# Normalize and plot the confusion matrix
sns.heatmap(cf_matrix / np.sum(cf_matrix), annot=True, fmt='0.2%', cmap='Blues')

# Show the plot
plt.show()

# Optional: Classification report
print("\033[01mClassification Report:\033[0m")
print(classification_report(y_true, y_pred))

# End of Confusion Matrix
print('\033[31m###################- End -###################\033[0m')


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Measure the LSTM accuracy here instead of reading the shared `acc` variable.
#    `acc` is rebound by the CNN + LSTM and Transformer evaluation cells, so after
#    any out-of-order run it would hold another model's score under the LSTM label.
_lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_char, y_test_char, verbose=0)

# 2. Create DataFrame for LSTM only
output = pd.DataFrame({
    "Model": ['LSTM (Char-level)'],
    "Accuracy": [lstm_accuracy]
})

# 3. Plot accuracy as a single bar and write the value just above the bar top
plt.figure(figsize=(6, 4))
plot = sns.barplot(x='Model', y='Accuracy', data=output, palette='Blues_d')
for bar in plot.patches:
    plot.annotate(format(bar.get_height(), '.2f'),
                  (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                  ha='center', va='center', size=14, xytext=(0, 8),
                  textcoords='offset points')

plt.xlabel("Model", size=12)
plt.ylabel("Accuracy", size=12)
plt.title("LSTM Model Accuracy", size=14)
plt.ylim(0, 1)  # accuracy is plotted as a fraction, not a percentage
plt.show()


In [None]:
# Sample URLs (bare domains and full paths) fed to the trained models
# in the prediction cells below.
urls = [
    'diaryofagameaddict.com',
    'espdesign.com.au',
    'iamagameaddict.com',
    'kalantzis.net',
    'slightlyoffcenter.net',
    'toddscarwash.com',
    'tubemoviez.com',
    'ipl.hk',
    'crackspider.us/toolbar/install.php?pack=exe',
    'pos-kupang.com/',
    'rupor.info',
    'svision-online.de/mgfi/administrator/components/com_babackup/classes/fx29id1.txt',
    'officeon.ch.ma/office.js?google_ad_format=728x90_as',
    'sn-gzzx.com',
    'sunlux.net/company/about.html',
    'outporn.com',
    'timothycopus.aimoo.com',
    'xindalawyer.com',
    'freeserials.spb.ru/key/68703.htm',
    'deletespyware-adware.com',
    'orbowlada.strefa.pl/text396.htm',
    'ruiyangcn.com',
    'zkic.com',
    'adserving.favorit-network.com/eas?camp=19320;cre=mu&grpid=1738&tag_id=618&nums=FGApbjFAAA',
    'cracks.vg/d1.php',
    'juicypussyclips.com',
    'nuptialimages.com',
    'andysgame.com',
    'bezproudoff.cz',
    'ceskarepublika.net',
    'hotspot.cz',
    'gmcjjh.org/DHL',
    'nerez-schodiste-zabradli.com',
    'nordiccountry.cz',
    'nowina.info',
    'obada-konstruktiwa.org',
    'otylkaaotesanek.cz',
    'pb-webdesign.net',
    'pension-helene.cz',
    'podzemi.myotis.info',
    'smrcek.com',
    'spekband.com',
    'm2132.ehgaugysd.net/zyso.cgi?18',
    'webcom-software.ws/links/?153646e8b0a88',
    'worldgymperu.com',
    'zgsysz.com',
    'oknarai.ru',
    'realinnovation.com/css/menu.js',
]

In [None]:


# Encode each sample URL with the fitted character tokenizer, then pad
# (at the end) to MAX_LEN, the length used for the model inputs.
test_sequences = char_tokenizer.texts_to_sequences(urls)
test_padded = pad_sequences(test_sequences, maxlen=MAX_LEN, padding='post')

# Run the LSTM and keep the highest-scoring class index per URL.
predictions = lstm_model.predict(test_padded)
predicted_classes = np.argmax(predictions, axis=1)

# Print one line per URL with its predicted class index.
for idx, url in enumerate(urls):
    pred_class = predicted_classes[idx]
    print(f"URL: {url} --> Predicted Class: {pred_class}")


In [None]:
print('#############################################')
print('######-Model =>\033[07m LSTM (Char-level) \033[0m')

# Tokenize the sample URLs character by character and pad them to MAX_LEN.
test_sequences = char_tokenizer.texts_to_sequences(urls)
test_padded = pad_sequences(test_sequences, maxlen=MAX_LEN, padding='post')

# Score the padded batch and take the most probable class for every row.
pred_probs = lstm_model.predict(test_padded)
predicted_classes = np.argmax(pred_probs, axis=1)

# Build the report lines first, then emit them one by one.
report_lines = [
    f"URL: {url} --> Predicted Class: {pred_class}"
    for url, pred_class in zip(urls, predicted_classes)
]
for report_line in report_lines:
    print(report_line)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Architecture, declared as a single layer list:
# embedding -> 1D convolution -> max pooling -> LSTM -> dropout -> dense head.
# The output layer has one softmax unit per column of the one-hot label matrix.
cnn_lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=MAX_LEN),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(y_char.shape[1], activation='softmax'),
])

# Categorical cross-entropy matches the one-hot targets; accuracy is tracked as the metric.
cnn_lstm_model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Fit for 10 epochs, reporting metrics on the test split after each epoch.
print("Training CNN + LSTM model...")
cnn_lstm_model.fit(
    X_train_char,
    y_train_char,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_char, y_test_char),
    verbose=1,
)

# Final score on the test split. This rebinds the shared names `loss` and `acc`.
loss, acc = cnn_lstm_model.evaluate(X_test_char, y_test_char, verbose=0)
print(f"\033[36mCNN + LSTM Accuracy (Char-level): {acc*100:.2f}%\033[0m")


In [None]:
print('#############################################')
print('######-Model =>\033[07m CNN + LSTM (Char-level) \033[0m')

# Character-level encoding of the sample URLs, padded at the end to MAX_LEN.
test_sequences = char_tokenizer.texts_to_sequences(urls)
test_padded = pad_sequences(test_sequences, maxlen=MAX_LEN, padding='post')

# Class scores from the CNN + LSTM model, reduced to one class index per URL.
pred_probs = cnn_lstm_model.predict(test_padded)
predicted_classes = np.argmax(pred_probs, axis=1)

# One output line per URL.
for idx, url in enumerate(urls):
    pred_class = predicted_classes[idx]
    print(f"URL: {url} --> Predicted Class: {pred_class}")


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Class indices predicted by the CNN + LSTM model and the matching ground truth.
y_pred_probs = cnn_lstm_model.predict(X_test_char)
y_pred = y_pred_probs.argmax(axis=1)
y_true = np.argmax(y_test_char, axis=1)

# Section banner, then the raw-count confusion matrix.
print('\033[01m             Confusion_matrix \033[0m')
cf_matrix = confusion_matrix(y_true, y_pred)

# Express every cell as a share of all test samples and plot it as percentages.
cf_matrix_share = cf_matrix / cf_matrix.sum()
sns.heatmap(cf_matrix_share, annot=True, fmt='0.2%', cmap='Blues')
plt.show()

# Per-class precision / recall / F1 summary.
print("\033[01mClassification Report:\033[0m")
print(classification_report(y_true, y_pred))

# Closing banner for this section.
print('\033[31m###################- End -###################\033[0m')


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, LayerNormalization, Dropout
from tensorflow.keras.layers import MultiHeadAttention, GlobalAveragePooling1D, Add

In [None]:
# --- Positional Encoding ---
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a fixed sinusoidal positional encoding to its input.

    The encoding table is computed once in the constructor with shape
    (1, max_len, d_model): sine is applied to the even-indexed feature
    columns and cosine to the odd-indexed ones. It holds no trainable weights.

    Args:
        max_len: Number of positions (rows) in the precomputed table.
        d_model: Number of feature columns in the table. It has to match the
            last dimension of the tensor passed to `call` for the addition
            to broadcast.
        **kwargs: Standard Keras layer arguments (e.g. `name`, `dtype`),
            forwarded to the base class. Keras passes these when it rebuilds
            the layer from a saved config.
    """

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.max_len = max_len
        self.d_model = d_model

        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]
        # Columns 2k and 2k+1 share the same rate, hence the i // 2.
        angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
        angle_rads = pos * angle_rates

        # Apply sin to even indices; cos to odd
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        self.pos_encoding = tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        """Return `x` plus the encoding rows for the first `x.shape[1]` positions."""
        # assumes x is (batch, seq_len, d_model) with seq_len <= max_len — confirm at call sites
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created."""
        config = super().get_config()
        config.update({"max_len": self.max_len, "d_model": self.d_model})
        return config

# --- Transformer Block ---
def transformer_block(x, num_heads, ff_dim, dropout=0.1):
    """Apply one self-attention + feed-forward encoder block to `x`.

    Both sub-layers are followed by dropout, a residual addition and layer
    normalization. The feed-forward part projects back to the input's last
    dimension so the second residual addition lines up.

    Args:
        x: Input Keras tensor; its last dimension sets the block's output width.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer. It is also passed as
            `key_dim` to the attention layer.
        dropout: Dropout rate used after attention and after the feed-forward part.

    Returns:
        A tensor with the same last dimension as `x`.
    """
    model_width = x.shape[-1]

    # Self-attention sub-layer: query and value are both the input itself.
    self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=ff_dim)
    attended = self_attention(x, x)
    attended = Dropout(dropout)(attended)
    attention_residual = LayerNormalization(epsilon=1e-6)(x + attended)

    # Position-wise feed-forward sub-layer: expand, project back, regularize.
    hidden = Dense(ff_dim, activation='relu')(attention_residual)
    projected = Dense(model_width)(hidden)
    projected = Dropout(dropout)(projected)

    block_output = LayerNormalization(epsilon=1e-6)(attention_residual + projected)
    return block_output

# --- Build Model ---
def build_transformer_model(vocab_size, max_len, num_classes,
                            embed_dim=128, num_heads=2, ff_dim=64, dropout=0.1):
    """Build (without compiling) the character-level Transformer classifier.

    Pipeline: embedding -> positional encoding -> one transformer block ->
    global average pooling over positions -> dropout/dense head -> softmax.

    Args:
        vocab_size: Size of the embedding vocabulary (`input_dim` of the embedding).
        max_len: Length of every input sequence; also the number of positions
            in the positional-encoding table.
        num_classes: Number of softmax output units.
        embed_dim: Embedding width. The same value is handed to the positional
            encoding so the two can never drift apart. Defaults to 128.
        num_heads: Attention heads in the transformer block. Defaults to 2.
        ff_dim: Feed-forward width of the transformer block. Defaults to 64.
        dropout: Dropout rate inside the transformer block. Defaults to 0.1.

    Returns:
        An uncompiled Keras `Model` mapping (batch, max_len) token ids to
        (batch, num_classes) class probabilities.
    """
    inputs = Input(shape=(max_len,))
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
    x = PositionalEncoding(max_len, embed_dim)(x)
    x = transformer_block(x, num_heads=num_heads, ff_dim=ff_dim, dropout=dropout)

    # Collapse the sequence axis, then classify with a small dense head.
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

In [None]:
# --- Instantiate and Compile ---
transformer_model = build_transformer_model(vocab_size, MAX_LEN, y_char.shape[1])
transformer_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# --- Train ---
# Early stopping with restore_best_weights=True picks the final weights from the
# validation loss. Monitoring the test split would select the model on the same
# data it is later scored on, inflating the reported test accuracy. A slice of
# the training data is held out for validation instead, so the test split is
# only used for the final evaluation.
# assumes X_train_char / y_train_char are in-memory arrays (validation_split does
# not accept datasets or generators) — TODO confirm
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)

print("\033[34mTraining Transformer Model...\033[0m")
transformer_model.fit(X_train_char, y_train_char,
                      epochs=20, batch_size=32,
                      validation_split=0.1,
                      callbacks=[early_stopping],
                      verbose=1)



In [None]:
# --- Evaluate ---
# Score the Transformer on the test split. The result is unpacked into the
# shared names `loss` and `acc`, which earlier evaluation cells also assign.
transformer_metrics = transformer_model.evaluate(
    X_test_char,
    y_test_char,
    verbose=0,
)
loss, acc = transformer_metrics
print(f"\033[36mTransformer Accuracy (Char-level): {acc*100:.2f}%\033[0m")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Transformer class scores for the test split.
y_pred_probs = transformer_model.predict(X_test_char)

# Reduce the scores and the one-hot labels to class indices.
y_pred = y_pred_probs.argmax(axis=1)
y_true = np.argmax(y_test_char, axis=1)

# Raw counts first, then each cell as a share of all test samples.
cf_matrix = confusion_matrix(y_true, y_pred)
cf_matrix_normalized = cf_matrix.astype('float') / np.sum(cf_matrix)

# Section banner.
print('\033[01m             Confusion Matrix \033[0m')

# Percentage heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cf_matrix_normalized, annot=True, fmt='.2%', cmap="Blues", ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Transformer Model - Confusion Matrix')
plt.show()

print('\033[31m###################- End -###################\033[0m')


In [None]:
# Classification Report
# Uses `y_true` / `y_pred` as left by the most recently executed confusion-matrix cell.
print('\n\033[01mClassification Report:\033[0m')
transformer_report = classification_report(y_true, y_pred)
print(transformer_report)

In [None]:
# Score all three models on the same test split with identical settings, so the
# comparison chart reads dedicated per-model variables rather than shared ones.
eval_settings = dict(x=X_test_char, y=y_test_char, verbose=0)

lstm_loss, lstm_acc = lstm_model.evaluate(**eval_settings)
cnn_lstm_loss, cnn_lstm_acc = cnn_lstm_model.evaluate(**eval_settings)
transformer_loss, transformer_acc = transformer_model.evaluate(**eval_settings)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Accuracies come from the dedicated per-model evaluation variables and are
# converted from fractions to percentages for display.
model_names = ['LSTM', 'CNN + LSTM', 'Transformer']
accuracies = [lstm_acc * 100, cnn_lstm_acc * 100, transformer_acc * 100]  # convert to percentage

# DataFrame
results_df = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracies
})

# Plotting
plt.figure(figsize=(8, 5))
sns.set_style("whitegrid")
plot = sns.barplot(x='Model', y='Accuracy', data=results_df, palette='mako')

# Keep the zoomed-in 90-100 view when every model clears 90%. Otherwise lower
# the axis floor below the weakest model, so no bar collapses to zero height
# or falls outside the visible range.
lowest_accuracy = min(accuracies)
y_axis_floor = 90 if lowest_accuracy > 90 else max(0.0, lowest_accuracy - 5)
plot.set_ylim(y_axis_floor, 100)

# Annotate bars with their value, anchored at the top of each bar
for bar in plot.patches:
    plot.annotate(format(bar.get_height(), '.2f') + '%',
                  (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                  ha='center', va='bottom', size=12)

plt.title("Model Accuracy Comparison", fontsize=16)
plt.xlabel("Model", fontsize=12)
plt.ylabel("Accuracy (%)", fontsize=12)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors

def plot_confusion_and_report(model, X_test, y_test, model_name="Model", class_names=None):
    """Print a classification report and plot a row-normalized confusion matrix.

    Args:
        model: Object with a `predict(X_test)` method returning one score per
            class for every sample (the class is taken with argmax over axis 1).
        X_test: Inputs forwarded unchanged to `model.predict`.
        y_test: One-hot encoded labels, shape (n_samples, n_classes).
        model_name: Label used in the printed header and the plot title.
        class_names: Optional sequence of display names, one per one-hot column
            in column order. When omitted, numeric class indices are shown.

    Returns:
        None. The report is printed and the figure is shown.
    """
    y_true = np.argmax(y_test, axis=1)
    y_pred = np.argmax(model.predict(X_test), axis=1)

    # Pin the label set to the one-hot width so the report and the matrix always
    # have one row per class, even if a class is absent from this split.
    labels = np.arange(np.shape(y_test)[1])
    # Explicit None/length check: truth-testing a NumPy array of names would raise.
    has_names = class_names is not None and len(class_names) > 0

    # Classification report
    print(f"\n\033[1mClassification Report - {model_name}:\033[0m")
    if has_names:
        print(classification_report(y_true, y_pred, labels=labels,
                                    target_names=list(class_names)))
    else:
        print(classification_report(y_true, y_pred, labels=labels))

    # Confusion matrix, normalized per true class (each non-empty row sums to 1).
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    row_totals = cm.sum(axis=1, keepdims=True)
    # Rows with no true samples stay at 0 instead of producing NaN from 0 / 0.
    cm_normalized = np.divide(cm, row_totals,
                              out=np.zeros(cm.shape, dtype=float),
                              where=row_totals != 0)

    # Custom colormap
    colors = ["#F5F5F5", "#FFD700", "#FFA500", "#FF4500", "#8B0000"]
    cmap = mcolors.ListedColormap(colors)

    # seaborn calls len() on the tick-label argument, so passing None raises.
    # "auto" is its documented default and falls back to the numeric indices.
    tick_labels = list(class_names) if has_names else "auto"

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap=cmap,
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.title(f'Normalized Confusion Matrix - {model_name}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.show()


In [None]:
# Display names for the classes, positionally matched to the one-hot label columns.
# NOTE(review): the `rem` mapping near the top of the notebook encodes
# benign=0, defacement=1, phishing=2, malware=3, whereas this list puts
# 'phishing' at index 1 and 'defacement' at index 2. If `y_char` was derived from
# that mapping, the two names are swapped in every report and heatmap — verify
# against the cell that builds `y_char` before trusting the per-class labels.
class_names = ['benign', 'phishing', 'defacement', 'malware']

# Same report + normalized confusion matrix for each trained model, in turn.
_models_to_report = (
    ("LSTM", lstm_model),
    ("CNN + LSTM", cnn_lstm_model),
    ("Transformer", transformer_model),
)
for _report_name, _report_model in _models_to_report:
    plot_confusion_and_report(_report_model, X_test_char, y_test_char,
                              model_name=_report_name, class_names=class_names)
