In [1]:
import pandas as pd

import os
from pathlib import Path

# Directory that holds the input files. It defaults to the location this
# notebook was originally run from; set the PHISHING_DATA_DIR environment
# variable to point somewhere else instead of editing the code.
DATA_DIR = Path(
    os.environ.get("PHISHING_DATA_DIR", r"C:\Users\ASUS\Desktop\Phishing_URL_detection")
)

# Main dataset: read_csv is handed the .zip directly and relies on pandas
# inferring the compression from the file extension.
df_main = pd.read_csv(DATA_DIR / "dataset.zip")
# Supplementary file (by its name, additional safe URLs).
df_safe = pd.read_csv(DATA_DIR / "safe_urls.csv")

# Stack both sources into a single frame with a fresh 0..n-1 index.
df = pd.concat([df_main, df_safe], ignore_index=True)

df.head()


Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [2]:
# Display the column index of the combined frame as a quick schema check.
df.columns


Index(['URL', 'Label'], dtype='object')

In [3]:
# Every textual spelling of a label mapped to the binary target:
# 0 for the legitimate-style spellings, 1 for the phishing-style spellings.
LABEL_TO_INT = {
    'good': 0, 'benign': 0, 'legitimate': 0, 'safe': 0,
    'bad': 1, 'phishing': 1, 'malicious': 1
}

# Keep only the needed columns; copy so the column assignment below writes to
# an independent frame rather than a view of the concatenated data.
df = df[['URL', 'Label']].copy()

# Explicit lookup instead of Series.replace: the original replace() call
# triggered a warning in the cell output (presumably the deprecation of
# replace()'s implicit object -> int downcast in recent pandas).
# Values missing from the mapping are passed through unchanged, exactly as
# replace() left them, and infer_objects() yields an integer column whenever
# every value ended up numeric.
df['Label'] = (
    df['Label']
    .map(lambda label: LABEL_TO_INT.get(label, label))
    .infer_objects()
)
df.head()


  df['Label'] = df['Label'].replace({


Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,1
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1
3,mail.printakid.com/www.online.americanexpress....,1
4,thewhiskeydregs.com/wp-content/themes/widescre...,1


In [4]:
# Count how many rows carry each label value to see the class balance.
df['Label'].value_counts()


Label
0    392934
1    156422
Name: count, dtype: int64

In [5]:
import re

def extract_features(url):
    """Compute simple lexical features for a single URL.

    Parameters
    ----------
    url : str
        The URL text. Missing values (``None`` or a float NaN, which is what
        pandas yields for empty cells) are treated as an empty string; any
        other non-string value is converted with ``str()``.

    Returns
    -------
    dict
        Integer-valued features, in this key order: ``url_length``,
        ``count_dots``, ``has_at``, ``has_https``, ``count_digits``,
        ``count_hyphens``, ``suspicious_keywords``.
    """
    if not isinstance(url, str):
        # NaN is the only float that compares unequal to itself.
        is_missing = url is None or (isinstance(url, float) and url != url)
        url = '' if is_missing else str(url)

    # Lower-case once and reuse it for every case-insensitive check.
    lowered = url.lower()

    suspicious_keywords = ['login', 'secure', 'update', 'account', 'verify', 'bank', 'signin']

    return {
        'url_length': len(url),
        'count_dots': url.count('.'),
        'has_at': int('@' in url),
        # NOTE(review): this matches "https" anywhere in the URL, not only as
        # the scheme. Kept as-is so features stay comparable with the data the
        # model was trained on -- confirm whether a scheme check is intended.
        'has_https': int("https" in lowered),
        'count_digits': sum(ch.isdigit() for ch in url),
        'count_hyphens': url.count('-'),
        # 1 when at least one keyword occurs as a substring, else 0.
        'suspicious_keywords': int(any(word in lowered for word in suspicious_keywords)),
    }


In [6]:
# One feature dict per URL, in the same order as the rows of df.
feature_list = [extract_features(url) for url in df['URL']]

# Stack the dicts into a table and attach the labels; assign() aligns the
# label Series on the index just like a plain column assignment does.
feature_df = pd.DataFrame(feature_list).assign(Label=df['Label'])

feature_df.head()


Unnamed: 0,url_length,count_dots,has_at,has_https,count_digits,count_hyphens,suspicious_keywords,Label
0,225,6,0,0,58,4,1,1
1,81,5,0,0,1,2,0,1
2,177,7,0,0,47,1,1,1
3,60,6,0,0,0,0,0,1
4,116,1,0,0,21,1,0,1


In [7]:
from sklearn.model_selection import train_test_split

# Separate the target column from the model inputs.
y = feature_df['Label']
X = feature_df.drop(columns='Label')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
from sklearn.preprocessing import StandardScaler


In [9]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so the test rows never influence the statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
from tensorflow.keras import models, layers, utils

# Seed the random number generators before any weights are created so that
# initialisation, and the shuffling performed later during training, is
# repeatable across kernel restarts. 42 matches the seed used for the
# train/test split.
utils.set_random_seed(42)

# Width of the input layer = number of feature columns after scaling.
n_features = X_train_scaled.shape[1]

# Small fully connected binary classifier: two hidden ReLU layers and a
# single sigmoid output unit.
model = models.Sequential([
    layers.Input(shape=(n_features,)),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy pairs with the sigmoid output; accuracy is tracked as
# the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


In [11]:
# Training settings gathered in one place: a fifth of the supplied training
# data is held out for validation, for 10 epochs with mini-batches of 32.
fit_options = {
    "validation_split": 0.2,
    "epochs": 10,
    "batch_size": 32,
}

history = model.fit(X_train_scaled, y_train, **fit_options)


Epoch 1/10
[1m10988/10988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1ms/step - accuracy: 0.8089 - loss: 0.4374 - val_accuracy: 0.8156 - val_loss: 0.4223
Epoch 2/10
[1m10988/10988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1ms/step - accuracy: 0.8213 - loss: 0.4164 - val_accuracy: 0.8228 - val_loss: 0.4138
Epoch 3/10
[1m10988/10988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1ms/step - accuracy: 0.8243 - loss: 0.4124 - val_accuracy: 0.8249 - val_loss: 0.4141
Epoch 4/10
[1m10988/10988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1ms/step - accuracy: 0.8262 - loss: 0.4101 - val_accuracy: 0.8257 - val_loss: 0.4094
Epoch 5/10
[1m10988/10988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1ms/step - accuracy: 0.8270 - loss: 0.4089 - val_accuracy: 0.8268 - val_loss: 0.4087
Epoch 6/10
[1m10988/10988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1ms/step - accuracy: 0.8276 - loss: 0.4080 - val_accuracy: 0.8272 - val_loss: 0.410

In [12]:
# Score the trained model on the held-out test split; the result unpacks into
# the loss followed by the accuracy metric.
test_metrics = model.evaluate(X_test_scaled, y_test)
loss, accuracy = test_metrics
print("Test Accuracy:", accuracy)


[1m3434/3434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 986us/step - accuracy: 0.8267 - loss: 0.4073
Test Accuracy: 0.8266710638999939


In [13]:
import joblib

# Output file names for the trained network and the fitted scaler; both are
# needed together to score new URLs the same way as during training.
MODEL_PATH = "phishing_model.h5"
SCALER_PATH = "scaler.pkl"

model.save(MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

print("Model & scaler saved successfully!")




Model & scaler saved successfully!
