In [1]:
import pandas as pd
import numpy as np
import re
import socket
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
# 1. Load dataset
# -----------------------------
# Path is relative to the notebook's working directory.
DATASET_PATH = "phishing.csv"
df = pd.read_csv(DATASET_PATH)

In [3]:
# Columns of the dataset used as model inputs. The order matters: the
# feature-extraction step for raw URLs must produce columns in this order.
selected_features = [
    # URL / hostname shape
    "length_url",
    "length_hostname",
    "ip",
    "nb_dots",
    "nb_hyphens",
    # Lexical red flags
    "https_token",
    "prefix_suffix",
    "shortening_service",
    "nb_subdomains",
    # External reputation signals
    "web_traffic",
    "page_rank",
    "dns_record",
]

In [4]:
# Feature matrix: only the selected columns, in the order listed above.
X = df[selected_features]

# Encode the target: 0 = legitimate, 1 = phishing.
LABEL_ENCODING = {"legitimate": 0, "phishing": 1}
y = df["status"].map(LABEL_ENCODING)

# Series.map yields NaN for any value missing from the mapping; fail loudly
# here instead of letting NaN labels surface later in the split or the fit.
_unmapped_status = df.loc[y.isna(), "status"].unique()
if len(_unmapped_status) > 0:
    raise ValueError(
        f"Unexpected values in 'status' column: {list(_unmapped_status)!r}; "
        f"expected one of {list(LABEL_ENCODING)!r}"
    )

In [5]:
# Hold out 20% of the rows for evaluation; stratify on the label so both
# partitions keep the same class balance, and fix the seed for repeatability.
_partitions = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = _partitions


In [6]:
# Two-stage model: standardise the features, then classify with a
# 100-tree random forest (seeded so repeated runs build the same forest).
_scaler_step = ("scaler", StandardScaler())
_forest_step = ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
pipeline = Pipeline([_scaler_step, _forest_step])

In [7]:
# Fit on the training partition, then report accuracy on the held-out rows.
pipeline.fit(X_train, y_train)

test_accuracy = pipeline.score(X_test, y_test)
print("✅ Model trained with accuracy on test set:", test_accuracy)

✅ Model trained with accuracy on test set: 0.9278215223097113


In [8]:
# 2. Feature extraction from raw URL
# -----------------------------
# Domains of known URL shorteners (bit.ly, tinyurl, etc.). Matching is done on
# the host only, on whole-domain boundaries, so "microsoft.com" is not
# mistaken for "t.co" and "netflix.com" is not mistaken for "x.co".
SHORTENER_DOMAINS = (
    "bit.ly", "goo.gl", "shorte.st", "go2l.ink", "x.co",
    "ow.ly", "t.co", "tinyurl.com", "tr.im", "is.gd",
)


def _host_of(url: str) -> str:
    """Return the host part of ``url`` without credentials or port.

    Falls back to the raw ``url`` string when no host can be isolated.
    """
    parsed = urlparse(url)
    if not parsed.netloc:
        # Scheme-less input ("example.com/path") has an empty netloc; a
        # leading "//" makes urlparse treat the first segment as the host.
        try:
            parsed = urlparse("//" + url)
        except ValueError:
            return url
    return parsed.hostname or url


def _is_ip_address(host: str) -> bool:
    """Return True when ``host`` is an IPv4 or IPv6 address literal."""
    try:
        socket.inet_aton(host)
        return True
    except (OSError, ValueError):
        pass
    try:
        socket.inet_pton(socket.AF_INET6, host)
        return True
    except (OSError, ValueError):
        return False


def _is_shortener(host: str) -> bool:
    """Return True when ``host`` is a known shortener domain or a subdomain of one."""
    host = host.lower().rstrip(".")
    return any(
        host == domain or host.endswith("." + domain)
        for domain in SHORTENER_DOMAINS
    )


def extract_features(url: str):
    """Compute the model's input features for a single raw URL.

    Parameters
    ----------
    url : str
        The URL to analyse, with or without a scheme.

    Returns
    -------
    pandas.DataFrame
        A one-row frame whose columns are, in order: length_url,
        length_hostname, ip, nb_dots, nb_hyphens, https_token, prefix_suffix,
        shortening_service, nb_subdomains, web_traffic, page_rank, dns_record.

    Raises
    ------
    ValueError
        Propagated from ``urlparse`` for malformed input (for example an
        unterminated IPv6 bracket).
    """
    hostname = _host_of(url)

    features = {}
    features["length_url"] = len(url)
    features["length_hostname"] = len(hostname)

    # 1 when the host is an IP address rather than a domain name. The port is
    # already stripped, so "192.168.0.1:8080" is recognised too.
    features["ip"] = 1 if _is_ip_address(hostname) else 0

    features["nb_dots"] = url.count(".")
    features["nb_hyphens"] = url.count("-")
    # "https" embedded in the host while the URL itself does not start with it.
    # NOTE(review): this definition and the other lexical features should be
    # confirmed against how the training dataset computed them.
    features["https_token"] = 1 if "https" in hostname and not url.startswith("https") else 0
    features["prefix_suffix"] = 1 if "-" in hostname else 0

    features["shortening_service"] = 1 if _is_shortener(hostname) else 0

    features["nb_subdomains"] = hostname.count(".")

    # Placeholders for features that need live lookups and are not computed.
    # NOTE(review): these constants are fed to the model as real observations;
    # confirm each value's meaning in the training data (in particular that
    # dns_record == 1 really encodes "record exists"), since a mismatch would
    # bias every prediction.
    features["web_traffic"] = 0
    features["page_rank"] = 0
    features["dns_record"] = 1  # Assume DNS record exists

    return pd.DataFrame([features])

In [9]:
# 3. Prediction function
# -----------------------------
def predict_url(url: str):
    """Classify a single URL with the trained pipeline.

    Parameters
    ----------
    url : str
        The raw URL to classify.

    Returns
    -------
    dict
        ``{"url": <input>, "prediction": "phishing" | "legitimate",
        "confidence": <float>}`` where confidence is the predicted class's
        probability rounded to two decimals.
    """
    X_new = extract_features(url)
    pred = pipeline.predict(X_new)[0]

    # predict_proba columns follow pipeline.classes_, so look up the column
    # for the predicted label instead of using the label value as an index.
    class_index = list(pipeline.classes_).index(pred)
    proba = pipeline.predict_proba(X_new)[0][class_index]

    label = "phishing" if pred == 1 else "legitimate"
    # float() turns the NumPy scalar into a plain Python float so the result
    # prints cleanly and can be serialised (for example with json).
    return {"url": url, "prediction": label, "confidence": round(float(proba), 2)}

In [10]:
# -----------------------------
# 4. Test on some URLs
# -----------------------------
test_urls = [
    # expected benign
    "https://google.com",
    # suspicious: IP address
    "http://192.168.0.1/login",
    # suspicious: hyphenated fake domain
    "http://paypal-secure-login.com",
    # suspicious: URL shortener
    "https://bit.ly/2kd92d",
]

In [11]:

# Print one prediction dict per sample URL.
for sample_url in test_urls:
    result = predict_url(sample_url)
    print(result)


{'url': 'https://google.com', 'prediction': 'phishing', 'confidence': 0.76}
{'url': 'http://192.168.0.1/login', 'prediction': 'phishing', 'confidence': 0.98}
{'url': 'http://paypal-secure-login.com', 'prediction': 'phishing', 'confidence': 0.93}
{'url': 'https://bit.ly/2kd92d', 'prediction': 'phishing', 'confidence': 0.81}
