In [7]:
import pickle
import pandas as pd
from urllib.parse import urlparse
import re
import whois
import datetime
import socket
import requests
from bs4 import BeautifulSoup

# Load the trained phishing model
with open("phishing_model_v2.pkl", "rb") as file:
    model = pickle.load(file)

# Function to extract features correctly
def extract_features(url):
    parsed_url = urlparse(url)
    
    features = {
        "UsingIP": 1 if re.match(r"\d+\.\d+\.\d+\.\d+", parsed_url.netloc) else -1,
        "LongURL": 1 if len(url) > 75 else 0 if len(url) > 54 else -1,
        "ShortURL": 1 if "bit.ly" in url or "tinyurl" in url else -1,
        "Symbol@": 1 if "@" in url else -1,
        "Redirecting//": 1 if "//" in url[7:] else -1,
        "PrefixSuffix-": 1 if "-" in parsed_url.netloc else -1,
        "SubDomains": 1 if parsed_url.netloc.count(".") > 2 else 0 if parsed_url.netloc.count(".") == 2 else -1,
        "HTTPS": 1 if parsed_url.scheme == "https" else -1,
        "DomainRegLen": get_domain_age(url),
        "NonStdPort": check_non_standard_port(parsed_url.netloc),
        "HTTPSDomainURL": 1 if "https" in parsed_url.netloc else -1,
        "RequestURL": get_anchor_url_ratio(url),
        "AnchorURL": get_anchor_url_ratio(url),
        "LinksInScriptTags": check_status_bar(url),
        "ServerFormHandler": 0,  # Placeholder (Needs Web Scraping)
        "InfoEmail": 1 if "@" in url else -1,
        "AbnormalURL": check_abnormal_url(url),
        "WebsiteForwarding": 0,  # Placeholder (Needs Web Scraping)
        "StatusBarCust": check_status_bar(url),
        "DisableRightClick": 0,  # Placeholder (Needs Web Scraping)
        "UsingPopupWindow": -1,  # Placeholder (Needs Web Scraping)
        "IframeRedirection": get_iframe_redirection(url),
        "AgeofDomain": get_domain_age(url),
        "DNSRecording": get_dns_record(url),
        "WebsiteTraffic": -1,  # No API, using default value
        "PageRank": -1,  # No API, using default value
        "GoogleIndex": get_google_index(url),
        "LinksPointingToPage": -1,  # Placeholder (Needs Web Scraping)
        "StatsReport": 0,  # Placeholder (Needs Web Scraping)
    }
    
    return pd.DataFrame([features])

# Helper functions
def check_non_standard_port(domain):
    return 1 if ":" in domain else -1

def get_domain_age(url):
    """Fetch the domain age in days."""
    try:
        domain_info = whois.whois(url)
        creation_date = domain_info.creation_date
        if isinstance(creation_date, list):  
            creation_date = creation_date[0]
        age = (datetime.datetime.now() - creation_date).days if creation_date else 0
        return 1 if age > 365 else -1
    except:
        return -1

def get_dns_record(url):
    """Check if DNS record exists."""
    try:
        socket.gethostbyname(urlparse(url).netloc)
        return 1
    except:
        return -1

def get_google_index(url):
    """Check if a site is indexed by Google."""
    return 1  # Defaulting to indexed

def check_abnormal_url(url):
    """Check if the URL has suspicious characteristics."""
    suspicious_keywords = ["login", "bank", "secure", "verify", "update"]
    if any(word in url.lower() for word in suspicious_keywords) or re.search(r"\d+\.\d+\.\d+\.\d+", url):
        return 1  # Phishing characteristics
    return -1  # Legitimate

def get_anchor_url_ratio(url):
    """Check the percentage of external links in anchor tags."""
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, "html.parser")
        total_links = soup.find_all("a")
        external_links = [link for link in total_links if link.get("href") and not link.get("href").startswith(url)]
        ratio = len(external_links) / len(total_links) if total_links else 0
        return 1 if ratio > 0.7 else 0 if ratio > 0.3 else -1
    except:
        return -1

def check_status_bar(url):
    """Check if the status bar is being manipulated."""
    try:
        response = requests.get(url, timeout=5)
        if "onmouseover" in response.text.lower():
            return 1  # Phishing characteristic
        return -1
    except:
        return -1

def get_iframe_redirection(url):
    """Detects iframe-based redirection attacks."""
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, "html.parser")
        iframes = soup.find_all("iframe")
        return 1 if len(iframes) > 0 else -1
    except:
        return -1

# --- Prediction Process ---
if __name__ == "__main__":
    website_url = input("Enter a website URL: ")

    # Extract features
    website_features = extract_features(website_url)

    # Ensure features match training data
    expected_features = model.feature_names_in_
    website_features = website_features.reindex(columns=expected_features, fill_value=-1)

    # Predict
    prediction = model.predict(website_features)
    result = "Phishing" if prediction[0] == -1 else "Legitimate"

    print(f"\nThe website {website_url} is: {result}")


Enter a website URL:  WWW.google.com



The website WWW.google.com is: Legitimate


In [9]:
websites = ["https://github.com", "https://bbc.com", "https://amoozon.com", "http://phishinghjvhvgsite.com"]
for site in websites:
    features = extract_features(site)
    features = features.reindex(columns=model.feature_names_in_, fill_value=-1)
    prediction = model.predict(features)
    print(f"{site} is: {'Phishing' if prediction[0] == -1 else 'Legitimate'}")


https://github.com is: Legitimate
https://bbc.com is: Legitimate
https://amoozon.com is: Legitimate
http://phishinghjvhvgsite.com is: Legitimate


In [11]:
print(website_features)

   Index  UsingIP  LongURL  ShortURL  Symbol@  Redirecting//  PrefixSuffix-  \
0     -1       -1       -1        -1       -1             -1             -1   

   SubDomains  HTTPS  DomainRegLen  ...  DisableRightClick  UsingPopupWindow  \
0          -1     -1             1  ...                  0                -1   

   IframeRedirection  AgeofDomain  DNSRecording  WebsiteTraffic  PageRank  \
0                 -1            1             1              -1        -1   

   GoogleIndex  LinksPointingToPage  StatsReport  
0            1                   -1            0  

[1 rows x 31 columns]
