In [258]:
import datetime
import ipaddress
import re
import socket
import warnings
from urllib.parse import urljoin, urlparse

import dns.resolver
import pandas as pd
import requests
import whois
from bs4 import BeautifulSoup
from bs4 import MarkupResemblesLocatorWarning
from tldextract import extract

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

In [260]:
def having_ip(url):
    """Flag URLs whose host is a literal IP address rather than a domain name.

    Only the host component is inspected, so dotted numbers elsewhere in the
    URL (for example a version string in the path) are not mistaken for an
    address.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        1 if the host parses as an IPv4/IPv6 address, otherwise -1.
    """
    # urlparse only populates the host when the string has a scheme or starts
    # with "//", so scheme-less input such as "1.2.3.4/login" gets a "//" prefix.
    has_scheme = re.match(r'[A-Za-z][A-Za-z0-9+.\-]*://', url)
    candidate = url if has_scheme or url.startswith("//") else "//" + url
    try:
        host = urlparse(candidate).hostname
        # ip_address raises ValueError for anything that is not a valid address,
        # including the empty string used when no host could be found.
        ipaddress.ip_address(host or "")
    except ValueError:
        return -1
    return 1

In [262]:
def url_length(url):
    """Bucket a URL by its character count.

    Returns:
        -1 for fewer than 54 characters, 0 for 54 to 75 characters inclusive,
        and 1 for anything longer.
    """
    n_chars = len(url)
    if n_chars > 75:
        return 1
    if n_chars >= 54:
        return 0
    return -1

In [264]:
def url_shortening(url):
    """Flag URLs hosted on a known link-shortening service.

    The comparison is made against the parsed host name (exact match or a
    subdomain of a listed service). A plain substring test over the whole URL
    would flag unrelated sites, e.g. "t.co" occurs inside "microsoft.com".

    Args:
        url: URL string, with or without a scheme.

    Returns:
        1 if the host belongs to a listed shortener, otherwise -1.
    """
    shortening_hosts = ("bit.ly", "goo.gl", "tinyurl.com", "ow.ly", "t.co", "is.gd", "buff.ly")
    # urlparse only populates the host when the string has a scheme or starts with "//".
    has_scheme = re.match(r'[A-Za-z][A-Za-z0-9+.\-]*://', url)
    candidate = url if has_scheme or url.startswith("//") else "//" + url
    try:
        host = (urlparse(candidate).hostname or "").rstrip(".")
    except ValueError:
        # Malformed authority (e.g. an unbalanced "[") - cannot be a known shortener.
        return -1
    is_shortener = any(host == service or host.endswith("." + service)
                       for service in shortening_hosts)
    return 1 if is_shortener else -1

In [266]:
def having_at_symbol(url):
    """Return 1 when the URL contains an "@" character, otherwise -1."""
    if url.find("@") == -1:
        return -1
    return 1

In [268]:
def double_slash_redirect(url):
    """Return 1 when "//" occurs more than once in the URL, otherwise -1.

    Occurrences are counted without overlap (str.count semantics).
    """
    occurrences = url.count("//")
    if occurrences > 1:
        return 1
    return -1

In [270]:
def prefix_suffix(url):
    """Return 1 when the registered domain name contains a hyphen, otherwise -1.

    The tldextract result is read by attribute rather than unpacked into three
    names, so the function does not depend on how many fields the result
    carries (the same access style used elsewhere in this notebook).
    """
    domain = extract(url).domain
    return 1 if "-" in domain else -1

In [272]:
def sub_domain(url):
    """Score a URL by how many dots its subdomain part contains.

    The tldextract result is read by attribute rather than unpacked into three
    names, so the function does not depend on how many fields the result carries.

    Returns:
        -1 when the subdomain has no dot, 0 when it has exactly one,
        and 1 when it has two or more.
    """
    dots = extract(url).subdomain.count('.')
    if dots == 0:
        return -1
    if dots == 1:
        return 0
    return 1

In [274]:
def https_token(url):
    """Return 1 when "https" appears inside the subdomain or domain, otherwise -1.

    The tldextract result is read by attribute rather than unpacked into three
    names, so the function does not depend on how many fields the result carries.
    """
    parts = extract(url)
    if "https" in parts.subdomain or "https" in parts.domain:
        return 1
    return -1

In [276]:
def ssl_final_state(url):
    """Return 1 when the URL uses the https scheme, otherwise -1.

    The check requires the full "https://" prefix (case-insensitive), so a
    scheme-less host that merely begins with "https" (for example
    "https-secure.example.com") is not treated as an HTTPS URL.
    """
    return 1 if url.lower().startswith("https://") else -1

In [278]:
def domain_registration_length(url):
    """Score a domain by the span between its WHOIS creation and expiration dates.

    Args:
        url: URL or domain handed to ``whois.whois``.

    Returns:
        -1 when the span exceeds 365 days; 1 when it is shorter, when either
        date is missing, or when the lookup or date arithmetic fails.
    """
    try:
        w = whois.whois(url)
        if w.expiration_date and w.creation_date:
            # WHOIS fields may come back as a list of dates; use the first entry.
            exp = w.expiration_date[0] if isinstance(w.expiration_date, list) else w.expiration_date
            crt = w.creation_date[0] if isinstance(w.creation_date, list) else w.creation_date
            registered_days = (exp - crt).days
            return -1 if registered_days > 365 else 1
    except Exception:
        # Best-effort feature: any lookup/parsing failure yields the fallback
        # score, while KeyboardInterrupt/SystemExit still propagate.
        return 1
    return 1

In [280]:
def favicon(url):
    """Check whether the page's favicon is served from another site.

    Each ``<link rel="icon">`` href is resolved against ``url`` first, so a
    relative href such as "/favicon.ico" counts as belonging to the page's own
    site. Sites are compared by tldextract domain + suffix.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        1 when any icon link points to a different site or the page cannot be
        fetched/parsed; -1 otherwise (including when there is no icon link).
    """
    try:
        r = requests.get(url, timeout=3)
        soup = BeautifulSoup(r.text, "html.parser")
        page = extract(url)
        page_site = (page.domain.lower(), page.suffix.lower())
        for link in soup.find_all("link", rel="icon"):
            target = extract(urljoin(url, link.get("href", "")))
            if (target.domain.lower(), target.suffix.lower()) != page_site:
                return 1
        return -1
    except Exception:
        # Best-effort feature: failures score as suspicious, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 1

In [282]:
def port(url):
    """Check whether the URL's registered domain accepts TCP connections on port 80.

    The host is built from the tldextract domain and suffix (the subdomain is
    not used). Empty parts are skipped so a host without a suffix, such as a
    bare IP address, does not end with a trailing dot.

    Returns:
        -1 when the connection succeeds, 1 when it fails for any reason.
    """
    try:
        parts = extract(url)
        hostname = ".".join(piece for piece in (parts.domain, parts.suffix) if piece)
        # The context manager closes the probe socket instead of leaking it.
        with socket.create_connection((hostname, 80), timeout=2):
            return -1
    except Exception:
        # Best-effort feature: failures score as suspicious, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 1

In [284]:
def request_url(url):
    """Score a page by the share of its images that are loaded from another site.

    Each ``<img src>`` is resolved against ``url`` first, so relative sources
    count as belonging to the page's own site. Sites are compared by
    tldextract domain + suffix.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        -1 when there are no images or fewer than 22% are external,
        0 when 22% to 61% are external, 1 above that,
        and 0 when the page cannot be fetched or parsed.
    """
    try:
        r = requests.get(url, timeout=3)
        soup = BeautifulSoup(r.text, "html.parser")
        imgs = soup.find_all("img", src=True)
        if not imgs:
            return -1
        page = extract(url)
        page_site = (page.domain.lower(), page.suffix.lower())
        outside = 0
        for img in imgs:
            target = extract(urljoin(url, img['src']))
            if (target.domain.lower(), target.suffix.lower()) != page_site:
                outside += 1
        percent = outside / len(imgs) * 100
        if percent < 22:
            return -1
        if percent <= 61:
            return 0
        return 1
    except Exception:
        # Best-effort feature: failures score as neutral, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 0

In [286]:
def url_of_anchor(url):
    """Score a page by the share of its anchors that point to another site.

    Each ``<a href>`` is resolved against ``url`` first, so relative links
    (including fragment-only links) count as belonging to the page's own site.
    Sites are compared by tldextract domain + suffix.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        -1 when there are no anchors or fewer than 31% are external,
        0 when 31% to 67% are external, 1 above that,
        and 0 when the page cannot be fetched or parsed.
    """
    try:
        r = requests.get(url, timeout=3)
        soup = BeautifulSoup(r.text, "html.parser")
        anchors = soup.find_all("a", href=True)
        if not anchors:
            return -1
        page = extract(url)
        page_site = (page.domain.lower(), page.suffix.lower())
        outside = 0
        for a in anchors:
            target = extract(urljoin(url, a['href']))
            if (target.domain.lower(), target.suffix.lower()) != page_site:
                outside += 1
        percent = outside / len(anchors) * 100
        if percent < 31:
            return -1
        if percent <= 67:
            return 0
        return 1
    except Exception:
        # Best-effort feature: failures score as neutral, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 0

In [288]:
def links_in_tags(url):
    """Score a page by how many <meta>, <link> and <script> tags it contains.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        -1 when the combined tag count is below 10, 1 otherwise,
        and 0 when the page cannot be fetched or parsed.
    """
    try:
        r = requests.get(url, timeout=3)
        soup = BeautifulSoup(r.text, "html.parser")
        # One pass over the tree; a list of names matches any of the three tags.
        total = len(soup.find_all(["meta", "link", "script"]))
        return -1 if total < 10 else 1
    except Exception:
        # Best-effort feature: failures score as neutral, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 0

In [290]:
def sfh(url):
    """Placeholder for the SFH feature: always returns -1; ``url`` is not inspected."""
    return -1
def submitting_to_email(url):
    """Return 1 when the URL string contains "mailto:", otherwise -1."""
    if "mailto:" in url:
        return 1
    return -1
def abnormal_url(url):
    """Return 1 when the URL is shorter than 10 characters, otherwise -1."""
    if len(url) >= 10:
        return -1
    return 1
def redirect(url):
    """Return 1 when "//" occurs more than three times in the URL, otherwise -1."""
    double_slashes = url.count("//")
    if double_slashes > 3:
        return 1
    return -1
def on_mouseover(url):
    """Placeholder for the on_mouseover feature: always returns -1; ``url`` is not inspected."""
    return -1
def right_click(url):
    """Placeholder for the RightClick feature: always returns -1; ``url`` is not inspected."""
    return -1
def popup_window(url):
    """Placeholder for the pop-up window feature: always returns -1; ``url`` is not inspected."""
    return -1
def iframe(url):
    """Placeholder for the Iframe feature: always returns -1; ``url`` is not inspected."""
    return -1

In [292]:
def age_of_domain(url):
    """Score a domain by the number of days since its WHOIS creation date.

    Args:
        url: URL or domain handed to ``whois.whois``.

    Returns:
        -1 when the domain is older than 180 days; 1 when it is younger, when
        no creation date is available, or when the lookup fails.
    """
    try:
        w = whois.whois(url)
        # WHOIS fields may come back as a list of dates; use the first entry.
        crt = w.creation_date[0] if isinstance(w.creation_date, list) else w.creation_date
        if crt:
            # Take "now" in the creation date's own timezone (None -> naive local
            # time). Subtracting an aware datetime from a naive one raises
            # TypeError, which would otherwise be reported as a young domain.
            now = datetime.datetime.now(crt.tzinfo)
            age = (now - crt).days
            return -1 if age > 180 else 1
    except Exception:
        # Best-effort feature: failures score as suspicious, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 1
    return 1

In [294]:
def dns_record(url):
    """Check whether the URL's registered domain resolves to an A record.

    The queried name is the tldextract domain and suffix joined by a dot
    (the subdomain is not used).

    Returns:
        -1 when the lookup succeeds, 1 when it fails for any reason.
    """
    try:
        parts = extract(url)
        hostname = parts.domain + "." + parts.suffix
        dns.resolver.resolve(hostname, 'A')
        return -1
    except Exception:
        # Best-effort feature: failures score as suspicious, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 1

In [296]:
def web_traffic(url):
    """Placeholder (requires Alexa API): always returns 0; ``url`` is not inspected."""
    return 0
def page_rank(url):
    """Placeholder for the Page_Rank feature: always returns 0; ``url`` is not inspected."""
    return 0

In [298]:
def google_index(url):
    """Ask Google whether it has indexed the URL, using a ``site:`` query.

    The query is passed through ``params`` so requests URL-encodes it;
    concatenating the raw URL into the query string would let characters such
    as "&" or "#" in ``url`` truncate or alter the search.

    Returns:
        1 when the results page contains "did not match any documents",
        -1 when a 200 response does not contain it,
        and 0 when the request fails or returns a non-200 status
        (such a page carries no index information).
    """
    try:
        r = requests.get(
            "https://www.google.com/search",
            params={"q": "site:" + url},
            timeout=3,
        )
        if r.status_code != 200:
            return 0
        return 1 if "did not match any documents" in r.text else -1
    except Exception:
        # Best-effort feature: failures score as neutral, while
        # KeyboardInterrupt/SystemExit still propagate.
        return 0

In [300]:
def links_pointing(url):
    """Placeholder for the Links_pointing_to_page feature: always returns 0."""
    return 0
def statistical_report(url):
    """Placeholder for the Statistical_report feature: always returns 0."""
    return 0

In [302]:
def extract_features(url):
    """Run every feature function on ``url`` and collect the scores.

    Returns:
        dict mapping each dataset column name to the value its feature
        function returned, in the column order listed below.
    """
    # (column name, feature function) pairs; order fixes both the call order
    # and the column order of the resulting row.
    feature_table = (
        ("having_IP_Address", having_ip),
        ("URL_Length", url_length),
        ("Shortining_Service", url_shortening),
        ("having_At_Symbol", having_at_symbol),
        ("double_slash_redirecting", double_slash_redirect),
        ("Prefix_Suffix", prefix_suffix),
        ("having_Sub_Domain", sub_domain),
        ("HTTPS_token", https_token),
        ("SSLfinal_State", ssl_final_state),
        ("Domain_registeration_length", domain_registration_length),
        ("Favicon", favicon),
        ("Port", port),
        ("Request_URL", request_url),
        ("URL_of_Anchor", url_of_anchor),
        ("Links_in_tags", links_in_tags),
        ("SFH", sfh),
        ("Submitting_to_email", submitting_to_email),
        ("Abnormal_URL", abnormal_url),
        ("Redirect", redirect),
        ("on_mouseover", on_mouseover),
        ("RightClick", right_click),
        ("popUpWidnow", popup_window),
        ("Iframe", iframe),
        ("Age_of_domain", age_of_domain),
        ("DNSRecord", dns_record),
        ("web_traffic", web_traffic),
        ("Page_Rank", page_rank),
        ("Google_Index", google_index),
        ("Links_pointing_to_page", links_pointing),
        ("Statistical_report", statistical_report),
    )
    return {column: compute(url) for column, compute in feature_table}

In [304]:
# Hand-labelled sample: (URL, Result) pairs.
# NOTE(review): the IP-address and shortener URLs carry Result == -1 here, while
# the feature functions return 1 for those same traits - confirm the intended
# sign convention before training on this file.
urls = [
    ("https://www.google.com", 1),
    ("http://192.168.1.1/login", -1),
    ("https://bit.ly/phishlink", -1),
    ("https://www.wikipedia.org", 1),
    ("http://213.87.99.88/malicious", -1),
    ("http://tinyurl.com/badsite", -1),
]

# One record per URL: the feature columns first, then the URL and its label.
data = []
for url, label in urls:
    row = {**extract_features(url), "URL": url, "Result": label}
    data.append(row)

# Build the frame once from the list of records, then persist and preview it.
df = pd.DataFrame(data)
df.to_csv("phishing_30features_real.csv", index=False)
print("✅ Dataset created with 30 real features -> phishing_30features_real.csv")
print(df.head())

✅ Dataset created with 30 real features -> phishing_30features_real.csv
   having_IP_Address  URL_Length  Shortining_Service  having_At_Symbol  \
0                 -1          -1                  -1                -1   
1                  1          -1                  -1                -1   
2                 -1          -1                   1                -1   
3                 -1          -1                  -1                -1   
4                  1          -1                  -1                -1   

   double_slash_redirecting  Prefix_Suffix  having_Sub_Domain  HTTPS_token  \
0                        -1             -1                 -1           -1   
1                        -1             -1                 -1           -1   
2                        -1             -1                 -1           -1   
3                        -1             -1                 -1           -1   
4                        -1             -1                 -1           -1   

   SSLfinal_St