# Train Model

In [2]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
import joblib
import sys

def train_and_save_model(dataset_path='dataset.csv', model_output_path='phishing_gradient_boosting_model.joblib'):
    """
    Loads the dataset, trains the Gradient Boosting model, and saves it to a file.
    """
    # Load the dataset
    try:
        df = pd.read_csv(dataset_path)
    except FileNotFoundError:
        print(f"Error: {dataset_path} not found.")
        sys.exit(1)

    # Prepare data
    if 'index' in df.columns:
        df = df.drop('index', axis=1)
    X = df.drop('Result', axis=1)
    y = df['Result']

    # --- MODEL CHANGE ---
    # We've replaced RandomForestClassifier with GradientBoostingClassifier
    # and given it stronger starting parameters.
    print("Training the Gradient Boosting model...")
    model = GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, max_depth=5, random_state=42)
    
    # Using .values is not strictly necessary for newer scikit-learn versions but is safe
    model.fit(X.values, y)
    print("Model training completed.")

    # Save the trained model to a file
    joblib.dump(model, model_output_path)
    print(f"Model saved successfully to {model_output_path}")

if __name__ == "__main__":
    train_and_save_model()

Training the Gradient Boosting model...
Model training completed.
Model saved successfully to phishing_gradient_boosting_model.joblib


# Test Model

In [None]:
import joblib
import whois
import requests
from bs4 import BeautifulSoup
import urllib.parse
from datetime import datetime
import dns.resolver
import re
import sys
import numpy as np # Needed for array operations, especially with probabilities
from collections import defaultdict # For easier data collection


# --- Helper Functions ---

def get_domain(url):
    try:
        return urllib.parse.urlparse(url).netloc
    except:
        return None

# ==============================================================================
# === NEW FUNCTION TO CHECK IF URL IS ACCESSIBLE ===
# ==============================================================================
def is_url_accessible(url):
    """
    Checks if a URL is reachable by sending a HEAD request.
    Returns True if the URL responds with a successful status code, False otherwise.
    """
    try:
        # Send a HEAD request to get only headers, which is faster.
        # Set a timeout to avoid waiting indefinitely.
        # Add a common User-Agent to avoid being blocked by some servers.
        response = requests.head(url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
        
        # Check for a successful status code (e.g., 200 OK)
        # We consider any 4xx (client error) or 5xx (server error) as inaccessible.
        if response.status_code < 400:
            return True
        else:
            return False
    except requests.exceptions.RequestException:
        # Catches connection errors, timeouts, etc.
        return False

# Modified to use User-Agent
def get_soup(url):
    try:
        response = requests.get(url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException:
        return None

# ==============================================================================
# === ADDITIONAL HELPER FUNCTIONS FOR DETAILED REPORTING ===
# ==============================================================================

def get_ip_address(domain):
    """Resolves a domain name to its primary IPv4 address."""
    try:
        answers = dns.resolver.resolve(domain, 'A')
        return answers[0].address
    except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN, dns.exception.Timeout):
        return "N/A"
    except Exception:
        return "N/A"

def get_all_ip_addresses(domain):
    """Resolves a domain name to all its IPv4 addresses."""
    ips = []
    try:
        answers = dns.resolver.resolve(domain, 'A')
        for rdata in answers:
            ips.append(rdata.address)
        return ips
    except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN, dns.exception.Timeout):
        return []
    except Exception:
        return []

def get_http_details(url):
    """Fetches HTTP status code, page size, web server, and content type."""
    details = {
        'HTTP Status Code': 'N/A',
        'Page Size': 'N/A',
        'Web Server': 'N/A',
        'Content Type': 'N/A',
    }
    try:
        response = requests.get(url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status() # Raise an exception for HTTP errors
        
        details['HTTP Status Code'] = response.status_code
        details['Page Size'] = len(response.content) # Page size in bytes
        details['Web Server'] = response.headers.get('Server', 'N/A')
        details['Content Type'] = response.headers.get('Content-Type', 'N/A').split(';')[0] # Remove charset info
        
    except requests.exceptions.RequestException:
        pass # Keep N/A for connection/HTTP errors
    return details

def get_whois_domain_age(domain):
    """Extracts creation date and calculates age from WHOIS."""
    try:
        w = whois.whois(domain)
        if w.creation_date:
            cre_date = w.creation_date[0] if isinstance(w.creation_date, list) else w.creation_date
            if isinstance(cre_date, datetime):
                age_delta = datetime.now() - cre_date
                years = age_delta.days // 365
                if years == 0:
                    return f"{age_delta.days} days ago"
                elif years == 1:
                    return "1 year ago"
                else:
                    return f"{years} years ago"
        return "N/A"
    except Exception:
        return "N/A"

def check_spf_dmarc(domain):
    """Checks for SPF and DMARC DNS records."""
    spf_found = False
    dmarc_found = False
    
    try:
        # Check SPF records (TXT records starting with "v=spf1")
        txt_records = dns.resolver.resolve(domain, 'TXT')
        for rdata in txt_records:
            for txt_string in rdata.strings:
                if b"v=spf1" in txt_string.lower():
                    spf_found = True
                    break
            if spf_found: break
    except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN, dns.exception.Timeout):
        pass # No TXT records or domain doesn't exist

    try:
        # Check DMARC record (_dmarc.domain TXT record)
        dmarc_records = dns.resolver.resolve(f"_dmarc.{domain}", 'TXT')
        for rdata in dmarc_records:
            for txt_string in rdata.strings:
                if b"v=dmarc1" in txt_string.lower():
                    dmarc_found = True
                    break
            if dmarc_found: break
    except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN, dns.exception.Timeout):
        pass # No DMARC record or domain doesn't exist
    
    if spf_found and dmarc_found:
        return "SPF and DMARC Found"
    elif spf_found:
        return "SPF Found"
    elif dmarc_found:
        return "DMARC Found"
    else:
        return "Not found"

# Placeholder for Free Hosted Content - requires more complex checks or a database
def is_free_hosted(domain):
    # This is a complex check, often involving lists of known free hosting providers.
    # For now, we'll return a placeholder.
    # Example: if "blogspot.com" in domain or "wixsite.com" in domain etc.
    free_hosts = ["blogspot.com", "wixsite.com", "weebly.com", "github.io", "netlify.app", "surge.sh"]
    if any(fh in domain for fh in free_hosts):
        return True
    return False # Default to false, as a comprehensive check is hard without external data

# ==============================================================================
# === FEATURE EXTRACTION FUNCTIONS (UNCHANGED) ===
# ==============================================================================

def having_ip_address(url):
    try:
        domain = get_domain(url)
        if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", domain): return -1
        return 1
    except: return -1

def url_length(url):
    if len(url) < 54: return 1
    elif 54 <= len(url) <= 75: return 0
    else: return -1

def shortening_service(url):
    shortening_services = ["bit.ly", "goo.gl", "t.co", "tinyurl.com", "is.gd", "cli.gs", "tr.im", "ow.ly", "tiny.cc"]
    domain = get_domain(url)
    if domain in shortening_services: return -1
    return 1

def having_at_symbol(url):
    if "@" in url: return -1
    return 1

def double_slash_redirecting(url):
    path = urllib.parse.urlparse(url).path
    if "//" in path and path.find("//") > 0: # More precise to catch internal //
        return -1
    return 1

def prefix_suffix(url):
    if "-" in get_domain(url): return -1
    return 1

def having_sub_domain(url):
    dots = get_domain(url).count('.')
    if dots == 2: return 0
    elif dots > 2: return -1
    return 1

def ssl_final_state(url):
    try:
        if url.startswith("https"): return 1
        return -1
    except: return -1

def domain_registration_length(url):
    try:
        domain = get_domain(url)
        w = whois.whois(domain)
        if w.expiration_date and w.creation_date:
            exp_date = w.expiration_date[0] if isinstance(w.expiration_date, list) else w.expiration_date
            cre_date = w.creation_date[0] if isinstance(w.creation_date, list) else w.creation_date
            if exp_date and cre_date:
                if (exp_date - cre_date).days / 365 <= 1: return -1
        return 1
    except: return -1

def age_of_domain(url): # This function now just returns -1/1 for model, not display string
    try:
        domain = get_domain(url)
        w = whois.whois(domain)
        if w.creation_date:
            cre_date = w.creation_date[0] if isinstance(w.creation_date, list) else w.creation_date
            if (datetime.now() - cre_date).days < 180: return -1
        return 1
    except: return -1

def dns_record(url):
    try:
        dns.resolver.resolve(get_domain(url), 'A'); return 1
    except: return -1

def abnormal_url(url):
    try:
        domain = get_domain(url)
        w = whois.whois(domain)
        if domain.lower() not in str(w).lower(): return -1
        return 1
    except: return -1

def web_traffic(url): return 0
def page_rank(url): return 0
def google_index(url): return 1
def links_pointing_to_page(url): return 0
def statistical_report(url): return 1
def favicon(url): return 1
def port(url): return 1
def https_token(url): return 1

def request_url(url):
    soup = get_soup(url)
    if not soup: return -1
    domain = get_domain(url)
    image_count = 0
    external_image_count = 0
    for img in soup.find_all('img'):
        if img.has_attr('src'):
            image_count += 1
            src_domain = get_domain(urllib.parse.urljoin(url, img['src']))
            if src_domain != domain:
                external_image_count += 1
    if image_count == 0: return 1
    percentage = (external_image_count / image_count) * 100
    if percentage < 22.0: return 1
    elif 22.0 <= percentage < 61.0: return 0
    else: return -1

def url_of_anchor(url):
    soup = get_soup(url)
    if not soup: return -1
    domain = get_domain(url)
    anchor_count = 0
    external_anchor_count = 0
    for a in soup.find_all('a'):
        if a.has_attr('href'):
            anchor_count += 1
            href = a['href']
            if href.startswith('#') or href.startswith('mailto:') or 'javascript:void(0)' in href.lower():
                anchor_count -= 1
                continue
            href_domain = get_domain(urllib.parse.urljoin(url, href))
            if href_domain != domain:
                external_anchor_count += 1
    if anchor_count == 0: return 1
    percentage = (external_anchor_count / anchor_count) * 100
    if percentage < 31.0: return 1
    elif 31.0 <= percentage < 67.0: return 0
    else: return -1

def links_in_tags(url): # Placeholder for this feature (adjust if you have real logic)
    # This feature usually involves examining link distribution in HTML tags like <A>, <Link>, <Meta>
    # For now, a neutral score.
    return 0 

def sfh(url): # Placeholder for SFH (Server Form Handler)
    # Checks if form action is to an external domain or looks suspicious
    # Requires parsing forms and their actions. For now, neutral.
    return 0

def submitting_to_email(url): # Placeholder
    # Checks for mailto: links in form actions.
    return 1

def redirect(url): # Placeholder
    # Checks for multiple redirections. requests library handles this internally.
    return 1

def on_mouseover(url): # Placeholder
    # Checks for status bar changes on mouseover (JS based). Hard to detect without browser.
    return 1

def right_click(url): # Placeholder
    # Checks for disabled right-click. Hard to detect without browser.
    return 1

def popup_window(url): # Placeholder
    # Checks for pop-up windows. Hard to detect without browser.
    return 1

def iframe(url): # Placeholder
    # Checks for iframes.
    soup = get_soup(url)
    if soup and soup.find('iframe'):
        return -1 # Presence of iframe can sometimes be suspicious
    return 1


# --- NEW CONTENT-BASED FEATURES (for rule-based enhancement) ---

def has_suspicious_keywords(url):
    soup = get_soup(url)
    if not soup: return 0
    suspicious_words = ["login", "password", "verify", "account", "update", "security", "confirm", "bank", "credit card", "paypal", "alert", "urgent"]
    page_text = soup.get_text().lower()
    found_count = sum(1 for word in suspicious_words if word in page_text)
    if found_count >= 3: return -1
    elif found_count >= 1: return 0
    else: return 1

def has_login_form(url):
    soup = get_soup(url)
    if not soup: return 0
    forms = soup.find_all('form')
    if not forms: return 1
    for form in forms:
        password_input = form.find('input', {'type': 'password'})
        email_input = form.find('input', {'type': 'email'})
        text_input_name = form.find('input', {'type': 'text', 'name': re.compile(r'user|login|account|email|username', re.I)})
        if password_input or email_input or text_input_name:
            action = form.get('action')
            if action:
                action_domain = get_domain(urllib.parse.urljoin(url, action))
                if action_domain != get_domain(url) or not action_domain: return -1
            else: return -1
    return 1

def external_link_ratio(url):
    soup = get_soup(url)
    if not soup: return 0
    main_domain = get_domain(url)
    total_links = 0
    external_domains = set()
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('#') or href.startswith('mailto:') or 'javascript:void(0)' in href.lower(): continue
        full_href = urllib.parse.urljoin(url, href)
        link_domain = get_domain(full_href)
        if link_domain and link_domain != main_domain: external_domains.add(link_domain)
        total_links += 1
    if total_links == 0: return 1
    ratio = len(external_domains) / total_links
    if ratio > 0.4: return -1
    elif ratio > 0.1: return 0
    else: return 1


# --- Define the order of your original 30 features ---
# This order MUST match the order of features your model was trained on from dataset.csv
ORIGINAL_30_FEATURES_ORDER = [
    having_ip_address, url_length, shortening_service, having_at_symbol,
    double_slash_redirecting, prefix_suffix, having_sub_domain, ssl_final_state,
    domain_registration_length, favicon, port, https_token, request_url,
    url_of_anchor, links_in_tags, sfh, submitting_to_email, abnormal_url,
    redirect, on_mouseover, right_click, popup_window, iframe,
    age_of_domain, dns_record, web_traffic, page_rank, google_index,
    links_pointing_to_page, statistical_report
]

# --- Define the 3 ADDITIONAL content-based features for rule-based override ---
ADDITIONAL_3_FEATURES = [
    has_suspicious_keywords, has_login_form, external_link_ratio
]


# =================================================================
# === MODIFIED PREDICTION LOGIC WITH 30-FEATURE MODEL + RULE-BASED OVERRIDE ===
# =================================================================
def predict_url_with_enhancement(model_30_features, url, safe_confidence_threshold=0.94):
    """
    Predicts the class of a URL using a 30-feature model, then enhances the decision
    with 3 additional content-based features through a rule-based system.
    Also collects comprehensive URL details for a structured report.
    """
    
    report_details = defaultdict(lambda: 'N/A')
    report_details['Domain'] = get_domain(url)
    report_details['Full URL'] = url

    # --- Step 1: Accessibility Check ---
    if not is_url_accessible(url):
        prob_dist = {-1: 1.0, 0: 0.0, 1: 0.0}
        report_details['Overall URL risk classification'] = "Unreachable"
        report_details['Suspicious Activity'] = "URL Unreachable"
        return "Unsafe (URL Unreachable)", prob_dist, 1.0, "URL Unreachable", report_details

    # --- Collect Additional Report Details Proactively ---
    # These calls might be redundant if the feature functions already do this,
    # but ensures they are captured for the report.
    domain = get_domain(url)
    
    report_details['IP Address'] = get_ip_address(domain)
    report_details['DNS A Records'] = ", ".join(get_all_ip_addresses(domain)) if get_all_ip_addresses(domain) else "Not found"
    
    http_details = get_http_details(url)
    report_details['HTTP Status Code'] = http_details['HTTP Status Code']
    report_details['Page Size'] = f"{http_details['Page Size']} bytes" if http_details['Page Size'] != 'N/A' else 'N/A'
    report_details['Web Server'] = http_details['Web Server']
    report_details['Content Type'] = http_details['Content Type']
    
    report_details['Domain Age'] = get_whois_domain_age(domain)
    report_details['Free Hosted Content'] = is_free_hosted(domain)
    report_details['SPF/DMARC Record'] = check_spf_dmarc(domain)


    # --- Step 2: Extract ORIGINAL 30 Features for the model ---
    features_30 = [func(url) for func in ORIGINAL_30_FEATURES_ORDER]
    
    initial_label = "Unknown"
    confidence = 0.0
    prob_dist = {-1: 0.0, 0: 0.0, 1: 0.0} # Default for error case

    try:
        features_array_30 = np.array(features_30).reshape(1, -1)
        
        # Get prediction and probabilities from the 30-feature model
        prediction_30 = model_30_features.predict(features_array_30)[0]
        probabilities_array_30 = model_30_features.predict_proba(features_array_30)[0]
        
        prob_dist = {model_30_features.classes_[i]: probabilities_array_30[i] 
                     for i in range(len(model_30_features.classes_))}
        confidence = max(probabilities_array_30)
        
        label_map = {1: "Safe", 0: "Neutral", -1: "Unsafe (Phishing)"}
        initial_label = label_map.get(prediction_30, "Unknown")
        
    except Exception as e:
        print(f"Error during 30-feature model prediction: {e}", file=sys.stderr)
        report_details['Overall URL risk classification'] = "Error"
        return f"Error: {e}", {-1: 0.0, 0: 0.0, 1: 0.0}, 0.0, "Model Error", report_details

    # --- Step 3: Apply Conservative Confidence Threshold on initial prediction ---
    final_label = initial_label
    decision_reason = "30-Feature Model Prediction"
    classification_status = initial_label # For overall status

    if initial_label == "Safe" and confidence < safe_confidence_threshold:
        final_label = "Unsafe (Phishing - Low Confidence Safety)"
        decision_reason = "Conservative Confidence Threshold"
        classification_status = "Suspicious"

    # --- Step 4: Extract and use the 3 ADDITIONAL features for further override ---
    # This section gets executed AFTER the 30-feature model's prediction and initial thresholding.
    
    # Only if the current decision is not already "Unsafe" (or an error),
    # do we check the additional features to potentially override to "Unsafe".
    if "Unsafe" not in final_label and "Error" not in final_label:
        
        keyword_score = has_suspicious_keywords(url)
        login_form_score = has_login_form(url)
        external_link_ratio_score = external_link_ratio(url)
        
        suspicious_activity_reasons = []

        if keyword_score == -1: suspicious_activity_reasons.append("Suspicious Keywords Detected")
        if login_form_score == -1: suspicious_activity_reasons.append("Login Form Detected")
        if external_link_ratio_score == -1: suspicious_activity_reasons.append("High External Link Ratio")

        if suspicious_activity_reasons:
            final_label = "Unsafe (Phishing - Content Alert!)"
            decision_reason = "Content-Based Feature Override"
            classification_status = "Suspicious"
            report_details['Suspicious Activity'] = ", ".join(suspicious_activity_reasons)
        elif keyword_score == 0 or login_form_score == 0 or external_link_ratio_score == 0:
            if "Suspicious Activity" == report_details['Suspicious Activity']: # If not already set by strong signals
                report_details['Suspicious Activity'] = "Some Content Features Neutral/Minor Suspicion"
            classification_status = "Suspicious"
            if "Unsafe" not in final_label: # Don't downgrade if already unsafe
                 final_label = "Neutral (Content Minor Suspicion)"


    # Map final label to risk classification for the report
    if "Unsafe" in final_label:
        report_details['Overall URL risk classification'] = "Suspicious"
        report_details['Phishing'] = "Phishing Suspected"
        if report_details['Suspicious Activity'] == 'N/A':
            report_details['Suspicious Activity'] = "Model Indication"
    elif "Neutral" in final_label:
        report_details['Overall URL risk classification'] = "Suspicious"
        report_details['Phishing'] = "Possibly benign, check manually"
        if report_details['Suspicious Activity'] == 'N/A':
            report_details['Suspicious Activity'] = "Neutral Model Indication"
    else: # Safe
        report_details['Overall URL risk classification'] = "Clean URL - SAFE"
        report_details['Phishing'] = "No Phishing Issues"
        report_details['Suspicious Activity'] = "No Suspicious Activity"

    # Risk Score: Invert confidence (0-100 scale)
    report_details['Risk Score'] = f"{int((1 - confidence) * 100)} - {report_details['Overall URL risk classification'].split(' - ')[-1]}"
    if report_details['Risk Score'] == "0 - SAFE": # Make sure it's not "0 - Suspicious"
        report_details['Risk Score'] = "0 - Clean"

    # Placeholder for Malware/Spamming/Parked
    report_details['Malware'] = "No Malware Issues" # This would need a separate malware scanning integration
    report_details['Spamming Domain'] = "No SPAM Issues" # This would need a separate spam detection integration
    report_details['Parked Domain'] = "Not Parked" # This would need more sophisticated detection

    return final_label, prob_dist, confidence, decision_reason, report_details


# --- Main Execution ---
if __name__ == "__main__":
    MODEL_PATH = 'phishing_gradient_boosting_model.joblib'
    try:
        # Load the model that was trained on 30 features
        model_30_features = joblib.load(MODEL_PATH)
        # CRITICAL CHECK: Ensure the loaded model truly expects 30 features
        expected_features_count = 30 # Assuming your original dataset.csv had exactly 30 feature columns
        if model_30_features.n_features_in_ != expected_features_count:
            print(f"Error: Loaded model expects {model_30_features.n_features_in_} features, but this script is designed for a {expected_features_count}-feature model.")
            print(f"Please ensure '{MODEL_PATH}' was trained using exactly {expected_features_count} features from your dataset.csv (excluding 'Result' and 'index').")
            print("You might need to re-run your 'train_and_save_model.py' if it was accidentally trained on a different number of features.")
            sys.exit(1)
            
    except FileNotFoundError:
        print(f"Model file '{MODEL_PATH}' not found. Please run your training script ('train_and_save_model.py') first to create it.")
        sys.exit(1)
    except Exception as e:
        print(f"Error loading model: {e}")
        sys.exit(1)

    print("Phishing URL Detector is ready. Enter a URL to check.")
    while True:
        test_url = input("Enter a URL (or type 'exit' to quit): ")
        if test_url.lower() == 'exit':
            break
        if not test_url.startswith(('http://', 'https://')):
            test_url = 'https://' + test_url
        
        final_label, prob_dist, confidence, decision_reason, report_details = predict_url_with_enhancement(
            model_30_features, test_url, safe_confidence_threshold=0.98 # Adjust as needed
        )
        
        # --- Print the Detailed Report ---
        print(f"\nURL Scanning Details for {report_details['Full URL']}")
        print(f"Overall URL risk classification. {report_details['Overall URL risk classification']}")
        print(f"Suspicious Activity. {report_details['Suspicious Activity']}")
        print(f"Domain. {report_details['Domain']}")
        print(f"IP Address. {report_details['IP Address']}")
        print(f"Malware. {report_details['Malware']}")
        print(f"Phishing. {report_details['Phishing']}")
        print(f"Risk Score. {report_details['Risk Score']}")
        print(f"Parked Domain. {report_details['Parked Domain']}")
        print(f"Spamming Domain. {report_details['Spamming Domain']}")
        print(f"Domain Trust Rating. Login to View") # Placeholder as before
        print(f"Domain Age. {report_details['Domain Age']}")
        print(f"HTTP Status Code. {report_details['HTTP Status Code']}")
        print(f"Page Size. {report_details['Page Size']}")
        print(f"Web Server. {report_details['Web Server']}")
        print(f"Content Type. {report_details['Content Type']}")
        print(f"Free Hosted Content. {report_details['Free Hosted Content']}")
        print(f"SPF/DMARC Record. {report_details['SPF/DMARC Record']}")
        print(f"DNS A Records. {report_details['DNS A Records']}")
        
        print("\n--- Model's Internal Details ---")
        print(f"Model's Final Decision: {final_label}")
        print(f"Decision Based On: {decision_reason}")
        print(f"Model's Highest Confidence (from 30 features): {confidence:.2%}")
        print("Model's Individual Class Probabilities (from 30 features):")
        for class_val in sorted(prob_dist.keys()):
            class_label_map = {1: "Safe", 0: "Neutral", -1: "Unsafe (Phishing)"}
            print(f"  {prob_dist[class_val]:.1%} chance of being {class_val} ({class_label_map.get(class_val, 'Unknown')})")
        print("-" * 30 + "\n")

Phishing URL Detector is ready. Enter a URL to check.


Enter a URL (or type 'exit' to quit):  https://www.youtube.com/



URL Scanning Details for https://www.youtube.com/
Overall URL risk classification. Clean URL - SAFE
Suspicious Activity. No Suspicious Activity
Domain. www.youtube.com
IP Address. 142.251.223.142
Malware. No Malware Issues
Phishing. No Phishing Issues
Risk Score. 0 - Clean
Parked Domain. Not Parked
Spamming Domain. No SPAM Issues
Domain Trust Rating. Login to View
Domain Age. 20 years ago
HTTP Status Code. 200
Page Size. 667486 bytes
Web Server. ESF
Content Type. text/html
Free Hosted Content. False
SPF/DMARC Record. Not found
DNS A Records. 142.250.193.46, 142.250.67.78, 142.251.223.142, 216.58.200.174, 216.58.196.110, 172.217.27.174, 142.250.182.110, 172.217.167.206, 172.217.166.14, 172.217.26.110, 142.250.193.14, 142.250.183.238, 142.250.182.46, 172.217.26.14, 142.250.193.142, 172.217.26.46

--- Model's Internal Details ---
Model's Final Decision: Safe
Decision Based On: 30-Feature Model Prediction
Model's Highest Confidence (from 30 features): 99.93%
Model's Individual Class Proba

Enter a URL (or type 'exit' to quit):  https://lms.vit.ac.in/login/index.php



URL Scanning Details for https://lms.vit.ac.in/login/index.php
Overall URL risk classification. Unreachable
Suspicious Activity. URL Unreachable
Domain. lms.vit.ac.in
IP Address. N/A
Malware. N/A
Phishing. N/A
Risk Score. N/A
Parked Domain. N/A
Spamming Domain. N/A
Domain Trust Rating. Login to View
Domain Age. N/A
HTTP Status Code. N/A
Page Size. N/A
Web Server. N/A
Content Type. N/A
Free Hosted Content. N/A
SPF/DMARC Record. N/A
DNS A Records. N/A

--- Model's Internal Details ---
Model's Final Decision: Unsafe (URL Unreachable)
Decision Based On: URL Unreachable
Model's Highest Confidence (from 30 features): 100.00%
Model's Individual Class Probabilities (from 30 features):
  100.0% chance of being -1 (Unsafe (Phishing))
  0.0% chance of being 0 (Neutral)
  0.0% chance of being 1 (Safe)
------------------------------



Enter a URL (or type 'exit' to quit):  https://ahcswh.com/v2/check



URL Scanning Details for https://ahcswh.com/v2/check
Overall URL risk classification. Clean URL - SAFE
Suspicious Activity. No Suspicious Activity
Domain. ahcswh.com
IP Address. 43.165.132.68
Malware. No Malware Issues
Phishing. No Phishing Issues
Risk Score. 0 - Clean
Parked Domain. Not Parked
Spamming Domain. No SPAM Issues
Domain Trust Rating. Login to View
Domain Age. 347 days ago
HTTP Status Code. 200
Page Size. 1621 bytes
Web Server. nginx/1.26.3
Content Type. text/html
Free Hosted Content. False
SPF/DMARC Record. Not found
DNS A Records. 43.165.132.68

--- Model's Internal Details ---
Model's Final Decision: Safe
Decision Based On: 30-Feature Model Prediction
Model's Highest Confidence (from 30 features): 99.92%
Model's Individual Class Probabilities (from 30 features):
  0.1% chance of being -1 (Unsafe (Phishing))
  99.9% chance of being 1 (Safe)
------------------------------



Enter a URL (or type 'exit' to quit):  https://github.com/



URL Scanning Details for https://github.com/
Overall URL risk classification. Suspicious
Suspicious Activity. Suspicious Keywords Detected
Domain. github.com
IP Address. 20.207.73.82
Malware. No Malware Issues
Phishing. Phishing Suspected
Risk Score. 0 - Suspicious
Parked Domain. Not Parked
Spamming Domain. No SPAM Issues
Domain Trust Rating. Login to View
Domain Age. 17 years ago
HTTP Status Code. 200
Page Size. 560602 bytes
Web Server. github.com
Content Type. text/html
Free Hosted Content. False
SPF/DMARC Record. SPF and DMARC Found
DNS A Records. 20.207.73.82

--- Model's Internal Details ---
Model's Final Decision: Unsafe (Phishing - Content Alert!)
Decision Based On: Content-Based Feature Override
Model's Highest Confidence (from 30 features): 99.94%
Model's Individual Class Probabilities (from 30 features):
  0.1% chance of being -1 (Unsafe (Phishing))
  99.9% chance of being 1 (Safe)
------------------------------



Enter a URL (or type 'exit' to quit):  https://baggio130.tithelysetup.com/pitcho



URL Scanning Details for https://baggio130.tithelysetup.com/pitcho
Overall URL risk classification. Suspicious
Suspicious Activity. Neutral Model Indication
Domain. baggio130.tithelysetup.com
IP Address. 52.35.132.113
Malware. No Malware Issues
Phishing. Possibly benign, check manually
Risk Score. 1 - Suspicious
Parked Domain. Not Parked
Spamming Domain. No SPAM Issues
Domain Trust Rating. Login to View
Domain Age. 6 years ago
HTTP Status Code. N/A
Page Size. N/A
Web Server. N/A
Content Type. N/A
Free Hosted Content. False
SPF/DMARC Record. Not found
DNS A Records. 52.35.132.113

--- Model's Internal Details ---
Model's Final Decision: Neutral (Content Minor Suspicion)
Decision Based On: 30-Feature Model Prediction
Model's Highest Confidence (from 30 features): 98.34%
Model's Individual Class Probabilities (from 30 features):
  1.7% chance of being -1 (Unsafe (Phishing))
  98.3% chance of being 1 (Safe)
------------------------------



Enter a URL (or type 'exit' to quit):  https://lombardi12.kinsta.cloud/patcho



URL Scanning Details for https://lombardi12.kinsta.cloud/patcho
Overall URL risk classification. Unreachable
Suspicious Activity. URL Unreachable
Domain. lombardi12.kinsta.cloud
IP Address. N/A
Malware. N/A
Phishing. N/A
Risk Score. N/A
Parked Domain. N/A
Spamming Domain. N/A
Domain Trust Rating. Login to View
Domain Age. N/A
HTTP Status Code. N/A
Page Size. N/A
Web Server. N/A
Content Type. N/A
Free Hosted Content. N/A
SPF/DMARC Record. N/A
DNS A Records. N/A

--- Model's Internal Details ---
Model's Final Decision: Unsafe (URL Unreachable)
Decision Based On: URL Unreachable
Model's Highest Confidence (from 30 features): 100.00%
Model's Individual Class Probabilities (from 30 features):
  100.0% chance of being -1 (Unsafe (Phishing))
  0.0% chance of being 0 (Neutral)
  0.0% chance of being 1 (Safe)
------------------------------



Enter a URL (or type 'exit' to quit):  https://vtopcc.vit.ac.in/



URL Scanning Details for https://vtopcc.vit.ac.in/
Overall URL risk classification. Unreachable
Suspicious Activity. URL Unreachable
Domain. vtopcc.vit.ac.in
IP Address. N/A
Malware. N/A
Phishing. N/A
Risk Score. N/A
Parked Domain. N/A
Spamming Domain. N/A
Domain Trust Rating. Login to View
Domain Age. N/A
HTTP Status Code. N/A
Page Size. N/A
Web Server. N/A
Content Type. N/A
Free Hosted Content. N/A
SPF/DMARC Record. N/A
DNS A Records. N/A

--- Model's Internal Details ---
Model's Final Decision: Unsafe (URL Unreachable)
Decision Based On: URL Unreachable
Model's Highest Confidence (from 30 features): 100.00%
Model's Individual Class Probabilities (from 30 features):
  100.0% chance of being -1 (Unsafe (Phishing))
  0.0% chance of being 0 (Neutral)
  0.0% chance of being 1 (Safe)
------------------------------



# Accuracy test

In [8]:
import pandas as pd
import joblib
import sys
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter!
import whois
import requests
from bs4 import BeautifulSoup
import urllib.parse
from datetime import datetime
import dns.resolver
import re

# ==============================================================================
# === 1. CONFIGURE YOUR FILE DETAILS AND SAMPLE SIZE HERE ===
# ==============================================================================
FILE_CONFIG = {
    "path": "balanced_urls.csv",      # <--- CHANGE THIS to your test file name
    "url_column": "url",              # <--- CHANGE THIS to the name of the URL column
    "label_column": "type",           # <--- CHANGE THIS to the name of the label column
    "benign_label_value": "benign"      # <--- CHANGE THIS to the value for a safe URL
}

# Set the number of random URLs you want to test
SAMPLE_SIZE = 30 # <--- YOU CAN CHANGE THIS NUMBER
# ==============================================================================
# === 2. ALL FEATURE EXTRACTION FUNCTIONS (Self-Contained) ===
# ==============================================================================

# --- Helper Functions ---
def get_domain(url):
    try: return urllib.parse.urlparse(url).netloc
    except: return None

def get_soup(url):
    try:
        response = requests.get(url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException:
        return None

# --- Feature Functions ---
def having_ip_address(url):
    try:
        domain = get_domain(url)
        if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", domain): return -1
        return 1
    except: return -1

def url_length(url):
    if len(url) < 54: return 1
    elif 54 <= len(url) <= 75: return 0
    else: return -1

def shortening_service(url):
    shortening_services = ["bit.ly", "goo.gl", "t.co", "tinyurl.com"]
    domain = get_domain(url)
    if domain in shortening_services: return -1
    return 1

def having_at_symbol(url):
    if "@" in url: return -1
    return 1

def double_slash_redirecting(url):
    if "//" in urllib.parse.urlparse(url).path: return -1
    return 1

def prefix_suffix(url):
    if "-" in get_domain(url): return -1
    return 1

def having_sub_domain(url):
    dots = get_domain(url).count('.')
    if dots == 2: return 0
    elif dots > 2: return -1
    return 1

def ssl_final_state(url):
    try:
        if url.startswith("https"): return 1
        return -1
    except: return -1

def domain_registration_length(url):
    try:
        domain = get_domain(url)
        w = whois.whois(domain)
        if w.expiration_date:
            exp_date = w.expiration_date[0] if isinstance(w.expiration_date, list) else w.expiration_date
            cre_date = w.creation_date[0] if isinstance(w.creation_date, list) else w.creation_date
            if exp_date and cre_date:
                if (exp_date - cre_date).days / 365 <= 1: return -1
        return 1
    except: return -1

def age_of_domain(url):
    try:
        domain = get_domain(url)
        w = whois.whois(domain)
        if w.creation_date:
            cre_date = w.creation_date[0] if isinstance(w.creation_date, list) else w.creation_date
            if (datetime.now() - cre_date).days < 180: return -1
        return 1
    except: return -1

def dns_record(url):
    try:
        dns.resolver.resolve(get_domain(url), 'A'); return 1
    except: return -1

def abnormal_url(url):
    try:
        domain = get_domain(url)
        w = whois.whois(domain)
        if domain.lower() not in str(w).lower():
            return -1
        return 1
    except:
        return -1

# Placeholders for complex features
def web_traffic(url): return 0
def page_rank(url): return 0
def google_index(url): return 1
def links_pointing_to_page(url): return 0
def statistical_report(url): return 1
def favicon(url): return 1
def port(url): return 1
def https_token(url): return 1
def request_url(url): return 1
def url_of_anchor(url): return 1
def links_in_tags(url): return 1
def sfh(url): return 1
def submitting_to_email(url): return 1
def redirect(url): return 1
def on_mouseover(url): return 1
def right_click(url): return 1
def popup_window(url): return 1
def iframe(url): return 1

# --- Prediction Wrapper Function ---
def predict_url(model, url):
    if dns_record(url) == -1:
        return "Unsafe (Phishing)", 1.0

    features = [
        having_ip_address(url), url_length(url), shortening_service(url),
        having_at_symbol(url), double_slash_redirecting(url), prefix_suffix(url),
        having_sub_domain(url), ssl_final_state(url), domain_registration_length(url),
        favicon(url), port(url), https_token(url), request_url(url), url_of_anchor(url),
        links_in_tags(url), sfh(url), submitting_to_email(url), abnormal_url(url),
        redirect(url), on_mouseover(url), right_click(url), popup_window(url),
        iframe(url), age_of_domain(url), dns_record(url), web_traffic(url),
        page_rank(url), google_index(url), links_pointing_to_page(url),
        statistical_report(url)
    ]
    
    try:
        prediction = model.predict([features])[0]
        label_map = {1: "Safe", 0: "Neutral", -1: "Unsafe (Phishing)"}
        label = label_map.get(prediction, "Unknown")
        return label, 0.0 # Confidence isn't used in this script, return dummy value
    except Exception as e:
        return f"Error", 0.0

# ==============================================================================
# === 3. MAIN EVALUATION LOGIC ===
# ==============================================================================
def evaluate_on_sample(model_path='phishing_gradient_boosting_model.joblib'):
    # Load model and dataset
    try:
        model = joblib.load(model_path)
    except FileNotFoundError:
        print(f"❌ Error: Model file '{model_path}' not found in the same directory as your notebook.")
        return # Use return instead of sys.exit in notebooks

    try:
        df_test = pd.read_csv(FILE_CONFIG["path"])
    except FileNotFoundError:
        print(f"❌ Error: Test dataset '{FILE_CONFIG['path']}' not found.")
        return

    # Take random sample
    if len(df_test) > SAMPLE_SIZE:
        df_sample = df_test.sample(n=SAMPLE_SIZE, random_state=42)
    else:
        df_sample = df_test
    
    print(f"✅ Starting evaluation on {len(df_sample)} random URLs...")
    
    true_labels = []
    mapped_predictions = []

    # Iterate, predict, and map
    for index, row in tqdm(df_sample.iterrows(), total=df_sample.shape[0]):
        url = row[FILE_CONFIG["url_column"]]
        true_label_text = row[FILE_CONFIG["label_column"]]
        
        model_label, _ = predict_url(model, url)
        
        # Apply the conservative mapping rule
        if model_label == "Safe":
            mapped_prediction_text = FILE_CONFIG["benign_label_value"]
        else: # Treat "Neutral" and "Unsafe" as malicious
            # Determine the opposite of the benign label
            malicious_label_value = "malicious" # A sensible default
            if true_label_text != FILE_CONFIG["benign_label_value"]:
                 malicious_label_value = true_label_text
            mapped_prediction_text = malicious_label_value
            
        mapped_predictions.append(mapped_prediction_text)
        true_labels.append(true_label_text)

    # Calculate and report accuracy
    from sklearn.metrics import accuracy_score, classification_report
    
    print("\n--- Model Performance Report on Sampled Dataset ---")
    accuracy = accuracy_score(true_labels, mapped_predictions)
    print(f"🎯 Overall Accuracy on {len(df_sample)} URLs: {accuracy:.2%}\n")
    
    print("📊 Classification Report:")
    print(classification_report(true_labels, mapped_predictions))

# --- Run the evaluation ---
evaluate_on_sample()

✅ Starting evaluation on 30 random URLs...


  0%|          | 0/30 [00:00<?, ?it/s]


--- Model Performance Report on Sampled Dataset ---
🎯 Overall Accuracy on 30 URLs: 60.00%

📊 Classification Report:
              precision    recall  f1-score   support

      benign       0.56      0.71      0.62        14
   malicious       0.67      0.50      0.57        16

    accuracy                           0.60        30
   macro avg       0.61      0.61      0.60        30
weighted avg       0.61      0.60      0.60        30



In [4]:
df1=pd.read_csv('dataset.csv')

In [5]:
df1.head()

Unnamed: 0,index,having_IPhaving_IP_Address,URLURL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,2,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [7]:
df1.columns

Index(['index', 'having_IPhaving_IP_Address', 'URLURL_Length',
       'Shortining_Service', 'having_At_Symbol', 'double_slash_redirecting',
       'Prefix_Suffix', 'having_Sub_Domain', 'SSLfinal_State',
       'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token',
       'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH',
       'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover',
       'RightClick', 'popUpWidnow', 'Iframe', 'age_of_domain', 'DNSRecord',
       'web_traffic', 'Page_Rank', 'Google_Index', 'Links_pointing_to_page',
       'Statistical_report', 'Result'],
      dtype='object')