In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import joblib
import gradio as gr
from google.colab import files, drive
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive so trained artifacts can be persisted there; when the
# mount fails the notebook falls back to saving files locally.
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully")
    drive_mounted = True
except Exception:
    # `except Exception` (not a bare `except`) so that KeyboardInterrupt and
    # SystemExit still stop the cell.
    print("Google Drive mount failed, will save files locally")
    drive_mounted = False

# Load the dataset: prefer an interactive upload, otherwise fall back to a
# CSV that already exists in the working directory.
try:
    uploaded = files.upload()
    if not uploaded:
        # Nothing was uploaded; take the fallback path below.
        raise ValueError("No file was uploaded")
    print("File uploaded successfully")
    df = pd.read_csv(list(uploaded.keys())[0])
except Exception:
    print("Trying to read existing file...")
    try:
        df = pd.read_csv("phishing websites.csv")
    except Exception as e:
        print(f"Error reading dataset: {e}")
        print("Please upload 'phishing websites.csv' file")
        raise

print(f"Dataset loaded with shape: {df.shape}")

# Check class distribution
print("Target class distribution:")
print(df['status'].value_counts())

# The target must contain exactly the two labels that are later mapped to
# 0/1; any other label (or a missing one) cannot be used for training.
if set(df['status'].unique()) != {'legitimate', 'phishing'}:
    raise ValueError("Dataset must have exactly two classes: 'legitimate' and 'phishing'")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully


In [None]:


# Feature engineering: every column except the raw URL and the label is used
# as a model input.
X = df.drop(columns=['url', 'status'])
y = df['status'].map({'legitimate': 0, 'phishing': 1})

# `map` turns any label outside the dictionary into NaN. Fail here with a
# clear message instead of letting the split/fit below fail obscurely.
if y.isna().any():
    unexpected_labels = sorted(df.loc[y.isna(), 'status'].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'status' column: {unexpected_labels}; "
        "expected only 'legitimate' and 'phishing'"
    )

# Split the data (stratified so both splits keep the class proportions)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scale features. The scaler is fitted on the training split only and is
# reused by predict_url() at inference time, so it is saved with the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training Random Forest model...")
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train_scaled, y_train)

# Evaluate on training data (compare with the test accuracy to spot overfitting)
train_pred = model.predict(X_train_scaled)
train_acc = accuracy_score(y_train, train_pred)
print(f"Training accuracy: {train_acc:.4f}")

# Evaluate on test data
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Test accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)
print("\nConfusion Matrix:")
print(conf_matrix)

# Get feature importance, most important first
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 important features:")
print(feature_importance.head(10))

# Save the model and scaler: to Google Drive when it is mounted, otherwise to
# the current working directory.
if drive_mounted:
    model_path = '/content/drive/MyDrive/phishing_detection_model.joblib'
    scaler_path = '/content/drive/MyDrive/phishing_detection_scaler.joblib'
else:
    model_path = 'phishing_detection_model.joblib'
    scaler_path = 'phishing_detection_scaler.joblib'

joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)
print(f"Model saved to {model_path}")
print(f"Scaler saved to {scaler_path}")

def extract_features(url, feature_columns=None):
    """Build a one-row feature frame for ``url`` from its lexical properties.

    Args:
        url: The URL string to analyse.
        feature_columns: Ordered column names of the returned frame. Defaults
            to ``X.columns`` (the columns the model was trained on).

    Returns:
        A single-row ``pandas.DataFrame`` whose columns are exactly
        ``feature_columns``. Columns this function does not compute are 0,
        and computed features that are not in ``feature_columns`` are dropped.

    Extraction is best-effort: if an exception occurs it is printed and the
    features computed so far are returned (the rest stay 0).

    NOTE(review): any training column that is not derived here is fed to the
    model as 0 — confirm this is acceptable for the columns in the dataset.
    """
    if feature_columns is None:
        feature_columns = X.columns

    # Initialize all features to be used
    features = {col: 0 for col in feature_columns}

    try:
        # Basic URL features
        features['length_url'] = len(url)

        # Strip the scheme. maxsplit=1 keeps any later '//' inside the
        # remainder; splitting on every '//' would discard the rest of the
        # path and make the '//'-in-path check below impossible to satisfy.
        rest_url = url.split('//', 1)[1] if '//' in url else url

        # Everything before the first '/' is the host part, the rest the path
        domain, _, path = rest_url.partition('/')

        features['length_hostname'] = len(domain)

        # IP-based features: every non-empty dot-separated label is numeric.
        # An empty host has no labels and must not be counted as an IP.
        hostname_parts = [part for part in domain.split('.') if part]
        is_ip = bool(hostname_parts) and all(part.isdigit() for part in hostname_parts)
        features['ip'] = 1 if is_ip else 0

        # Character count features (occurrences in the full URL)
        symbol_features = {
            'nb_dots': '.',
            'nb_hyphens': '-',
            'nb_at': '@',
            'nb_qm': '?',
            'nb_and': '&',
            'nb_eq': '=',
            'nb_underscore': '_',
            'nb_tilde': '~',
            'nb_percent': '%',
            'nb_slash': '/',
            'nb_star': '*',
            'nb_colon': ':',
            'nb_comma': ',',
            'nb_semicolumn': ';',
            'nb_dollar': '$',
            'nb_space': ' ',
            'nb_dslash': '//',
        }
        for feature_name, symbol in symbol_features.items():
            features[feature_name] = url.count(symbol)

        url_lower = url.lower()
        features['nb_www'] = url_lower.count('www')
        features['nb_com'] = url_lower.count('.com')

        # Path features: a '//' inside the path (empty path -> 0)
        features['http_in_path'] = 1 if '//' in path else 0

        features['https_token'] = 1 if 'https' in url_lower else 0

        # Ratio features
        features['ratio_digits_url'] = sum(c.isdigit() for c in url) / len(url) if len(url) > 0 else 0

        # Subdomain features: number of dots in the host part
        features['nb_subdomains'] = domain.count('.')

    except Exception as e:
        print(f"Error extracting features: {e}")

    # Create DataFrame with features
    df_features = pd.DataFrame([features])

    # Ensure exactly the required columns are present, in the required order
    df_features = df_features.reindex(columns=feature_columns, fill_value=0)

    return df_features

def predict_url(url, threshold=0.7):
    """Classify ``url`` as phishing or legitimate with the trained model.

    Args:
        url: The URL string to classify.
        threshold: Minimum phishing probability (exclusive) required to
            report "Phishing". Defaults to 0.7, deliberately higher than 0.5
            so a URL is flagged only when the model is fairly sure.

    Returns:
        A ``(result, confidence)`` tuple. ``result`` is "Phishing" or
        "Legitimate"; ``confidence`` is the probability of the reported class
        as a float. Because of the raised threshold, a "Legitimate" result
        can carry a confidence below 0.5.
    """
    # Extract features
    features = extract_features(url)

    # Scale features with the scaler fitted on the training data
    features_scaled = scaler.transform(features)

    # Class probabilities; index 0 is read as legitimate and index 1 as
    # phishing, matching the 0/1 label mapping used for training.
    probabilities = model.predict_proba(features_scaled)[0]
    legitimate_prob = probabilities[0]
    phishing_prob = probabilities[1]

    # Determine result
    if phishing_prob > threshold:
        result = "Phishing"
        confidence = phishing_prob
    else:
        result = "Legitimate"
        confidence = legitimate_prob

    # Print diagnostic information
    print(f"URL: {url}")
    print(f"Features: {features.iloc[0].to_dict()}")
    print(f"Probabilities: Legitimate={legitimate_prob:.4f}, Phishing={phishing_prob:.4f}")

    return result, float(confidence)

def analyze_well_known_sites():
    """Run the classifier on a few well-known legitimate domains.

    Each domain is prefixed with ``https://``, passed to ``predict_url`` and
    its verdict and confidence are printed. Afterwards all warning filters
    are reset, which also removes the module-level 'ignore' filter.
    """
    trusted_hosts = (
        "google.com",
        "facebook.com",
        "amazon.com",
        "microsoft.com",
        "apple.com",
    )

    print("\nTesting well-known legitimate sites:")
    for host in trusted_hosts:
        verdict, score = predict_url("https://" + host)
        percent = score * 100
        print(f"{host}: {verdict} (Confidence: {percent:.2f}%)")

    # From here on warnings are shown again
    warnings.resetwarnings()

# Smoke-test the trained model on known legitimate sites; the verdicts are
# printed for manual review (nothing is asserted).
analyze_well_known_sites()

def gradio_interface(url):
    """Gradio callback: classify ``url`` and return a one-line verdict.

    Args:
        url: Text entered by the user; may be empty or ``None``.

    Returns:
        A human-readable string: a prompt when the input is blank, the
        verdict with its confidence on success, or an error message if the
        analysis raised an exception.
    """
    if not url or not url.strip():
        return "Please enter a valid URL"

    # Drop surrounding whitespace so it cannot end up inside the URL
    url = url.strip()

    # Add a scheme unless one is really present. A bare 'http' prefix check
    # would wrongly treat hosts such as 'httpbin.org' as already having one.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url

    try:
        result, confidence = predict_url(url)

        if result == "Legitimate":
            return f"✅ {result} website (Confidence: {confidence*100:.2f}%)"
        else:
            return f"⚠️ {result} website (Confidence: {confidence*100:.2f}%)"
    except Exception as e:
        # Report the failure in the UI instead of crashing the callback
        return f"Error analyzing URL: {str(e)}"

# Build the web UI: one text box in, one text box out, with clickable examples.
print("Creating Gradio interface...")
url_box = gr.Textbox(placeholder="Enter URL to check (e.g., example.com)")
verdict_box = gr.Textbox()
sample_urls = [
    [candidate]
    for candidate in ("google.com", "facebook.com", "paypal-secure-login.com")
]
demo = gr.Interface(
    fn=gradio_interface,
    inputs=url_box,
    outputs=verdict_box,
    title="Phishing Website Detection",
    description="Enter a URL to check if it's a legitimate website or a phishing attempt.",
    examples=sample_urls,
)

# Launch with a public share link when this code runs as the main module
if __name__ == "__main__":
    demo.launch(share=True)
    print("Gradio interface launched. Click on the public URL to access it.")

Training Random Forest model...
Model Accuracy: 0.9679

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1732
           1       0.97      0.96      0.97      1697

    accuracy                           0.97      3429
   macro avg       0.97      0.97      0.97      3429
weighted avg       0.97      0.97      0.97      3429

Model saved to /content/drive/MyDrive/phishing_detection_model.joblib
Scaler saved to /content/drive/MyDrive/phishing_detection_scaler.joblib
Creating Gradio interface...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4e49dbdfc4c0ee9314.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Gradio interface launched. Click on the public URL to access it.
