<a href="https://colab.research.google.com/github/lavanya5454/phishingwebsite/blob/main/phishing_website_frontend_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle

# ============================================
# LOAD SAVED MALICIOUS URL DETECTOR MODEL
# ============================================

# Deserialize the saved detector bundle from the working directory.
# NOTE: pickle runs code while loading -- only open a bundle you trust.
with open('malicious_url_detector.pkl', 'rb') as f:
    model_package = pickle.load(f)

# Pull the individual components out of the bundle in one pass.
(
    model,
    tfidf_vectorizer,
    label_encoder,
    feature_cols,
    KNOWN_LEGITIMATE_DOMAINS,
) = (
    model_package[component]
    for component in ('model', 'tfidf_vectorizer', 'label_encoder', 'feature_cols', 'whitelist')
)

# Report the metadata stored alongside the model.
print("‚úÖ Model and components successfully loaded!")
print(f"Model version: {model_package['version']}")
print(f"Training accuracy: {model_package['training_accuracy']*100:.2f}%")
print(f"Training date: {model_package['training_date']}")

‚úÖ Model and components successfully loaded!
Model version: 1.0
Training accuracy: 98.48%
Training date: 2025-11-06


In [None]:
# Install the third-party packages used by this notebook into the current
# runtime. No versions are pinned, so the newest releases are installed.
!pip install streamlit pyngrok lightgbm xgboost wordcloud scikit-learn pandas numpy seaborn matplotlib




In [None]:
"""
COMPLETE MALICIOUS URL DETECTOR - STREAMLIT APP FOR GOOGLE COLAB
Run with ngrok tunnel - WITH DASHBOARD
"""

# ============================================
# STEP 1: INSTALL PACKAGES
# ============================================

!pip install -q streamlit pyngrok plotly lightgbm scikit-learn

# ============================================
# STEP 2: SETUP NGROK
# ============================================

# Get your free token from: https://dashboard.ngrok.com/get-started/your-authtoken
NGROK_TOKEN = "358iAUThEHTDr2fTWujUnTCrLOC_2aSVWBLTc2yJJ8A6N2WSt"  # ‚ö†Ô∏è REPLACE THIS!

!ngrok authtoken {NGROK_TOKEN}

# ============================================
# STEP 3: CREATE STREAMLIT APP (Using Python)
# ============================================

app_code = '''import streamlit as st
import pandas as pd
import numpy as np
import pickle
import re
from scipy.sparse import hstack, csr_matrix
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime

# Page Config: browser-tab title and icon, wide layout, sidebar open on load.
st.set_page_config(
    page_title="üõ°Ô∏è URL Safety Checker",
    page_icon="üõ°Ô∏è",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS: defines the .main-header, .safe-box, .danger-box, .metric-card
# and .stat-number classes referenced by the HTML snippets that main() renders,
# and restyles Streamlit buttons. Injected as raw HTML.
st.markdown("""
<style>
    .main-header {
        font-size: 3.5rem;
        font-weight: 800;
        text-align: center;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        padding: 2rem 0 1rem 0;
    }
    .safe-box {
        background: linear-gradient(135deg, #d4edda 0%, #c3e6cb 100%);
        border-left: 6px solid #28a745;
        padding: 1.5rem;
        border-radius: 10px;
        margin: 1.5rem 0;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    }
    .danger-box {
        background: linear-gradient(135deg, #f8d7da 0%, #f5c6cb 100%);
        border-left: 6px solid #dc3545;
        padding: 1.5rem;
        border-radius: 10px;
        margin: 1.5rem 0;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    }
    .stButton>button {
        width: 100%;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        font-weight: bold;
        padding: 0.75rem;
        border-radius: 8px;
    }
    .metric-card {
        background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
        padding: 1.5rem;
        border-radius: 12px;
        box-shadow: 0 4px 12px rgba(0,0,0,0.1);
        text-align: center;
        margin: 1rem 0;
    }
    .stat-number {
        font-size: 2.5rem;
        font-weight: bold;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
    }
</style>
""", unsafe_allow_html=True)

# Initialize session state for scan history: a list that add_to_history()
# appends one dict per scan to, created only if the session has none yet.
if "scan_history" not in st.session_state:
    st.session_state.scan_history = []

# Helper Functions
@st.cache_resource
def _read_model_package(path):
    """Unpickle and return the model bundle stored at path.

    Cached with st.cache_resource so the bundle is read once and the same
    object is reused on later reruns, instead of being copied on every call.
    Only successful loads are cached: an exception propagates to the caller,
    so a missing file is looked for again on the next run.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use a model
    # file that comes from a trusted source.
    with open(path, "rb") as f:
        return pickle.load(f)


def load_model():
    """Return the saved model bundle, or None when the file is missing.

    Returns:
        The object stored in malicious_url_detector.pkl, or None after showing
        an error message when that file does not exist in the working directory.
    """
    try:
        return _read_model_package("malicious_url_detector.pkl")
    except FileNotFoundError:
        # Handled outside the cached function so that the failure is not
        # remembered once the file has been uploaded.
        st.error("‚ö†Ô∏è Model file not found! Upload malicious_url_detector.pkl")
        return None

def count_special_chars(url):
    """Return how many characters of url are not ASCII letters or digits.

    The argument is converted with str() first, so any object is accepted.
    """
    text = str(url)
    # An ASCII character that is alphanumeric is exactly one of a-z, A-Z, 0-9.
    return sum(1 for ch in text if not (ch.isascii() and ch.isalnum()))

def calculate_entropy(url):
    """Return the Shannon entropy (in bits) of the characters of url.

    Only characters with a code point below 256 contribute terms, while the
    probabilities are taken relative to the full length of the string.
    Returns 0 for an empty string.
    """
    text = str(url)
    total = len(text)
    if total == 0:
        return 0

    # Tally the characters that fall in the 0-255 code-point range.
    tally = {}
    for ch in text:
        if ord(ch) < 256:
            tally[ch] = tally.get(ch, 0) + 1

    # Accumulate in ascending code-point order.
    result = 0
    for ch in sorted(tally):
        share = float(tally[ch]) / total
        result += (-share) * np.log2(share)
    return result

def having_ip_address(url):
    """Return 1 when url contains a dotted-quad IPv4 address, otherwise 0.

    Args:
        url: the URL text to inspect. It must be a str; other types raise
            TypeError from re.search.

    Returns:
        1 if four dot-separated octets in the range 0-255 occur anywhere in
        url, else 0.
    """
    # The pattern uses bracket classes ([0-9], [.]) rather than backslash
    # escapes on purpose. This source is embedded in a non-raw triple-quoted
    # string before being written to app.py, and the previous doubled escapes
    # arrived there as a literal backslash followed by "d", so genuine
    # addresses such as 192.168.1.1 were never detected.
    match = re.search(
        r"(([01]?[0-9][0-9]?|2[0-4][0-9]|25[0-5])[.]){3}([01]?[0-9][0-9]?|2[0-4][0-9]|25[0-5])", url)
    return 1 if match else 0

def has_suspicious_tld(url):
    """Return 1 if url contains one of the flagged TLD strings, otherwise 0.

    This is a case-insensitive substring test over the whole URL text, so a
    match anywhere (not only at the end of the host) counts.
    """
    lowered = str(url).lower()
    for suffix in (".tk", ".ml", ".ga", ".cf", ".gq"):
        if suffix in lowered:
            return 1
    return 0

def has_shortening_service(url):
    """Return 1 if url mentions a known link-shortening service, otherwise 0.

    The check is a case-insensitive substring test over the whole URL text.
    """
    lowered = str(url).lower()
    for service in ("bit.ly", "goo.gl", "tinyurl"):
        if service in lowered:
            return 1
    return 0

def preprocess_url(url):
    """Normalize url for vectorization and domain extraction.

    The value is converted to a lower-case string, every "http://" or
    "https://" occurrence is removed, and a leading "www." is dropped.

    Args:
        url: the URL to normalize (any object; str() is applied).

    Returns:
        The normalized URL text.
    """
    url = str(url).lower()
    url = re.sub(r"https?://", "", url)
    # "[.]" is used instead of a backslash-escaped dot: this source passes
    # through a non-raw triple-quoted string before reaching app.py, and the
    # previous doubled escape ended up requiring a literal backslash after
    # "www", so the prefix was never removed.
    url = re.sub(r"^www[.]", "", url)
    return url

def extract_domain(url):
    """Return the last two dot-separated labels of the host part of url.

    The URL is normalized with preprocess_url(), then reduced to its host:
    everything from the first "/", "?" or "#" is discarded, any "user@"
    prefix is removed, and a trailing ":port" is stripped.

    Cutting only at "/" is not enough for a whitelist lookup: in a URL such
    as "evil.com?x=.google.com" the query text would be treated as part of
    the host and the result would be "google.com".

    Args:
        url: the URL to inspect (any object accepted by preprocess_url).

    Returns:
        "label.tld" for hosts with at least two labels, otherwise the host
        text itself.
    """
    host = preprocess_url(url)

    # Keep only the authority: stop at the first path, query or fragment mark.
    for delimiter in ("/", "?", "#"):
        host = host.split(delimiter, 1)[0]

    # Drop credentials placed before the host ("user:pass@host").
    host = host.rsplit("@", 1)[-1]

    # Drop a numeric port, leaving other colon-containing hosts untouched.
    name, colon, port = host.rpartition(":")
    if colon and port.isdigit():
        host = name

    labels = host.split(".")
    return ".".join(labels[-2:]) if len(labels) >= 2 else host

# Registrable domains that analyze_url_security() reports as SAFE without
# consulting the model. Entries are compared against extract_domain() output,
# i.e. lower-case "label.tld" strings.
# NOTE(review): the pickled bundle loaded earlier in this notebook also carries
# a 'whitelist' entry; this hard-coded set is independent of it -- confirm
# which one is meant to be authoritative.
KNOWN_LEGITIMATE_DOMAINS = {
    "google.com", "youtube.com", "facebook.com", "github.com", "amazon.com",
    "paypal.com", "netflix.com", "microsoft.com", "apple.com", "twitter.com",
    "linkedin.com", "instagram.com", "reddit.com", "wikipedia.org", "stackoverflow.com"
}

# Lower-case words counted by analyze_url_security(); two or more hits combined
# with a suspicious TLD produce a HIGH_RISK verdict.
PHISHING_KEYWORDS = ["verify", "confirm", "update", "secure", "login", "suspended"]

def analyze_url_security(url):
    """Apply the rule-based checks to url and return a verdict, if any.

    Rules are tried in order: whitelisted domain, suspicious TLD combined
    with at least two phishing keywords, then an embedded IP address.

    Returns:
        A dict with "risk_level", "reason" and "confidence" keys for the
        first rule that fires, or None when no rule applies.
    """
    if extract_domain(url) in KNOWN_LEGITIMATE_DOMAINS:
        return {"risk_level": "SAFE", "reason": "‚úÖ Whitelisted domain", "confidence": 0.99}

    if has_suspicious_tld(url):
        lowered = url.lower()
        keyword_hits = len([word for word in PHISHING_KEYWORDS if word in lowered])
        if keyword_hits >= 2:
            return {"risk_level": "HIGH_RISK", "reason": "‚ö†Ô∏è Suspicious TLD + keywords", "confidence": 0.85}

    if having_ip_address(url):
        return {"risk_level": "MEDIUM_RISK", "reason": "‚ö†Ô∏è IP address URL", "confidence": 0.75}

    return None

def predict_url(url, model_data):
    """Classify url using the rule-based checks first and the model otherwise.

    Args:
        url: the URL to classify.
        model_data: bundle with "model", "tfidf_vectorizer", "label_encoder"
            and "feature_cols" entries.

    Returns:
        A (label, probabilities, reason) tuple. For a SAFE or HIGH_RISK rule
        verdict the probability vector is zero except for the reported class;
        any other outcome falls through to the model's prediction.
    """
    model = model_data["model"]
    tfidf_vectorizer = model_data["tfidf_vectorizer"]
    label_encoder = model_data["label_encoder"]
    feature_cols = model_data["feature_cols"]

    # Rule verdicts that short-circuit the model: risk level -> (label, score).
    rule_outcomes = {"SAFE": ("benign", 0.99), "HIGH_RISK": ("phishing", 0.85)}

    verdict = analyze_url_security(url)
    if verdict and verdict["risk_level"] in rule_outcomes:
        label, score = rule_outcomes[verdict["risk_level"]]
        class_names = list(label_encoder.classes_)
        rule_probs = np.zeros(len(class_names))
        rule_probs[class_names.index(label)] = score
        return label, rule_probs, verdict["reason"]

    # Hand-crafted lexical features, keyed by the names used in feature_cols.
    text = str(url)
    digit_count = sum(ch.isdigit() for ch in text)
    features = {
        "url_length": len(text),
        "num_dots": text.count("."),
        "num_hyphens": text.count("-"),
        "num_underscores": text.count("_"),
        "num_slashes": text.count("/"),
        "num_questions": text.count("?"),
        "num_equals": text.count("="),
        "num_at": text.count("@"),
        "num_ampersands": text.count("&"),
        "num_digits": digit_count,
        "digit_ratio": digit_count / max(len(text), 1),
        "num_special_chars": count_special_chars(url),
        "entropy": calculate_entropy(url),
        "use_of_ip": having_ip_address(url),
        "is_https": 1 if "https" in text.lower() else 0,
        "suspicious_tld": has_suspicious_tld(url),
        "has_shortening": has_shortening_service(url)
    }

    # One-row design matrix: manual features followed by the TF-IDF columns.
    manual_sparse = csr_matrix([[features[name] for name in feature_cols]])
    tfidf_sparse = tfidf_vectorizer.transform([preprocess_url(url)])
    X_single = hstack([manual_sparse, tfidf_sparse])

    pred_encoded = model.predict(X_single)[0]
    pred_label = label_encoder.inverse_transform([pred_encoded])[0]
    probs = model.predict_proba(X_single)[0]

    return pred_label, probs, "ü§ñ ML prediction"

def add_to_history(url, prediction, confidence, is_safe):
    """Record one scan in the session history, keeping at most 100 entries."""
    history = st.session_state.scan_history
    history.append(dict(
        timestamp=datetime.now(),
        url=url,
        prediction=prediction,
        confidence=confidence,
        is_safe=is_safe,
    ))
    # Discard the oldest entry once the cap of 100 is exceeded.
    if len(history) > 100:
        del history[0]

# Main App
def main():
    """Render the app: header, sidebar, then the page selected in the sidebar.

    Stops the script run when the model bundle cannot be loaded.
    """
    # Single-quoted HTML attributes keep these literals free of escape
    # sequences, so they read the same in this template and in app.py.
    st.markdown("<h1 class='main-header'>üõ°Ô∏è Phishing Website Detection</h1>", unsafe_allow_html=True)
    st.markdown("<p style='text-align:center;color:#666;font-size:1.2rem;'>AI-Powered Phishing Website Detection </p>", unsafe_allow_html=True)

    model_data = load_model()
    if model_data is None:
        st.stop()

    page = _render_sidebar(model_data)

    if page == PAGE_DASHBOARD:
        _render_dashboard()
    elif page == PAGE_URL_CHECKER:
        _render_url_checker(model_data)
    elif page == PAGE_BATCH:
        _render_batch_analysis(model_data)
    else:
        _render_about()


# Sidebar navigation labels, defined once so the radio options and the page
# dispatch in main() cannot drift apart.
PAGE_DASHBOARD = "üìä Dashboard"
PAGE_URL_CHECKER = "üè† URL Checker"
PAGE_BATCH = "üìà Batch Analysis"
PAGE_ABOUT = "‚ÑπÔ∏è About"


def _render_sidebar(model_data):
    """Draw navigation, model info and quick-test buttons; return the chosen page label."""
    with st.sidebar:
        st.markdown("### üîç Navigation")
        # A real (hidden) label replaces the empty string that was passed before.
        page = st.radio(
            "Navigation",
            [PAGE_DASHBOARD, PAGE_URL_CHECKER, PAGE_BATCH, PAGE_ABOUT],
            label_visibility="collapsed",
        )

        st.markdown("---")
        st.markdown("### üìà Model Info")
        st.metric("Accuracy", f"{model_data.get('training_accuracy', 0.96)*100:.1f}%")
        st.metric("Features", "5,017")
        st.metric("Scans Today", len(st.session_state.scan_history))

        st.markdown("---")
        st.markdown("### üß™ Quick Tests")
        # Each button pre-fills the URL Checker input via session state.
        if st.button("‚úÖ Safe URL", use_container_width=True):
            st.session_state.test_url = "https://www.google.com"
        if st.button("‚ö†Ô∏è Phishing", use_container_width=True):
            st.session_state.test_url = "http://paypal-verify.tk"
        if st.button("üîí IP Address", use_container_width=True):
            st.session_state.test_url = "http://192.168.1.1/admin"
    return page


def _metric_card(column, value, label, color=None):
    """Render one statistic card in column; color optionally tints the number."""
    style = f' style="color:{color};"' if color else ""
    with column:
        st.markdown(f"""
        <div class="metric-card">
            <div class="stat-number"{style}>{value}</div>
            <div style="color:#666;">{label}</div>
        </div>
        """, unsafe_allow_html=True)


def _render_dashboard():
    """Show summary statistics, charts and the recent-scan table for this session."""
    st.markdown("## üìä Security Dashboard")
    st.markdown("Real-time analytics and scan history")

    # Overview Stats
    st.markdown("### üìà Overview Statistics")

    history = st.session_state.scan_history
    total_scans = len(history)
    safe_scans = sum(1 for s in history if s["is_safe"])
    malicious_scans = total_scans - safe_scans
    avg_confidence = np.mean([s["confidence"] for s in history]) * 100 if total_scans > 0 else 0

    col1, col2, col3, col4 = st.columns(4)
    _metric_card(col1, total_scans, "Total Scans")
    _metric_card(col2, safe_scans, "Safe URLs", color="#28a745")
    _metric_card(col3, malicious_scans, "Malicious URLs", color="#dc3545")
    _metric_card(col4, f"{avg_confidence:.1f}%", "Avg Confidence")

    if total_scans == 0:
        st.info("üìä No scans yet. Start analyzing URLs to see statistics here!")
        st.markdown("### üöÄ Get Started")
        st.markdown("Click on **URL Checker** or **Batch Analysis** to start scanning URLs.")
        return

    st.markdown("---")

    # Charts Row 1
    col1, col2 = st.columns(2)

    with col1:
        st.markdown("### ü•ß Classification Distribution")

        # Count predictions
        pred_counts = {}
        for scan in history:
            pred = scan["prediction"]
            pred_counts[pred] = pred_counts.get(pred, 0) + 1

        fig = px.pie(
            values=list(pred_counts.values()),
            names=list(pred_counts.keys()),
            title="URL Classifications",
            color_discrete_sequence=px.colors.qualitative.Set3
        )
        fig.update_layout(height=350)
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        st.markdown("### üìä Safety Overview")

        # total_scans is known to be positive here, so the division is safe.
        safety_data = pd.DataFrame({
            "Category": ["Safe", "Malicious"],
            "Count": [safe_scans, malicious_scans],
            "Percentage": [
                (safe_scans / total_scans) * 100,
                (malicious_scans / total_scans) * 100
            ]
        })

        fig = go.Figure()
        fig.add_trace(go.Bar(
            x=safety_data["Category"],
            y=safety_data["Count"],
            marker_color=["#28a745", "#dc3545"],
            text=safety_data["Percentage"].round(1),
            texttemplate="%{text}%",
            textposition="outside"
        ))
        fig.update_layout(
            title="Safe vs Malicious URLs",
            height=350,
            showlegend=False
        )
        st.plotly_chart(fig, use_container_width=True)

    st.markdown("---")

    # Charts Row 2
    col1, col2 = st.columns(2)

    with col1:
        st.markdown("### üìà Scans Over Time")

        # Group by minute of day (HH:MM); the column keeps its "hour" name.
        df_history = pd.DataFrame(history)
        df_history["hour"] = df_history["timestamp"].dt.strftime("%H:%M")
        scans_timeline = df_history.groupby("hour").size().reset_index(name="count")

        fig = px.line(
            scans_timeline,
            x="hour",
            y="count",
            title="Scan Activity Timeline",
            markers=True
        )
        fig.update_layout(height=300, xaxis_title="Time", yaxis_title="Number of Scans")
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        st.markdown("### üéØ Confidence Distribution")

        confidences = [s["confidence"] * 100 for s in history]

        fig = go.Figure()
        fig.add_trace(go.Histogram(
            x=confidences,
            nbinsx=20,
            marker_color="#667eea"
        ))
        fig.update_layout(
            title="Prediction Confidence Levels",
            height=300,
            xaxis_title="Confidence (%)",
            yaxis_title="Frequency"
        )
        st.plotly_chart(fig, use_container_width=True)

    st.markdown("---")

    # Recent Scans Table
    st.markdown("### üìã Recent Scan History")

    recent_scans = history[-20:][::-1]  # Last 20, newest first

    rows = []
    for s in recent_scans:
        # Batch uploads may contain non-string cells; show them as text.
        shown_url = str(s["url"])
        rows.append({
            "Time": s["timestamp"].strftime("%H:%M:%S"),
            "URL": shown_url[:50] + "..." if len(shown_url) > 50 else shown_url,
            "Prediction": s["prediction"].upper(),
            "Confidence": f"{s['confidence']*100:.1f}%",
            "Status": "‚úÖ Safe" if s["is_safe"] else "‚ö†Ô∏è Malicious"
        })
    df_recent = pd.DataFrame(rows)

    st.dataframe(df_recent, use_container_width=True, hide_index=True)

    # Clear history button
    col1, col2, col3 = st.columns([1, 1, 1])
    with col2:
        if st.button("üóëÔ∏è Clear History", type="secondary", use_container_width=True):
            st.session_state.scan_history = []
            st.rerun()


def _render_url_checker(model_data):
    """Classify one URL entered by the user and display the verdict."""
    st.markdown("## üîé Single URL Analysis")

    url_input = st.text_input("Enter URL:", st.session_state.get("test_url", ""), placeholder="https://example.com")

    if not st.button("üîç Analyze URL", type="primary"):
        return

    # Surrounding whitespace from copy/paste is not part of the URL.
    url_input = url_input.strip()
    if not url_input:
        st.warning("Please enter a URL")
        return

    with st.spinner("Analyzing..."):
        try:
            prediction, probabilities, reason = predict_url(url_input, model_data)
            is_safe = prediction == "benign"
            confidence = float(max(probabilities))

            # Add to history
            add_to_history(url_input, prediction, confidence, is_safe)

            if is_safe:
                st.markdown(f"""
                <div class="safe-box">
                    <h2>‚úÖ SAFE URL</h2>
                    <p>This URL appears legitimate and safe.</p>
                </div>
                """, unsafe_allow_html=True)
            else:
                st.markdown(f"""
                <div class="danger-box">
                    <h2>‚ö†Ô∏è DANGEROUS - {prediction.upper()}</h2>
                    <p>Do not visit this URL!</p>
                </div>
                """, unsafe_allow_html=True)

            col1, col2, col3 = st.columns(3)
            col1.metric("Classification", prediction.upper())
            col2.metric("Confidence", f"{confidence*100:.1f}%")
            col3.metric("Domain", extract_domain(url_input))

            st.info(reason)

            prob_df = pd.DataFrame({
                "Class": model_data["label_encoder"].classes_,
                "Probability": probabilities * 100
            }).sort_values("Probability", ascending=True)

            fig = go.Figure(go.Bar(
                x=prob_df["Probability"],
                y=prob_df["Class"],
                orientation="h",
                marker_color="#667eea"
            ))
            fig.update_layout(height=250, margin=dict(l=0,r=0,t=20,b=0), title="Probability Distribution")
            st.plotly_chart(fig, use_container_width=True)

        except Exception as e:
            st.error(f"Error: {str(e)}")


def _render_batch_analysis(model_data):
    """Classify every URL of an uploaded CSV and offer the results for download."""
    st.markdown("## üìà Batch URL Analysis")
    uploaded = st.file_uploader("Upload CSV with 'url' column", type=["csv"])

    if not uploaded:
        return

    try:
        df = pd.read_csv(uploaded)
    except Exception as exc:
        # Empty or malformed uploads are reported instead of crashing the page.
        st.error(f"Could not read the CSV file: {exc}")
        return

    if "url" not in df.columns:
        st.error("CSV must have 'url' column")
        return

    st.success(f"Loaded {len(df)} URLs")

    if not st.button("Analyze All", type="primary"):
        return

    if df.empty:
        st.warning("The uploaded CSV contains no rows to analyze")
        return

    progress = st.progress(0)
    results = []
    failures = []

    for idx, url in enumerate(df["url"]):
        try:
            pred, probs, reason = predict_url(url, model_data)
            confidence = float(max(probs))
            is_safe = pred == "benign"

            # Add to history
            add_to_history(url, pred, confidence, is_safe)

            results.append({
                "URL": url,
                "Prediction": pred,
                "Confidence": f"{confidence*100:.1f}%",
                "Safe": is_safe
            })
        except Exception as exc:
            # Keep going, but remember why the row failed so it can be reported.
            failures.append(f"{url}: {exc}")
            results.append({"URL": url, "Prediction": "error", "Confidence": "N/A", "Safe": False})
        progress.progress((idx + 1) / len(df))

    results_df = pd.DataFrame(results)

    # Plain ints for st.metric; rows that failed are not counted as malicious.
    safe_count = int(results_df["Safe"].sum())
    error_count = len(failures)
    malicious_count = len(results_df) - safe_count - error_count

    col1, col2, col3 = st.columns(3)
    col1.metric("Safe URLs", safe_count)
    col2.metric("Malicious", malicious_count)
    col3.metric("Total Scanned", len(results_df))

    if failures:
        st.warning(f"{error_count} URL(s) could not be analyzed and are marked 'error'. First failure: {failures[0]}")

    st.dataframe(results_df, use_container_width=True)

    csv = results_df.to_csv(index=False)
    st.download_button("üì• Download Results", csv, f"results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")


def _render_about():
    """Show the static description of the tool."""
    st.markdown("## ‚ÑπÔ∏è About")
    st.markdown("""
    ### üéØ URL Safety Checker

    AI-powered malicious URL detection using:
    - ‚úÖ Whitelist checking (15+ trusted domains)
    - üîç Rule-based analysis (typosquatting, suspicious TLDs)
    - ü§ñ Machine Learning (96%+ accuracy with LightGBM)

    **Categories Detected:**
    - üé£ **Phishing**: Fake sites stealing credentials
    - ü¶† **Malware**: Sites distributing malicious software
    - üö´ **Defacement**: Compromised/hacked websites
    - ‚úÖ **Benign**: Safe, legitimate websites

    ### üìä Model Performance
    - **Accuracy**: 96.56%
    - **Features**: 5,017 (17 manual + 5,000 TF-IDF)
    - **Algorithm**: LightGBM with class balancing
    - **Training Data**: 100,000 labeled URLs

    ### üõ°Ô∏è Security Tips
    - Always verify URLs before clicking
    - Look for HTTPS and valid certificates
    - Avoid suspicious TLDs (.tk, .ml, .ga, .cf, .gq)
    - Never enter passwords on unfamiliar sites

    **Stay safe online!**
    """)

if __name__ == "__main__":
    main()
'''

# Write the app to file.
# The app source contains non-ASCII characters, so the encoding is stated
# explicitly instead of relying on the platform default.
with open('app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

print("‚úÖ Streamlit app created with Dashboard!")

# ============================================
# STEP 4: RUN WITH NGROK
# ============================================

print("\n" + "="*70)
print("üöÄ STARTING STREAMLIT APP")
print("="*70 + "\n")

from pyngrok import ngrok
import subprocess
import time

# Kill existing streamlit
!pkill -f streamlit 2>/dev/null

# Start streamlit
proc = subprocess.Popen(
    ['streamlit', 'run', 'app.py', '--server.port', '8501', '--server.headless', 'true'],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE
)

# Wait for it to start
time.sleep(8)

# Create ngrok tunnel
public_url = ngrok.connect(8501)

print("\n" + "="*70)
print("üéâ SUCCESS! APP IS RUNNING WITH DASHBOARD!")
print("="*70)
print(f"\nüåê Public URL: {public_url}")
print("\nüì± Click the link above!")
print("\n‚ö†Ô∏è  Keep this notebook running!")
print("\n" + "="*70 + "\n")

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
‚úÖ Streamlit app created with Dashboard!

üöÄ STARTING STREAMLIT APP

^C

üéâ SUCCESS! APP IS RUNNING WITH DASHBOARD!

üåê Public URL: NgrokTunnel: "https://avulsed-unendorsed-alpha.ngrok-free.dev" -> "http://localhost:8501"

üì± Click the link above!

‚ö†Ô∏è  Keep this notebook running!




In [None]:
from pyngrok import ngrok
# Tear down ngrok when finished with the app. This cell only calls into
# pyngrok; the Streamlit subprocess started earlier is not referenced here.
# NOTE(review): presumably that server keeps running until it is stopped
# separately -- confirm.
ngrok.kill()
