<a href="https://colab.research.google.com/github/lavanya5454/phishingwebsite/blob/main/phishing_website_backend_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:


!pip install lightgbm xgboost wordcloud scikit-learn

import pandas as pd
import numpy as np
import re
import pickle
import time
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
import xgboost as xgb
from scipy.sparse import hstack, csr_matrix
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): nothing visible later in this notebook reads from /content/drive
# (the dataset is loaded from a relative path), so this cell looks optional — confirm.
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:

# Pipeline banner.
for banner_line in ("="*70, "MALICIOUS URL DETECTION SYSTEM", "="*70):
    print(banner_line)

print("\n[1/10] Loading dataset...")
# Read the labelled URL table and discard every row that has a missing value.
df = pd.read_csv('malicious_phish.csv').dropna()

print(f"‚úÖ Dataset loaded: {df.shape[0]:,} URLs")
print(f"\nClass Distribution:")
# One line per label: absolute count and share of the (cleaned) dataset.
for cls, count in df['type'].value_counts().items():
    share = count/len(df)*100
    print(f"  {cls:12s}: {count:6,} ({share:.1f}%)")

In [None]:
# Preview the first rows of the loaded table (columns used later: 'url' and 'type').
df.head()

In [None]:
# (rows, columns) of the table after rows with missing values were dropped.
df.shape

In [None]:


print("\n[2/10] Setting up feature extraction functions...")

def count_special_chars(url):
    """Return how many characters of ``url`` are not ASCII letters or digits.

    The argument is converted with ``str()`` first, so non-string values are
    accepted. Whitespace and non-ASCII characters count as "special".
    """
    return sum(1 for _ in re.finditer(r'[^a-zA-Z0-9]', str(url)))

def calculate_entropy(url):
    """Shannon entropy (in bits) of the character distribution of ``url``.

    Only characters with a code point below 256 contribute a term; their
    probabilities are still taken relative to the full string length. The
    terms are accumulated in ascending code-point order. Returns ``0`` for an
    empty string.
    """
    text = str(url)
    total = len(text)
    if total == 0:
        return 0

    # Tally only the code points the entropy sum considers (0..255).
    tallies = Counter(ch for ch in text if ord(ch) < 256)

    entropy = 0
    for ch in sorted(tallies):
        p = float(tallies[ch]) / total
        entropy += - p * np.log2(p)
    return entropy

def having_ip_address(url):
    """Return 1 if ``url`` contains an IP-address-like host, else 0.

    Three forms are recognised:
      * dotted-decimal IPv4 immediately followed by ``/`` (e.g. ``1.2.3.4/``),
      * dotted hexadecimal octets immediately followed by ``/``
        (e.g. ``0x7f.0x0.0x0.0x1/``),
      * a full, uncompressed IPv6 address (eight colon-separated hex groups).

    The argument is converted with ``str()`` first, like the other feature
    helpers, so non-string values (numbers, ``None``) yield 0 instead of
    raising ``TypeError``.
    """
    match = re.search(
        r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
        r'([01]?\d\d?|2[0-4]\d|25[0-5])\/)|'
        r'((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\/)|'
        r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}', str(url))
    return 1 if match else 0

def has_suspicious_tld(url):
    """Return 1 if any suspicious TLD marker occurs in the lower-cased URL, else 0.

    The check is a plain substring test over the whole URL, so a marker is
    also found inside a longer label (``.ga`` matches ``www.gap.com``).
    """
    lowered = str(url).lower()
    for marker in ('.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.work', '.click'):
        if marker in lowered:
            return 1
    return 0

def has_shortening_service(url):
    """Return 1 if a known URL-shortener name occurs in the lower-cased URL, else 0.

    This is a substring test over the whole URL; note that ``t.co`` is also
    contained in any host ending in ``t.com`` (e.g. ``microsoft.com``).
    """
    lowered = str(url).lower()
    for shortener in ('bit.ly', 'goo.gl', 'tinyurl', 'ow.ly', 't.co', 'buff.ly'):
        if shortener in lowered:
            return 1
    return 0

def preprocess_url(url):
    """Normalise a URL for TF-IDF: lower-case it, remove every ``http://`` /
    ``https://`` occurrence, then drop a leading ``www.``."""
    without_scheme = re.sub(r'https?://', '', str(url).lower())
    return re.sub(r'^www\.', '', without_scheme)

def extract_domain(url):
    """Extract the main domain (last two host labels) from a URL.

    Only the host part of the URL is considered: surrounding whitespace, a
    leading scheme, userinfo (``user@``), port, path, query string and
    fragment are removed first. This matters for the whitelist lookup:
    cutting only at ``/`` would turn ``http://evil.tk#.google.com`` into
    ``google.com``.

    Args:
        url: URL-like value; converted with ``str()``.

    Returns:
        str: ``'<label>.<tld>'`` when the host has at least two labels,
        otherwise the host itself (e.g. ``'localhost'``).
    """
    url = str(url).strip().lower()
    # Drop a leading scheme only; text further right belongs to path/query.
    url = re.sub(r'^[a-z][a-z0-9+.\-]*://', '', url)
    # The authority ends at the first path, query or fragment delimiter
    # (browsers treat a backslash like '/').
    authority = re.split(r'[/?#\\]', url, maxsplit=1)[0]
    # Everything up to the last '@' is userinfo, not host.
    host = authority.rsplit('@', 1)[-1]
    # Strip the port, but leave bracketed IPv6 literals untouched.
    if not host.startswith('['):
        host = host.split(':', 1)[0]
    host = re.sub(r'^www\.', '', host).rstrip('.')
    parts = host.split('.')
    if len(parts) >= 2:
        return '.'.join(parts[-2:])
    return host

print("‚úÖ Feature extraction functions ready")

In [None]:



print("\n[3/10] Engineering features...")

# Manual features
df['url_length'] = df['url'].apply(lambda x: len(str(x)))

# Occurrence counts of single characters, one column per character.
for column, symbol in [
    ('num_dots', '.'),
    ('num_hyphens', '-'),
    ('num_underscores', '_'),
    ('num_slashes', '/'),
    ('num_questions', '?'),
    ('num_equals', '='),
    ('num_at', '@'),
    ('num_ampersands', '&'),
]:
    # Bind the character as a default argument so each lambda counts its own symbol.
    df[column] = df['url'].apply(lambda x, s=symbol: str(x).count(s))

df['num_digits'] = df['url'].apply(lambda x: sum(c.isdigit() for c in str(x)))
df['digit_ratio'] = df['num_digits'] / df['url_length']

# Features computed by the helper functions defined in the previous cell.
for column, extractor in [
    ('num_special_chars', count_special_chars),
    ('entropy', calculate_entropy),
    ('use_of_ip', having_ip_address),
]:
    df[column] = df['url'].apply(extractor)

df['is_https'] = df['url'].apply(lambda x: 1 if 'https' in str(x).lower() else 0)
df['suspicious_tld'] = df['url'].apply(has_suspicious_tld)
df['has_shortening'] = df['url'].apply(has_shortening_service)

# Preprocessed URL for TF-IDF
df['url_preprocessed'] = df['url'].apply(preprocess_url)

# Column order here defines the layout of the manual part of the feature matrix.
feature_cols = ['url_length', 'num_dots', 'num_hyphens', 'num_underscores',
                'num_slashes', 'num_questions', 'num_equals', 'num_at',
                'num_ampersands', 'num_digits', 'digit_ratio', 'num_special_chars',
                'entropy', 'use_of_ip', 'is_https', 'suspicious_tld', 'has_shortening']

print(f"‚úÖ Created {len(feature_cols)} manual features")


In [None]:

print("\n[4/10] Creating TF-IDF features...")

# Character n-gram TF-IDF over the preprocessed URLs:
#   - 'char_wb' analyzer with n-grams of length 2..5,
#   - vocabulary capped at 5000 terms,
#   - terms must occur in at least 2 documents and in at most 95% of them,
#   - sublinear (log-scaled) term frequency.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 5),
    max_features=5000,
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

# NOTE(review): the vectorizer is fitted on the full table here, i.e. before the
# train/test split done in a later cell, so the held-out rows influence the
# vocabulary and IDF weights — consider fitting on the training split only.
tfidf_features = tfidf_vectorizer.fit_transform(df['url_preprocessed'])
print(f"‚úÖ TF-IDF features: {tfidf_features.shape[1]}")


In [None]:


print("\n[5/10] Combining all features...")

# Manual features first, TF-IDF columns after them; single-URL feature rows
# built elsewhere in this notebook use the same column order.
manual_features = csr_matrix(df[feature_cols].values)
# Request CSR explicitly: the combined matrix is row-indexed when sampling in
# the next step, and not every sparse format supports indexing.
all_features = hstack([manual_features, tfidf_features], format='csr')

print(f"‚úÖ Total features: {all_features.shape[1]} (Manual: {len(feature_cols)} + TF-IDF: {tfidf_features.shape[1]})")


In [None]:


print("\n[6/10] Encoding labels and splitting data...")

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['type'])

print("Label Mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"  {i} -> {label}")

# Use sample for faster training (adjust sample_size for full dataset)
sample_size = min(100000, len(y_encoded))
# Seeded generator so the sampled subset (and therefore every reported metric)
# is the same on each run; 42 matches the seed used for the split and the model.
rng = np.random.default_rng(42)
sample_indices = rng.choice(len(y_encoded), size=sample_size, replace=False)

# Row indexing needs CSR; tocsr() hands back a CSR view of the feature matrix.
X_sample = all_features.tocsr()[sample_indices]
y_sample = y_encoded[sample_indices]

# Stratified 80/20 split keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample
)

print(f"‚úÖ Training samples: {X_train.shape[0]:,}")
print(f"‚úÖ Testing samples: {X_test.shape[0]:,}")


In [None]:


print("\n[7/10] Training LightGBM classifier...")

# Wall-clock timer around model construction and fitting.
start_time = time.time()

# Gradient-boosted trees: 100 estimators, depth limit 7, up to 31 leaves per tree.
# class_weight='balanced' reweights classes to counter the label imbalance,
# random_state fixes the model's own randomness, verbose=-1 silences training logs.
model = LGBMClassifier(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.1,
    num_leaves=31,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

model.fit(X_train, y_train)

training_time = time.time() - start_time
print(f"‚úÖ Model trained in {training_time:.2f} seconds")

In [None]:
# Show the repr of the training feature matrix as a quick sanity check.
X_train

In [None]:

print("\n[8/10] Evaluating model performance...")

# Score the model on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n{'='*70}")
print(f"MODEL PERFORMANCE")
print(f"{'='*70}")
print(f"\n‚úÖ Overall Accuracy: {accuracy*100:.2f}%\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Confusion Matrix (rows: true label, columns: predicted label)
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_,
            ax=ax)
ax.set_title('Confusion Matrix - Malicious URL Detection', fontsize=14, fontweight='bold')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

In [None]:


print("\n[9/10] Setting up domain whitelist and rules...")

# Registered domains that the rule layer treats as safe without consulting the
# model. Entries are compared against the output of extract_domain(), i.e. the
# last two host labels in lower case.
KNOWN_LEGITIMATE_DOMAINS = {
    # Tech & Social Media
    'google.com', 'youtube.com', 'facebook.com', 'twitter.com', 'instagram.com',
    'linkedin.com', 'github.com', 'stackoverflow.com', 'reddit.com', 'pinterest.com',
    'tiktok.com', 'snapchat.com', 'discord.com', 'telegram.org', 'whatsapp.com',

    # E-commerce
    'amazon.com', 'ebay.com', 'alibaba.com', 'walmart.com', 'target.com',
    'etsy.com', 'shopify.com', 'bestbuy.com',

    # Finance
    'paypal.com', 'stripe.com', 'chase.com', 'bankofamerica.com', 'wellsfargo.com',
    'citibank.com', 'capitalone.com', 'americanexpress.com',

    # Streaming
    'netflix.com', 'hulu.com', 'disneyplus.com', 'spotify.com', 'soundcloud.com',
    'twitch.tv', 'vimeo.com', 'imdb.com',

    # News
    'cnn.com', 'bbc.com', 'nytimes.com', 'theguardian.com', 'reuters.com',
    'bloomberg.com', 'forbes.com', 'techcrunch.com', 'wired.com', 'theverge.com',

    # Education
    'wikipedia.org', 'coursera.org', 'udemy.com', 'khanacademy.org', 'edx.org',

    # Tech Companies
    'microsoft.com', 'apple.com', 'ibm.com', 'oracle.com', 'adobe.com',
    'salesforce.com', 'zoom.us', 'slack.com', 'dropbox.com',

    # Search Engines
    'bing.com', 'yahoo.com', 'duckduckgo.com', 'brave.com', 'mozilla.org',
}

# Words whose presence in a URL counts towards the "suspicious TLD + keywords" rule.
PHISHING_KEYWORDS = [
    'verify', 'confirm', 'update', 'secure', 'account', 'login', 'signin',
    'banking', 'suspended', 'locked', 'unusual', 'activity', 'alert',
]

# TLDs used by the rule layer. NOTE(review): this list is shorter than the one
# inside has_suspicious_tld() (which also has .xyz/.top/.work/.click) — confirm
# the difference is intentional.
SUSPICIOUS_TLDS = ['.tk', '.ml', '.ga', '.cf', '.gq']

print(f"‚úÖ Whitelist contains {len(KNOWN_LEGITIMATE_DOMAINS)} legitimate domains")


In [None]:

def analyze_url_security(url):
    """
    Rule-based pre-screen that runs before the ML model.

    Rules are evaluated in order and the first hit wins:
      1. registered domain is whitelisted                -> SAFE        (0.99)
      2. host ends in a suspicious TLD and the URL
         contains at least two phishing keywords         -> HIGH_RISK   (0.85)
      3. a whitelisted brand name appears inside a
         different registered domain                     -> HIGH_RISK   (0.80)
      4. the URL uses an IP address                      -> MEDIUM_RISK (0.75)

    Args:
        url: URL-like value; converted with ``str()``.

    Returns:
        dict | None: ``{'risk_level', 'reason', 'confidence'}`` for the first
        matching rule, or ``None`` when no rule applies.
    """
    url_lower = str(url).lower()
    domain = extract_domain(url)

    # Rule 1: Whitelisted domain
    if domain in KNOWN_LEGITIMATE_DOMAINS:
        return {
            'risk_level': 'SAFE',
            'reason': 'Whitelisted legitimate domain',
            'confidence': 0.99
        }

    # Rule 2: Suspicious TLD + phishing keywords.
    # The TLD is tested against the END of the host only. A substring test
    # over the whole URL would also fire on e.g. 'www.gap.com' ('.ga') or on
    # a path segment. The host is derived here from the raw URL: scheme,
    # userinfo, port, path, query and fragment are stripped.
    without_scheme = re.sub(r'^\s*https?://', '', url_lower)
    authority = re.split(r'[/?#]', without_scheme, maxsplit=1)[0]
    host = authority.rsplit('@', 1)[-1].split(':', 1)[0].strip().rstrip('.')
    if any(host.endswith(tld) for tld in SUSPICIOUS_TLDS):
        phishing_count = sum(1 for kw in PHISHING_KEYWORDS if kw in url_lower)
        if phishing_count >= 2:
            return {
                'risk_level': 'HIGH_RISK',
                'reason': 'Suspicious TLD with phishing keywords',
                'confidence': 0.85
            }

    # Rule 3: Typosquatting detection.
    # Iterate in sorted order so that, when several brands match, the reported
    # one is the same on every run (set iteration order is not stable).
    # NOTE(review): this is a substring heuristic, so unrelated domains that
    # merely contain a brand name (e.g. 'pineapple.com' vs 'apple') are flagged.
    for legit_domain in sorted(KNOWN_LEGITIMATE_DOMAINS):
        legit_name = legit_domain.split('.')[0]
        if legit_name in domain and domain != legit_domain:
            # youtu.be is YouTube's own short domain, not a squat.
            if not (legit_name == 'youtube' and domain == 'youtu.be'):
                return {
                    'risk_level': 'HIGH_RISK',
                    'reason': f'Possible typosquatting of {legit_domain}',
                    'confidence': 0.80
                }

    # Rule 4: IP address
    if having_ip_address(url):
        return {
            'risk_level': 'MEDIUM_RISK',
            'reason': 'Uses IP address instead of domain',
            'confidence': 0.75
        }

    return None

def predict_url(url):
    """
    Hybrid URL safety prediction: rule-based checks first, ML model otherwise.

    When a rule fires, the returned probability vector is synthetic: the
    rule's confidence is placed on the class mapped to its risk level and the
    remainder is spread evenly over the other classes.

    Args:
        url (str): URL to check

    Returns:
        tuple: (prediction, probabilities, reason)
    """
    # Risk level reported by the rule layer -> class label returned for it.
    label_for_risk = {
        'SAFE': 'benign',
        'HIGH_RISK': 'phishing',
        'MEDIUM_RISK': 'malware',
    }

    # Step 1: Rule-based analysis
    rule_hit = analyze_url_security(url)
    if rule_hit and rule_hit['risk_level'] in label_for_risk:
        verdict = label_for_risk[rule_hit['risk_level']]
        class_names = list(label_encoder.classes_)
        n_classes = len(class_names)
        probs = np.zeros(n_classes)

        target_idx = class_names.index(verdict)
        probs[target_idx] = rule_hit['confidence']
        for idx in range(n_classes):
            if idx != target_idx:
                probs[idx] = (1 - rule_hit['confidence']) / (n_classes - 1)
        return verdict, probs, rule_hit['reason']

    # Step 2: ML Model prediction
    text = str(url)
    length = len(text)
    digit_total = sum(ch.isdigit() for ch in text)

    # Plain character counts, keyed by the training column name.
    features = {
        name: text.count(symbol)
        for name, symbol in (
            ('num_dots', '.'),
            ('num_hyphens', '-'),
            ('num_underscores', '_'),
            ('num_slashes', '/'),
            ('num_questions', '?'),
            ('num_equals', '='),
            ('num_at', '@'),
            ('num_ampersands', '&'),
        )
    }
    features.update({
        'url_length': length,
        'num_digits': digit_total,
        'digit_ratio': digit_total / max(length, 1),
        'num_special_chars': count_special_chars(url),
        'entropy': calculate_entropy(url),
        'use_of_ip': having_ip_address(url),
        'is_https': 1 if 'https' in text.lower() else 0,
        'suspicious_tld': has_suspicious_tld(url),
        'has_shortening': has_shortening_service(url),
    })

    # Same layout as the training matrix: manual columns (in feature_cols
    # order) followed by the TF-IDF columns.
    manual_sparse = csr_matrix([[features[col] for col in feature_cols]])
    tfidf_sparse = tfidf_vectorizer.transform([preprocess_url(url)])
    X_single = hstack([manual_sparse, tfidf_sparse])

    pred_encoded = model.predict(X_single)[0]
    pred_label = label_encoder.inverse_transform([pred_encoded])[0]
    probs = model.predict_proba(X_single)[0]

    return pred_label, probs, 'Machine Learning prediction'

def check_url(url):
    """
    Run the hybrid predictor on one URL and package the outcome as a dict.

    Args:
        url (str): URL to check

    Returns:
        dict: keys 'url', 'domain', 'prediction', 'is_safe', 'confidence'
        (highest class probability), 'reason', 'whitelisted' and
        'probabilities' (class label -> probability).
    """
    label, class_probs, why = predict_url(url)
    main_domain = extract_domain(url)

    per_class = {}
    for idx in range(len(label_encoder.classes_)):
        per_class[label_encoder.classes_[idx]] = float(class_probs[idx])

    return {
        'url': url,
        'domain': main_domain,
        'prediction': label,
        'is_safe': label == 'benign',
        'confidence': float(max(class_probs)),
        'reason': why,
        'whitelisted': main_domain in KNOWN_LEGITIMATE_DOMAINS,
        'probabilities': per_class,
    }

print("‚úÖ Hybrid prediction system ready")


In [None]:


print("\n[10/10] Running demonstration tests...")

print(f"\n{'='*70}")
print("DEMONSTRATION - URL SAFETY CHECKER")
print(f"{'='*70}")

# Sample URLs covering the whitelist, rule-based and ML paths.
# (A stray TAB character that used to precede one entry was removed: it became
# part of the URL and distorted its length/entropy/special-character features.)
test_urls = [
    "https://web.whatsapp.com/",
    "http://bet8.pages.dev",
    "https://www.youtube.com/watch?v=AlBTv_eBPd4",
    "http://paypal-verify-account.tk",
    "https://gotoworkb.vip/z7mF79/#/",
    "https://richardsamuelmd.com/sso/login/",
    "http://192.168.0.100/admin"]

# ANSI escape that switches the terminal colour back to default.
reset = '\033[0m'

for url in test_urls:
    result = check_url(url)

    if result['is_safe']:
        status = "üü¢ SAFE"
        color = '\033[92m'  # Green
    else:
        status = "üî¥ MALICIOUS"
        color = '\033[91m'  # Red

    print(f"\n{color}{status}{reset}")
    print(f"URL: {url}")
    print(f"Prediction: {result['prediction'].upper()}")
    print(f"Confidence: {result['confidence']*100:.1f}%")
    print(f"Reason: {result['reason']}")
    if result['whitelisted']:
        print(f"‚úì Whitelisted domain")

In [None]:


print(f"\n{'='*70}")
print("SAVING MODEL")
print(f"{'='*70}")

# Everything the serving side needs to rebuild a feature row and decode the
# prediction, bundled into one pickle.
model_package = {
    'model': model,
    'tfidf_vectorizer': tfidf_vectorizer,
    'label_encoder': label_encoder,
    'feature_cols': feature_cols,
    'whitelist': KNOWN_LEGITIMATE_DOMAINS,
    'version': '1.0',
    # `accuracy` is measured on the held-out test split. 'test_accuracy' names
    # it correctly; 'training_accuracy' is kept so existing readers of the
    # package keep working.
    'test_accuracy': accuracy,
    'training_accuracy': accuracy,
    'training_date': time.strftime('%Y-%m-%d')
}

with open('malicious_url_detector.pkl', 'wb') as f:
    pickle.dump(model_package, f)

print("\n‚úÖ Model saved to 'malicious_url_detector.pkl'")

In [None]:


def interactive_checker():
    """
    Prompt for URLs in a loop and print the verdict for each one.

    Typing 'quit', 'exit' or 'q' (any case) ends the loop; empty input is
    ignored. Any exception raised while checking or printing a result is
    reported and the loop continues.
    """
    print(f"\n{'='*70}")
    print("INTERACTIVE URL SAFETY CHECKER")
    print(f"{'='*70}")
    print("Enter URLs to check (type 'quit' to exit)\n")

    while True:
        url = input("üîó Enter URL: ").strip()

        if url.lower() in ('quit', 'exit', 'q'):
            print("\nüëã Goodbye!")
            break

        if not url:
            continue

        try:
            result = check_url(url)

            print("\n" + "-"*70)
            headline = (
                "‚úÖ SAFE - This URL appears to be legitimate"
                if result['is_safe']
                else f"‚ö†Ô∏è  WARNING - This URL is classified as: {result['prediction'].upper()}"
            )
            print(headline)

            print(f"Confidence: {result['confidence']*100:.1f}%")
            print(f"Reason: {result['reason']}")

            print("\nDetailed Probabilities:")
            # Most likely class first; bar length is proportional to probability.
            ranked = sorted(result['probabilities'].items(), key=lambda pair: pair[1], reverse=True)
            for cls, prob in ranked:
                bar = '‚ñà' * int(prob * 50)
                print(f"  {cls:12s}: {prob*100:5.1f}% {bar}")
            print("-"*70 + "\n")

        except Exception as e:
            print(f"‚ùå Error: {e}\n")

In [None]:


print(f"\n{'='*70}")
print("PROJECT SUMMARY")
print(f"{'='*70}")

# Feature counts are read from the actual objects instead of being typed in,
# so the summary stays correct when feature_cols or the TF-IDF settings change.
print(f"""
‚úÖ PROJECT COMPLETED SUCCESSFULLY!

üìä Model Statistics:
   - Training Samples: {X_train.shape[0]:,}
   - Testing Samples: {X_test.shape[0]:,}
   - Features: {all_features.shape[1]}
   - Accuracy: {accuracy*100:.2f}%
   - Training Time: {training_time:.2f} seconds

üîß Features:
   - {len(feature_cols)} Manual engineered features
   - {tfidf_features.shape[1]} TF-IDF character n-grams
   - Hybrid rule-based + ML approach
   - Whitelist of {len(KNOWN_LEGITIMATE_DOMAINS)} legitimate domains

üéØ Usage:
   1. Simple check: result = check_url('https://example.com')
   2. Interactive: interactive_checker()
   3. Direct predict: prediction, probs, reason = predict_url('url')

üìÅ Saved Files:
   - malicious_url_detector.pkl (model file)

üöÄ Ready for deployment and demonstration!
""")

print(f"{'='*70}\n")



In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Authenticate ngrok.
# SECURITY: the authtoken must not be stored in the notebook. It is read from
# the NGROK_AUTH_TOKEN environment variable, or prompted for without echo.
# A token that was ever committed in a notebook should be revoked and
# regenerated in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_auth_token)


In [None]:
!pip install streamlit pyngrok lightgbm xgboost wordcloud scikit-learn pandas numpy seaborn matplotlib




In [None]:
 !pip install streamlit pyngrok lightgbm xgboost wordcloud scikit-learn


In [None]:
from pyngrok import ngrok

# Print the public URL of every tunnel pyngrok reports, then close it.
tunnels = ngrok.get_tunnels()
for t in tunnels:
    address = t.public_url
    print(address)
    # Kill each tunnel
    ngrok.disconnect(address)


In [None]:
from pyngrok import ngrok

# Close every tunnel pyngrok reports, announcing each one first.
for t in ngrok.get_tunnels():
    address = t.public_url
    print(f"Disconnecting tunnel: {address}")
    ngrok.disconnect(address)


In [None]:
# Open an ngrok tunnel to local port 8501 and print its public address.
# NOTE(review): nothing in this cell starts a server on that port, and the
# captured output below shows the call failing with ERR_NGROK_334 (endpoint
# already online), so an existing endpoint has to be stopped first.
public_url = ngrok.connect(8501)
print("Streamlit URL:", public_url.public_url)




PyngrokNgrokHTTPError: ngrok client exception, API returned 502: {"error_code":103,"status_code":502,"msg":"failed to start tunnel","details":{"err":"failed to start tunnel: The endpoint 'https://avulsed-unendorsed-alpha.ngrok-free.dev' is already online. Either\n1. stop your existing endpoint first, or\n2. start both endpoints with `--pooling-enabled` to load balance between them.\r\n\r\nERR_NGROK_334\r\n"}}


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Authenticate ngrok.
# SECURITY: the authtoken must not be stored in the notebook. It is read from
# the NGROK_AUTH_TOKEN environment variable, or prompted for without echo.
# A token that was ever committed in a notebook should be revoked and
# regenerated in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_auth_token)


In [None]:
# Cell 1 ‚Äî install dependencies (run once)
!pip install -q streamlit pyngrok lightgbm xgboost wordcloud scikit-learn

# small helper for Colab to show the streaming logs
import os
os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
print("‚úÖ Installed dependencies.")


‚úÖ Installed dependencies.


In [None]:
# Cell 2 ‚Äî write the Streamlit app file
%%writefile app.py
import streamlit as st
import pickle
import numpy as np
import re
from scipy.sparse import hstack, csr_matrix
import os

st.set_page_config(page_title="Malicious URL Detector", layout="wide")

# === Load model helper ===
@st.cache_resource(show_spinner=False)
def _read_model_package(path):
    """Unpickle the model package once per server process.

    ``st.cache_resource`` keeps a single shared instance, which is the cache
    meant for ML models; ``st.cache_data`` would re-serialise the package on
    every access.

    SECURITY: ``pickle.load`` executes code embedded in the file, so only load
    a package you produced yourself.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


def load_model(path="malicious_url_detector.pkl"):
    """Return the saved model package, or ``None`` if ``path`` does not exist.

    The existence check is deliberately outside the cached function: caching
    a ``None`` result would keep reporting "not found" after the file has been
    uploaded.
    """
    if not os.path.exists(path):
        return None
    return _read_model_package(path)

pkg = load_model()
if pkg is None:
    st.error("Model file 'malicious_url_detector.pkl' not found in working directory. Upload it to Colab or run training cells first.")
    st.stop()

# Unpack the pieces written by the training notebook.
model = pkg['model']
tfidf_vectorizer = pkg['tfidf_vectorizer']
label_encoder = pkg['label_encoder']
feature_cols = pkg['feature_cols']
KNOWN_LEGITIMATE_DOMAINS = set(pkg.get('whitelist', []))

# === Helper functions (same as notebook) ===
def count_special_chars(url):
    """Number of characters in ``str(url)`` that are not ASCII letters or digits."""
    non_alnum = re.compile(r'[^a-zA-Z0-9]')
    return sum(1 for _ in non_alnum.finditer(str(url)))

def calculate_entropy(url):
    """Shannon entropy (bits) of the characters of ``str(url)``.

    Only characters with a code point below 256 contribute; probabilities are
    relative to the full string length and terms are summed in ascending
    code-point order. Returns ``0.0`` for an empty string.
    """
    text = str(url)
    size = len(text)
    if size == 0:
        return 0.0

    # Count occurrences of the code points the sum considers (0..255).
    tallies = {}
    for ch in text:
        if ord(ch) < 256:
            tallies[ch] = tallies.get(ch, 0) + 1

    entropy = 0.0
    for ch in sorted(tallies):
        share = float(tallies[ch]) / size
        entropy += - share * np.log2(share)
    return entropy

def having_ip_address(url):
    """Return 1 if ``url`` contains an IP-address-like host, else 0.

    The pattern is the one used when the model's ``use_of_ip`` feature was
    computed for training, so the value fed to the model at prediction time
    means the same thing as during training. A looser pattern (any
    dotted-decimal quad anywhere in the URL) would set the feature for version
    numbers such as ``file-1.2.3.4.zip`` and skew the model input.

    Recognised forms: dotted-decimal IPv4 followed by ``/``, dotted hex octets
    followed by ``/``, and a full uncompressed IPv6 address.
    """
    match = re.search(
        r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
        r'([01]?\d\d?|2[0-4]\d|25[0-5])\/)|'
        r'((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\/)|'
        r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}',
        str(url))
    return 1 if match else 0

def has_suspicious_tld(url):
    """1 if any suspicious TLD marker occurs anywhere in the lower-cased URL, else 0.

    Substring test over the whole URL (``.ga`` also matches ``www.gap.com``).
    """
    lowered = str(url).lower()
    markers = ('.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.work', '.click')
    hits = [m for m in markers if m in lowered]
    return 1 if hits else 0

def has_shortening_service(url):
    """1 if a known shortener name occurs anywhere in the lower-cased URL, else 0.

    Substring test over the whole URL (``t.co`` is also contained in hosts
    ending in ``t.com``).
    """
    lowered = str(url).lower()
    shorteners = ('bit.ly', 'goo.gl', 'tinyurl', 'ow.ly', 't.co', 'buff.ly')
    hits = [s for s in shorteners if s in lowered]
    return 1 if hits else 0

def preprocess_url(url):
    """Lower-case the URL, remove every ``http://``/``https://`` occurrence and
    a leading ``www.`` — the text form fed to the TF-IDF vectorizer."""
    cleaned = str(url).lower()
    for pattern in (r'https?://', r'^www\.'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

def extract_domain(url):
    """Extract the main domain (last two host labels) from a URL.

    Only the host part is considered: surrounding whitespace, a leading
    scheme, userinfo, port, path, query string and fragment are removed first.
    Cutting only at ``/`` would turn ``http://evil.tk#.google.com`` into
    ``google.com`` and let it pass the whitelist.

    Returns ``'<label>.<tld>'`` for hosts with two or more labels, otherwise
    the host itself.
    """
    url = str(url).strip().lower()
    # Drop a leading scheme only; text further right belongs to path/query.
    url = re.sub(r'^[a-z][a-z0-9+.\-]*://', '', url)
    # The authority ends at the first path, query or fragment delimiter
    # (browsers treat a backslash like '/').
    authority = re.split(r'[/?#\\]', url, maxsplit=1)[0]
    # Everything up to the last '@' is userinfo, not host.
    host = authority.rsplit('@', 1)[-1]
    # Strip the port, but leave bracketed IPv6 literals untouched.
    if not host.startswith('['):
        host = host.split(':', 1)[0]
    host = re.sub(r'^www\.', '', host).rstrip('.')
    parts = host.split('.')
    return '.'.join(parts[-2:]) if len(parts) >= 2 else host

# Words whose presence in a URL counts towards the "suspicious TLD + keywords" rule.
PHISHING_KEYWORDS = [
    'verify', 'confirm', 'update', 'secure', 'account', 'login', 'signin',
    'banking', 'suspended', 'locked', 'unusual', 'activity', 'alert',
]
# TLDs used by the rule layer (shorter than the list inside has_suspicious_tld()).
SUSPICIOUS_TLDS = ['.tk', '.ml', '.ga', '.cf', '.gq']

def analyze_url_security(url):
    """Rule-based pre-screen that runs before the ML model.

    Rules are evaluated in order and the first hit wins. Returns a
    ``(risk_level, reason, confidence)`` tuple, or ``None`` when no rule
    applies.
    """
    url_lower = str(url).lower()
    domain = extract_domain(url)

    # Rule 1: Whitelisted domain
    if domain in KNOWN_LEGITIMATE_DOMAINS:
        return ('SAFE', 'Whitelisted legitimate domain', 0.99)

    # Rule 2: Suspicious TLD + phishing keywords.
    # The TLD is tested against the END of the host only. A substring test
    # over the whole URL would also fire on e.g. 'www.gap.com' ('.ga') or on
    # a path segment. The host is derived here from the raw URL: scheme,
    # userinfo, port, path, query and fragment are stripped.
    without_scheme = re.sub(r'^\s*https?://', '', url_lower)
    authority = re.split(r'[/?#]', without_scheme, maxsplit=1)[0]
    host = authority.rsplit('@', 1)[-1].split(':', 1)[0].strip().rstrip('.')
    if any(host.endswith(tld) for tld in SUSPICIOUS_TLDS):
        phishing_count = sum(1 for kw in PHISHING_KEYWORDS if kw in url_lower)
        if phishing_count >= 2:
            return ('HIGH_RISK', 'Suspicious TLD with phishing keywords', 0.85)

    # Rule 3: Typosquatting detection (simple substring heuristic).
    # Sorted iteration makes the reported brand stable when several match;
    # iterating the set directly gives a run-dependent order.
    for legit_domain in sorted(KNOWN_LEGITIMATE_DOMAINS):
        legit_name = legit_domain.split('.')[0]
        if legit_name in domain and domain != legit_domain:
            # youtu.be is YouTube's own short domain, not a squat.
            if not (legit_name == 'youtube' and domain == 'youtu.be'):
                return ('HIGH_RISK', f'Possible typosquatting of {legit_domain}', 0.80)

    # Rule 4: IP address
    if having_ip_address(url):
        return ('MEDIUM_RISK', 'Uses IP address instead of domain', 0.75)

    return None

def predict_with_model(url):
    """Classify one URL with the trained model.

    Builds the manual features in ``feature_cols`` order, appends the TF-IDF
    vector of the preprocessed URL, and runs the model on that single row.

    Returns:
        tuple: (predicted label, highest class probability as float,
        dict mapping each class label to its probability).
    """
    text = str(url)
    length = len(text)
    digit_total = sum(ch.isdigit() for ch in text)

    # Plain character counts, keyed by the training column name.
    features = {
        name: text.count(symbol)
        for name, symbol in (
            ('num_dots', '.'),
            ('num_hyphens', '-'),
            ('num_underscores', '_'),
            ('num_slashes', '/'),
            ('num_questions', '?'),
            ('num_equals', '='),
            ('num_at', '@'),
            ('num_ampersands', '&'),
        )
    }
    features.update({
        'url_length': length,
        'num_digits': digit_total,
        'digit_ratio': digit_total / max(length, 1),
        'num_special_chars': count_special_chars(url),
        'entropy': calculate_entropy(url),
        'use_of_ip': having_ip_address(url),
        'is_https': 1 if 'https' in text.lower() else 0,
        'suspicious_tld': has_suspicious_tld(url),
        'has_shortening': has_shortening_service(url),
    })

    manual_sparse = csr_matrix([[features[c] for c in feature_cols]])
    tfidf_sparse = tfidf_vectorizer.transform([preprocess_url(url)])
    X_single = hstack([manual_sparse, tfidf_sparse])

    encoded = model.predict(X_single)[0]
    label = label_encoder.inverse_transform([encoded])[0]
    probs = model.predict_proba(X_single)[0]
    by_class = {label_encoder.classes_[i]: float(p) for i, p in enumerate(probs)}
    return label, float(np.max(probs)), by_class

# === UI Layout ===
st.title("üîç Malicious URL Detection ‚Äî Hybrid (rules + ML)")
col1, col2 = st.columns([3,1])

with col1:
    url_input = st.text_input("Enter URL to analyze", value="https://www.example.com")
    if st.button("Analyze URL"):
        # Analyse the trimmed text: whitespace pasted around the URL is not
        # part of it and would otherwise end up in the feature values.
        url_to_check = url_input.strip()
        if not url_to_check:
            st.warning("Please enter a URL.")
        else:
            # Rules first; the model is consulted only when no rule fires.
            analysis = analyze_url_security(url_to_check)
            if analysis:
                risk, reason, conf = analysis
                if risk == 'SAFE':
                    displayed_label = 'benign'
                    st.success(f"üü¢ SAFE ‚Äî {reason} (confidence {conf*100:.1f}%)")
                elif risk == 'HIGH_RISK':
                    displayed_label = 'phishing'
                    st.error(f"üî¥ HIGH RISK ‚Äî {reason} (confidence {conf*100:.1f}%)")
                else:
                    displayed_label = 'malware'
                    st.warning(f"‚ö†Ô∏è MEDIUM RISK ‚Äî {reason} (confidence {conf*100:.1f}%)")

                # Build estimated probs vector (simple)
                classes = list(label_encoder.classes_)
                probs = {c: 0.0 for c in classes}
                probs[displayed_label] = conf
                # distribute remaining prob evenly (for display)
                remaining = (1 - conf)
                others = [c for c in classes if c != displayed_label]
                for o in others:
                    probs[o] = remaining / max(1, len(others))

                st.write("**Probabilities (estimated):**")
                for cls, p in sorted(probs.items(), key=lambda x: x[1], reverse=True):
                    bar = '‚ñà' * int(p * 40)
                    st.write(f"- {cls:12s}: {p*100:5.1f}% {bar}")

                st.write("---")
                st.write("**Reason:**", reason)
            else:
                pred_label, conf, probs_dict = predict_with_model(url_to_check)
                if pred_label == 'benign':
                    st.success(f"üü¢ SAFE ‚Äî {pred_label} (confidence {conf*100:.1f}%)")
                else:
                    st.error(f"üî¥ {pred_label.upper()} ‚Äî (confidence {conf*100:.1f}%)")

                st.write("**Probabilities:**")
                for cls, p in sorted(probs_dict.items(), key=lambda x: x[1], reverse=True):
                    bar = '‚ñà' * int(p * 40)
                    st.write(f"- {cls:12s}: {p*100:5.1f}% {bar}")

            # Domain is computed once and reused for both lines below.
            domain = extract_domain(url_to_check)
            st.write("---")
            st.write("**Domain:**", domain)
            st.write("**Whitelisted:**", domain in KNOWN_LEGITIMATE_DOMAINS)

with col2:
    st.markdown("### Quick examples")
    # Each button only shows an example to copy; the value stored in
    # session_state["url_example"] is not read by the input box above.
    # (The former no-op call to st.experimental_set_query_params() was dropped:
    # that API is deprecated in favour of st.query_params.)
    if st.button("Phishing example 1"):
        st.session_state["url_example"] = "http://paypal-verify-account.tk"
        st.warning("Example: http://paypal-verify-account.tk ‚Äî copy into main input and click Analyze")
    if st.button("Phishing example 2"):
        st.session_state["url_example"] = "http://google-secure-login.tk"
        st.warning("Example: http://google-secure-login.tk ‚Äî copy into main input and click Analyze")
    if st.button("Shortened example"):
        st.session_state["url_example"] = "https://bit.ly/confirm-account"
        st.warning("Example: https://bit.ly/confirm-account ‚Äî copy into main input and click Analyze")
    if st.button("Benign example (Google)"):
        st.session_state["url_example"] = "https://www.google.com"
        st.success("Example: https://www.google.com ‚Äî copy into main input and click Analyze")

    st.write("---")
    st.markdown("**Notes**")
    st.write("- Examples are synthetic for testing only.")
    st.write("- Do NOT paste real, active malicious URLs with tokens/credentials into public notebooks.")

st.caption("Model version: {}".format(pkg.get('version', 'unknown')))


In [None]:
# Cell 3 ‚Äî upload model if you haven't already, authenticate ngrok, and run Streamlit
from google.colab import files
import os

# If you don't have the model in the session, upload it now
if not os.path.exists("malicious_url_detector.pkl"):
    print("Upload 'malicious_url_detector.pkl' now (choose file).")
    uploaded = files.upload()  # interactively upload
    if "malicious_url_detector.pkl" not in uploaded:
        print("Make sure the uploaded filename is exactly 'malicious_url_detector.pkl'.")
    else:
        print("‚úÖ Uploaded model file.")

# Authenticate ngrok.
# The token is taken from the NGROK_AUTH_TOKEN environment variable so it never
# has to be typed into the notebook; the placeholder is only the fallback.
from pyngrok import ngrok, conf
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "YOUR_NGROK_AUTHTOKEN_HERE")
if NGROK_AUTH_TOKEN == "YOUR_NGROK_AUTHTOKEN_HERE":
    # Not fatal: a token configured by an earlier cell is still picked up by ngrok.
    print("‚ö†Ô∏è Please replace NGROK_AUTH_TOKEN with your actual token from https://dashboard.ngrok.com/get-started/your-authtoken")
else:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Close tunnels left over from a previous run of this cell; opening a second
# tunnel for an endpoint that is already online fails with ERR_NGROK_334.
for stale_tunnel in ngrok.get_tunnels():
    ngrok.disconnect(stale_tunnel.public_url)

# Start ngrok tunnel
print("Starting ngrok tunnel on port 8501...")
public_url = ngrok.connect(8501)
print("üåê Streamlit URL:", public_url.public_url)

# Launch Streamlit as a background process on the tunnelled port.
print("Launching Streamlit app (this runs in the background). Logs will appear below.")
get_ipython().system_raw('streamlit run app.py --server.port 8501 &')

# Give the server a moment to start before the user opens the link.
import time, subprocess, sys
time.sleep(2)
print("You can open the URL above. If it doesn't load immediately, wait a few seconds and refresh.")


# frontend 2

In [None]:
from pyngrok import ngrok

# Kill all active tunnels by terminating the ngrok process that pyngrok started.
ngrok.kill()

