# **Libraries**

In [None]:
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# **Pre-Processing Functions**

In [None]:
!pip install tldextract
# Add this import at the top of your file
import tldextract

# ---------- Helper functions for normalization ----------
def _ensure_scheme(url: str) -> str:
    """Ensure URL has a scheme so urlparse can find netloc."""
    url = url.strip()
    if not re.match(r'^[a-zA-Z]+://', url):
        return 'http://' + url
    return url

def get_registered_domain(url: str) -> str:
    """
    Return the registered domain (e.g. 'google.com', 'example.co.uk').

    tldextract is tried first. If it raises, or cannot determine both a
    domain and a suffix, the function falls back to the lower-cased host with
    any leading 'www.' removed. Returns '' when no host can be parsed.
    """
    url_with_scheme = _ensure_scheme(url)

    try:
        ext = tldextract.extract(url_with_scheme)
        if ext.domain and ext.suffix:
            return f"{ext.domain}.{ext.suffix}".lower()
    except Exception:
        # Deliberate best-effort: if tldextract fails (e.g. no internet to
        # fetch the suffix list), use the plain host below instead.
        pass

    try:
        # .hostname drops userinfo ('user:pw@'), the port and IPv6 brackets,
        # and lower-cases the result; splitting netloc on ':' would return
        # the userinfo part for 'http://google.com:pw@evil.com'.
        host = urlparse(url_with_scheme).hostname or ""
    except ValueError:
        # urlparse rejects malformed bracketed hosts; treat as "no host".
        return ""

    return host[4:] if host.startswith('www.') else host

def get_host(url: str) -> str:
    """Return the exact lower-cased host without port (e.g. 'mail.google.com').

    Userinfo ('user:password@') and IPv6 brackets are removed as well, so
    'http://google.com:pw@evil.com/' yields 'evil.com' rather than
    'google.com'. Returns '' when the URL has no host or cannot be parsed.
    """
    try:
        # .hostname already strips userinfo/port and lower-cases; it is None
        # when the netloc is empty.
        host = urlparse(_ensure_scheme(url)).hostname
    except ValueError:
        # urlparse rejects malformed bracketed hosts; treat as "no host".
        return ""
    return host or ""

# ---------- Whitelist normalization ----------
# keep your original raw whitelist entries here (can be host or just domain)
# Raw whitelist entries; each one is a bare host or domain string that is
# normalized later. Duplicates are harmless because the entries end up in sets.
whitelist_raw = [
    # — Global / Popular International Websites
    "google.com", "gmail.com", "youtube.com", "facebook.com",
    "twitter.com", "instagram.com", "linkedin.com", "wikipedia.org",
    "github.com", "stackoverflow.com", "reddit.com", "quora.com",
    "amazon.com", "amazon.co.uk", "apple.com", "microsoft.com",
    "netflix.com", "wikipedia.org", "bing.com", "whatsapp.com",
    "dropbox.com", "zoom.us", "paypal.com", "ebay.com",

    # — Pakistani Government & Official Portals
    "gov.pk", "nadra.gov.pk", "fbr.gov.pk", "election.gov.pk",
    "nadra.gov.pk", "pesb.gov.pk", "ophrd.gov.pk", "hec.gov.pk",
    "passport.gov.pk", "telenor.com.pk", "ptcl.com.pk", "ptic.org.pk",
    "epay.gov.pk", "ntec.gov.pk", "kp.gov.pk",

    # — Pakistani Academic & University Domains
    "comsats.edu.pk", "fast.edu.pk", "lums.edu.pk", "uok.edu.pk",
    "su.edu.pk", "punjab.gov.pk", "karachi.edu.pk", "gcuf.edu.pk",
    "nu.edu.pk", "uaf.edu.pk", "puanman.gov.pk", "qu.edu.pk",
    "smiu.edu.pk", "baqai.edu.pk", "iobm.edu.pk", "lhr.edu.pk",
    "qau.edu.pk", "cue.edu.pk", "iiu.edu.pk", "nu.edu.pk",

    # — Pakistani News & Media
    "dawn.com", "geo.tv", "tribune.com.pk", "express.com.pk", "thenews.com.pk",
    "jang.com.pk", "nawaiwaqt.com.pk", "urdupoint.com", "arynews.tv",
    # The bbc.com entry used to carry a "(Pakistan section)" note inside the
    # string, which made it impossible to match any host.
    "aa.com.tr", "bbc.com",  # global
    "pakistantoday.com.pk", "nation.com.pk", "business-standard.com",
    "gulftoday.ae", "pakobserver.net", "pkhope.com", "hipin.com.pk",
    "techjuice.pk",

    # — Pakistani Banks & Financial Services
    # NOTE(review): "albaraka banks.com.pk" and "askari bank.com.pk" contain a
    # space, so they can never match a real host. They are left unchanged
    # because the intended domains cannot be confirmed from this file.
    "hbl.com", "mcb.com.pk", "sc.com.pk", "albaraka banks.com.pk",
    "bankalfalah.com", "jsbl.com", "cf.com.pk", "askari bank.com.pk",
    "jubileebank.com.pk", "alhamratrustbank.com", "nrbcommercialbank.com",
    "sbbal.net", "bbacash.com", "sbl.com.pk",

    # — Pakistani Telecommunications & ISPs
    "ptcl.com.pk", "telenor.com.pk", "jazz.com.pk", "zong.com.pk",
    "ufone.com.pk", "stormfiber.com", "transworld.com.pk", "multinet.com.pk",
    "optic.com.pk", "comsats.net.pk",

    # — Pakistani E-Commerce and Classifieds
    "daraz.pk", "olx.com.pk", "pakwheels.com", "zameen.com",
    "rentrelo.com", "foodpanda.pk", "bykea.com", "groceryone.pk",
    "yayvo.com", "symbios.pk", "homeshopping.pk", "telemart.pk",

    # — Pakistani Government News & Public Service
    "psc.gov.pk", "secp.gov.pk", "ptiv.gov.pk", "nea.gov.pk",
    "ppra.org.pk", "nadra.gov.pk", "naf.gov.pk", "fpo.gov.pk",

    # — Pakistani Health & Public Sector (COVID, Health, etc.)
    "health.gov.pk", "nhsrc.gov.pk", "iedcr.gov.pk", "nicvd.org",
    "nhs.gov.pk", "police.gov.pk", "lahorepolice.gov.pk", "punjabpolice.gov.pk",

    # — Streaming & Entertainment (Pakistani)
    "pixflix.com", "pakfilms.net", "pakmag.net", "ptv.com.pk",
    "urduflix.com", "arydigital.tv",

    # — Additional Noteworthy Domains (span various Pakistani sectors)
    "pak-telecom.net", "paki.org", "studentsportal.edu.pk", "labour.gov.pk",
    "imranpk.com", "pakistan.gov.pk", "sindh.gov.pk", "sindheducation.gov.pk",
    "wasa.gov.pk", "lic.pak", "kennations.com", "citizenportal.gov.pk"
]


# Derive two lookup sets from the raw entries:
#   whitelist_domains - registered domains (google.com, example.co.uk)
#   whitelist_hosts   - exact hosts (mail.google.com), useful when only one
#                       specific subdomain should be whitelisted
whitelist_domains = {get_registered_domain(entry) for entry in whitelist_raw}
whitelist_hosts = {get_host(entry) for entry in whitelist_raw}

# ---------- Whitelist lookup ----------
def is_whitelisted(url: str) -> bool:
    """Report whether *url* is covered by the whitelist.

    A URL matches when its registered domain is in ``whitelist_domains`` or
    its exact host is in ``whitelist_hosts``.
    """
    exact_host = get_host(url)
    registered = get_registered_domain(url)
    if registered in whitelist_domains:
        return True
    return exact_host in whitelist_hosts



In [None]:
def preprocess_url(url: str) -> str:
    """Lower-case *url* and remove ``http://`` / ``https://`` markers.

    The input is converted with ``str`` first, so non-string values are
    accepted. Every occurrence of the scheme (plus an optional ``www.`` right
    after it) is removed, not only a leading one; a bare ``www.`` without a
    scheme is kept.
    """
    lowered = str(url).lower()
    scheme_pattern = r'https?://(www\.)?'
    return re.sub(scheme_pattern, '', lowered)


def extract_features(url: str) -> dict:
    """Extract handcrafted features from a URL.

    Returns a dict with:
      url_length  - number of characters in *url*
      num_digits  - count of digit characters
      num_dots    - count of '.'
      num_slashes - count of '/'
      has_https   - 1 if *url* starts with "https", else 0
      has_login   - 1 if "login" occurs anywhere in *url*, else 0
      tld         - last dot-separated label of the lower-cased host, or ""
                    when no host can be parsed

    The host is parsed whether or not *url* carries a scheme. The callers in
    this file pass scheme-less strings, for which plain ``urlparse`` leaves
    ``netloc`` empty and the TLD would always come out as "".

    NOTE(review): those callers strip "http(s)://" before calling this
    function, so ``has_https`` is 0 for them unless the remaining text itself
    starts with "https".
    """
    # urlparse only fills in the host when the string has a scheme or starts
    # with '//', so give scheme-less input a '//' prefix for parsing only.
    if re.match(r'^[a-zA-Z][a-zA-Z0-9+.\-]*://', url) or url.startswith('//'):
        parseable = url
    else:
        parseable = '//' + url

    try:
        # .hostname drops userinfo and port, so the port cannot leak into tld.
        host = urlparse(parseable).hostname or ""
    except ValueError:
        # Malformed bracketed hosts make urlparse raise; treat as "no host".
        host = ""

    return {
        "url_length": len(url),
        "num_digits": sum(c.isdigit() for c in url),
        "num_dots": url.count('.'),
        "num_slashes": url.count('/'),
        "has_https": int(url.startswith("https")),
        "has_login": int("login" in url),
        "tld": host.rsplit('.', 1)[-1] if host else ""
    }


# **Merging Two datasets**

# **Importing Datasets and Applying Functions**

In [None]:
# Load the labelled training data: column 0 holds the URLs, column 1 the labels.
train_df = pd.read_csv("train.csv")
X_raw = train_df.iloc[:, 0].values
y_raw = train_df.iloc[:, 1].values

# Normalize every raw URL before any feature is computed.
urls_clean = list(map(preprocess_url, X_raw))

# Character n-gram TF-IDF (3 to 5 characters, at most 20000 features).
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), max_features=20000)
X_text = vectorizer.fit_transform(urls_clean)

# Handcrafted features, one row per cleaned URL.
df_features = pd.DataFrame(list(map(extract_features, urls_clean)))

# The TLD is categorical, so one-hot encode it; TLDs unseen at fit time are
# ignored when transforming later data.
ohe_tld = OneHotEncoder(handle_unknown="ignore")
X_tld = ohe_tld.fit_transform(df_features.loc[:, ["tld"]])

# Everything except the TLD column is already numeric.
X_num = df_features.drop("tld", axis=1).values

# Column order (text, numeric, TLD) must be repeated for any data fed to the model.
X_final = hstack((X_text, X_num, X_tld))

# **Encoding Target Variable**

In [None]:
# One-hot encode the target labels; the categories are learned from y_raw and
# y_encoded is a dense 2-D array with one column per category.
# NOTE(review): fitting a classifier on a 2-D one-hot target presumably makes
# scikit-learn treat it as a multilabel problem, so predict() may return rows
# with no positive class. Confirm, and consider a 1-D LabelEncoder target.
ohe_y = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
y_encoded = ohe_y.fit_transform(y_raw.reshape(-1, 1))
# NOTE(review): LabelEncoder is imported here but not used in the visible cells.
from sklearn.preprocessing import LabelEncoder


# **Train/Test Split**

In [None]:

# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is reproducible. No stratification is requested.
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_encoded, test_size=0.2, random_state=42
)

# **Model Training**

In [None]:
# Random forest of 100 gini trees; class_weight="balanced" asks scikit-learn
# to reweight classes, n_jobs=-1 uses all available cores, and random_state
# makes training reproducible.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)
# y_train is the one-hot encoded target produced by ohe_y.
clf.fit(X_train, y_train)

# **Evaluate on Validation Split**

In [None]:
# The model was fitted on a one-hot (multi-output) target, so clf.predict()
# can return an all-zero row when no per-class output votes positive, and
# argmax over such a row silently picks class 0. Rank the classes by their
# positive-class probability instead, so every row gets its most likely class.
class_scores = np.column_stack([
    # Each output has its own classes_ array; take the column for value 1
    # (or zeros if that output never saw a positive example during fit).
    proba[:, list(values).index(1)] if 1 in values else np.zeros(proba.shape[0])
    for values, proba in zip(clf.classes_, clf.predict_proba(X_test))
])
y_pred_indices = np.argmax(class_scores, axis=1)
y_test_indices = np.argmax(y_test, axis=1)

# Keep y_pred_encoded available as a strict one-hot matrix of the chosen class.
y_pred_encoded = np.zeros_like(class_scores)
y_pred_encoded[np.arange(y_pred_indices.size), y_pred_indices] = 1

# Map indices back to original labels using encoder categories
labels = ohe_y.categories_[0]
y_pred_decoded = labels[y_pred_indices]
y_test_decoded = labels[y_test_indices]

# Confusion matrix & accuracy
cm_val = confusion_matrix(y_test_decoded, y_pred_decoded)
acc_val = accuracy_score(y_test_decoded, y_pred_decoded)

print("Validation Confusion Matrix:\n", cm_val)
print("Validation Accuracy:", acc_val)


Validation Confusion Matrix:
 [[3376   48    2  187]
 [ 176 3402    0   10]
 [  55    1 3516  106]
 [ 504   39    8 3055]]
Validation Accuracy: 0.9215740421125302


# **Test on External Test Dataset**

In [None]:
# Load the external test set and normalize its "url" column the same way as
# the training URLs.
test_df = pd.read_csv("test.csv")
test_urls = list(map(preprocess_url, test_df["url"]))

# Build the feature matrix with the already-fitted vectorizer and TLD encoder
# (transform only, no refitting), in the order text, numeric, TLD.
test_features = pd.DataFrame(list(map(extract_features, test_urls)))
test_X_text = vectorizer.transform(test_urls)
test_X_tld = ohe_tld.transform(test_features.loc[:, ["tld"]])
test_X_num = test_features.drop("tld", axis=1).values
test_X_final = hstack((test_X_text, test_X_num, test_X_tld))


# **Encode test labels using fitted encoder**

In [None]:
# Encode the "type" column with the encoder fitted on the training labels.
# NOTE(review): with handle_unknown="ignore", a label not seen during fit
# presumably becomes an all-zero row, which a later argmax would read as
# class 0. Confirm that test.csv uses only known labels.
test_y_encoded = ohe_y.transform(test_df["type"].values.reshape(-1, 1))

# **Predictions on Test Dataset: Confusion Matrix and Accuracy**

In [None]:
# The model was fitted on a one-hot (multi-output) target, so clf.predict()
# can return an all-zero row when no per-class output votes positive, and
# argmax over such a row silently picks class 0. Rank the classes by their
# positive-class probability instead, so every row gets its most likely class.
test_class_scores = np.column_stack([
    # Each output has its own classes_ array; take the column for value 1
    # (or zeros if that output never saw a positive example during fit).
    proba[:, list(values).index(1)] if 1 in values else np.zeros(proba.shape[0])
    for values, proba in zip(clf.classes_, clf.predict_proba(test_X_final))
])

# Convert to class indices
test_y_pred_indices = np.argmax(test_class_scores, axis=1)
test_y_indices = np.argmax(test_y_encoded, axis=1)

# Keep test_y_pred_encoded available as a strict one-hot matrix of the chosen class.
test_y_pred_encoded = np.zeros_like(test_class_scores)
test_y_pred_encoded[np.arange(test_y_pred_indices.size), test_y_pred_indices] = 1

# Map indices back to labels
labels = ohe_y.categories_[0]
test_y_pred = labels[test_y_pred_indices]
test_y_decoded = labels[test_y_indices]

# Evaluate
cm_test = confusion_matrix(test_y_decoded, test_y_pred)
acc_test = accuracy_score(test_y_decoded, test_y_pred)

print("\nTest Confusion Matrix:\n", cm_test)
print("Test Accuracy:", acc_test)



Test Confusion Matrix:
 [[2815   38    1  146]
 [ 138 2856    1    5]
 [  47    2 2862   89]
 [ 464   25    6 2505]]
Test Accuracy: 0.9198333333333333


# **Predicting Function for a Single URL**

In [None]:
def predict_single_url(url: str, clf, vectorizer, ohe_tld, ohe_y):
    """Predict the class of a single URL (no whitelist check).

    NOTE: a function with the same name is defined again further down in
    this file (with a whitelist check); once that later definition runs, it
    replaces this one.

    Args:
        url: Raw URL string.
        clf: Fitted classifier; assumed to have been trained on the one-hot
            target produced by ``ohe_y`` (one output per class).
        vectorizer: Fitted text vectorizer applied to the cleaned URL.
        ohe_tld: Fitted encoder for the ``tld`` feature.
        ohe_y: Fitted label encoder whose ``categories_[0]`` lists the labels.

    Returns:
        The label from ``ohe_y.categories_[0]`` with the highest
        positive-class probability.
    """
    # 1. Preprocess
    clean_url = preprocess_url(url)

    # 2. Extract features
    df_feat = pd.DataFrame([extract_features(clean_url)])

    # 3./4. Transform with the fitted objects and combine in the same column
    # order used for training: text, numeric, TLD.
    X_final = hstack([
        vectorizer.transform([clean_url]),
        df_feat.drop(columns=["tld"]).values,
        ohe_tld.transform(df_feat[["tld"]]),
    ])

    # 5. Predict. clf.predict() on a one-hot target can return an all-zero
    # row, which argmax would silently read as class 0; pick the class with
    # the highest positive-class probability instead.
    class_scores = [
        proba[0, list(values).index(1)] if 1 in values else 0.0
        for values, proba in zip(clf.classes_, clf.predict_proba(X_final))
    ]
    return ohe_y.categories_[0][int(np.argmax(class_scores))]


In [None]:
def predict_single_url(url: str, clf, vectorizer, ohe_tld, ohe_y):
    """Predict the class of a single URL with whitelist check.

    Args:
        url: Raw URL string.
        clf: Fitted classifier; assumed to have been trained on the one-hot
            target produced by ``ohe_y`` (one output per class).
        vectorizer: Fitted text vectorizer applied to the cleaned URL.
        ohe_tld: Fitted encoder for the ``tld`` feature.
        ohe_y: Fitted label encoder whose ``categories_[0]`` lists the labels.

    Returns:
        The string "benign" when ``is_whitelisted(url)`` is true (the model
        is not consulted); otherwise the label from ``ohe_y.categories_[0]``
        with the highest positive-class probability.
    """
    # 0. Whitelist check first
    # NOTE(review): assumes "benign" is one of the dataset's label names.
    if is_whitelisted(url):
        return "benign"

    # 1. Preprocess
    clean_url = preprocess_url(url)

    # 2. Extract features
    df_feat = pd.DataFrame([extract_features(clean_url)])

    # 3./4. Transform with the fitted objects and combine in the same column
    # order used for training: text, numeric, TLD.
    X_final = hstack([
        vectorizer.transform([clean_url]),
        df_feat.drop(columns=["tld"]).values,
        ohe_tld.transform(df_feat[["tld"]]),
    ])

    # 5. Predict. clf.predict() on a one-hot target can return an all-zero
    # row, which argmax would silently read as class 0; pick the class with
    # the highest positive-class probability instead.
    class_scores = [
        proba[0, list(values).index(1)] if 1 in values else 0.0
        for values, proba in zip(clf.classes_, clf.predict_proba(X_final))
    ]
    return ohe_y.categories_[0][int(np.argmax(class_scores))]


# **Predicting a new URL**

In [None]:
# Classify one example URL with the fitted model, vectorizer and encoders.
# NOTE(review): the output recorded below is a NameError for
# predict_single_url, so the cell defining it had presumably not been run in
# that session. Run the definition cell first.
sample_url = "quickfacts.census.gov/qfd/maps/iowa_map.html"
prediction = predict_single_url(sample_url, clf, vectorizer, ohe_tld, ohe_y)
print("Predicted class:", prediction)


NameError: name 'predict_single_url' is not defined