# Safe Browsing Model Testing Notebook

In [21]:
# Import necessary libraries
import numpy as np
import pandas as pd
import joblib
import os
import logging
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Import local modules (dataset.py / training.py).
# The directory that holds them is read from the SAFE_BROWSING_AI_DIR
# environment variable when it is set, so the notebook is not tied to one
# developer's home directory; the original absolute path stays as the default
# so existing setups keep working unchanged.
import sys
AI_MODULE_DIR = os.environ.get(
    'SAFE_BROWSING_AI_DIR',
    '/home/kasinadhsarma/safe_browsing/backend/ml/ai',
)
# Re-running this cell must not stack duplicate entries on sys.path.
if AI_MODULE_DIR not in sys.path:
    sys.path.append(AI_MODULE_DIR)
from dataset import generate_dataset, extract_url_features
from training import balance_dataset, evaluate_model, train_models, generate_evaluation_report, calculate_age_based_risk, ensemble_predict

# Configure logging: INFO and above, prefixed with timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

## Load and Prepare Dataset

In [22]:
# Generate the dataset and fail fast if nothing came back.
df = generate_dataset()
if df.empty:
    raise ValueError("Empty dataset generated")

# Feature columns fed to the models, in the order they appear in X.
feature_cols = [
    # Basic features
    'length', 'num_dots', 'num_digits', 'num_special', 'entropy',
    'token_count', 'avg_token_length', 'max_token_length', 'min_token_length',
    # Domain features
    'domain_length', 'has_subdomain', 'has_www', 'domain_entropy',
    'is_ip_address', 'domain_digit_ratio', 'domain_special_ratio',
    'domain_uppercase_ratio',
    # Path features
    'path_length', 'num_directories', 'path_entropy', 'has_double_slash',
    'directory_length_mean', 'directory_length_max', 'directory_length_min',
    'path_special_ratio',
    # Query features
    'num_params', 'query_length', 'has_suspicious_params',
    'param_entropy', 'param_special_ratio',
    # Security features
    'has_https', 'has_port', 'suspicious_tld', 'has_fragment',
    'has_redirect', 'has_obfuscation',
    # Content indicators
    'has_suspicious_words', 'suspicious_word_count', 'suspicious_word_ratio',
    'has_executable', 'has_archive',
    # Age-specific features
    'kid_unsafe_words', 'teen_unsafe_words', 'kid_unsafe_ratio',
    'teen_unsafe_ratio', 'kid_unsafe_score', 'teen_unsafe_score'
]

# Ensure every required column exists: absent ones are created and filled
# with 0. The fill is reported, because a constant-zero feature carries no
# signal and a silent fill would hide a schema change in generate_dataset().
missing_cols = [col for col in feature_cols if col not in df.columns]
if missing_cols:
    logging.warning(f"Feature columns missing from dataset, filled with 0: {missing_cols}")
for col in missing_cols:
    df[col] = 0

# Feature matrix and label vector as plain arrays.
X = df[feature_cols].values
y = df['is_blocked'].values

# Balance dataset
X_balanced, y_balanced = balance_dataset(X, y)

# Split dataset: 20% held out for testing, fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Scale features: fit on the training split only, then apply the same
# transform to the test split so no test statistics leak into the scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

logging.info("Dataset prepared and features scaled.")

## Train Models

In [None]:
# Train models.
# NOTE(review): train_models() is called without the X_train_scaled / y_train
# prepared earlier in this notebook, so it presumably builds its own training
# data — confirm its preprocessing matches the scaler used for evaluation.
models = train_models()
# Treat an empty result the same as None: the evaluation cell considers a
# falsy `models` to mean "no models available", so reporting success for an
# empty collection here would contradict it.
if not models:
    logging.error("Model training failed.")
else:
    logging.info("Models trained successfully.")

## Evaluate Models

In [None]:
# Score every trained model on the held-out split and chart its metrics.
if not models:
    logging.error("No models available for evaluation.")
else:
    # Bar labels and the matching keys read from each metrics result.
    bar_labels = ['Accuracy', 'Precision', 'Recall', 'F1']
    metric_keys = ['accuracy', 'precision', 'recall', 'f1']

    for model_name, model_data in models.items():
        # Each entry is expected to expose its fitted estimator under 'model'.
        y_pred = model_data['model'].predict(X_test_scaled)
        metrics = evaluate_model(y_test, y_pred, model_name)
        logging.info(f"{model_name} Metrics: {metrics}")

        # One bar chart per model; the y-axis is pinned to [0, 1] so charts
        # for different models are directly comparable.
        bar_heights = [metrics[key] for key in metric_keys]
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.bar(bar_labels, bar_heights)
        ax.set_title(f'{model_name} Performance Metrics')
        ax.set_ylim(0, 1)
        ax.set_ylabel('Score')
        plt.show()