# DNS Ad-Blocker Training

Train an XGBoost model to block ads at the DNS level using domain-name features.

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from enhanced_dns_features import EnhancedDNSFeatureExtractor as DNSFeatureExtractor
from tqdm.auto import tqdm

# Fix the seed of NumPy's global RNG so NumPy-driven randomness repeats across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
print("Libraries loaded")

## Load Dataset

In [None]:
# Read the labelled domain list and print a quick overview: row count,
# class balance, and the first ten rows.
df = pd.read_csv('../Data/dns_training_data.csv')

label_counts = df['label'].value_counts()
preview_rows = df.head(10)

print(f"Total domains: {len(df):,}")
print("\nLabel distribution:")
print(label_counts)
print("\nSample:")
print(preview_rows)

## Extract Features

In [None]:
# Turn every domain in df['domain'] into a feature dict and stack the dicts
# into features_df (one row per domain, same order as df).
extractor = DNSFeatureExtractor()

# Report the actual dataset size rather than a hard-coded figure.
print(f"Extracting features from {len(df):,} domains...\n")

features_list = []
failed_domains = []  # (domain, error) pairs, kept so failures are visible
for domain in tqdm(df['domain'], desc="Extracting"):
    try:
        features = extractor.extract_features(domain)
    except Exception as exc:
        # Best effort: an empty dict becomes an all-NaN row, which keeps
        # features_df aligned with df row-for-row. `Exception` (not a bare
        # `except:`) lets KeyboardInterrupt/SystemExit stop the loop.
        failed_domains.append((domain, repr(exc)))
        features = {}
    features_list.append(features)

features_df = pd.DataFrame(features_list)

if failed_domains:
    print(f"\nWarning: feature extraction failed for {len(failed_domains):,} domains")
    for failed_domain, error_text in failed_domains[:5]:
        print(f"  {failed_domain!r}: {error_text}")

print(f"\nFeatures extracted: {features_df.shape[1]}")
print("\nSample:")
print(features_df.head())

In [None]:
# Assemble the model inputs: labels come from the 'label' column, and any
# missing feature value is replaced with 0.
y = df['label']
X = features_df.fillna(value=0)

print(f"Feature matrix: {X.shape}")
print(f"Labels: {y.shape}")

## Train-Test Split

In [None]:
# Hold out 20% of the rows as a test set, stratified on the label so both
# partitions keep the same class balance; the fixed random_state makes the
# split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = len(X_train), len(X_test)
print(f"Training: {n_train:,} samples")
print(f"Test: {n_test:,} samples")

## Train XGBoost Model

In [None]:
# Carve a validation set out of the training data for early stopping.
# Stopping on the test set would tune the number of boosting rounds on the
# very data used for the final metrics, making them optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

dtrain = xgb.DMatrix(X_fit, label=y_fit)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',  # model outputs a probability for class 1
    'max_depth': 8,
    'learning_rate': 0.05,
    'min_child_weight': 3,
    'eval_metric': 'logloss',
    'seed': 42,
    'tree_method': 'hist',
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
}

# xgb.train uses the LAST entry of `evals` for early stopping, so the
# validation set must stay last. The test set is only logged, which keeps
# evals_result['test'] available for the loss curve.
evals = [(dtrain, 'train'), (dtest, 'test'), (dval, 'val')]
evals_result = {}

print("Training...\n")

model = xgb.train(
    params,
    dtrain,
    num_boost_round=200,
    evals=evals,
    evals_result=evals_result,
    early_stopping_rounds=20,
    verbose_eval=20
)

print("\nTraining complete")

## Training Loss Curve

In [None]:
# Plot the per-iteration log loss recorded during training for the train and
# test evaluation sets, then save the figure and display it.
fig, ax = plt.subplots(figsize=(10, 6))
for split_key, legend_label in (('train', 'Train'), ('test', 'Test')):
    ax.plot(evals_result[split_key]['logloss'], label=legend_label)

ax.set_xlabel('Iteration')
ax.set_ylabel('Log Loss')
ax.set_title('Training and Test Loss Over Time')
ax.legend()
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('loss_curve.png', dpi=150)
plt.show()

## Evaluate Performance

In [None]:
# Score the test set: model outputs above 0.5 are labelled 1, everything
# else 0, and the standard binary classification metrics are reported.
y_pred_proba = model.predict(dtest)
y_pred = (y_pred_proba > 0.5).astype(int)

accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

rule = "=" * 60
print(rule)
print("MODEL PERFORMANCE")
print(rule)
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    # Pad "<name>:" to 11 characters so the numbers line up in one column.
    heading = f"{metric_name}:"
    print(f"{heading:<11}{metric_value:.4f} ({metric_value*100:.2f}%)")
print(rule)

print("\nClassification Report:")
class_names = ['Legitimate', 'Ad']
print(classification_report(y_test, y_pred, target_names=class_names))

## Confusion Matrix

In [None]:
# Draw the confusion matrix (rows: true label, columns: predicted label) as a
# heatmap, then derive the false-positive and false-negative rates from it.
cm = confusion_matrix(y_test, y_pred)
tick_labels = ['Legitimate', 'Ad']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
fig.savefig('confusion_matrix.png', dpi=150)
plt.show()

# Row 0 holds the true negatives/false positives, row 1 the false
# negatives/true positives.
(tn, fp), (fn, tp) = cm
fpr = fp / (fp + tn)
fnr = fn / (fn + tp)

print(f"False Positive Rate: {fpr:.4f} ({fpr*100:.2f}%)")
print(f"False Negative Rate: {fnr:.4f} ({fnr*100:.2f}%)")

## Feature Importance

In [None]:
# Rank features by how often the trained trees split on them
# (importance_type='weight') and plot the top 15.
importance = model.get_score(importance_type='weight')

feature_importance = pd.DataFrame({
    'feature': list(importance.keys()),
    'importance': list(importance.values())
}).sort_values('importance', ascending=False)

feature_names = X.columns.tolist()
known_feature_names = set(feature_names)


def _resolve_feature_name(key):
    """Return the column name for an importance key.

    Keys that are already column names are returned unchanged. Only
    placeholder keys of the form 'f<index>' with an in-range index are
    mapped to feature_names[index]. Stripping every 'f' and calling int()
    on the rest would raise ValueError for any real column name that
    merely starts with 'f'.
    """
    if key in known_feature_names:
        return key
    suffix = key[1:]
    if key.startswith('f') and suffix.isascii() and suffix.isdigit():
        position = int(suffix)
        if position < len(feature_names):
            return feature_names[position]
    return key


feature_importance['feature_name'] = feature_importance['feature'].apply(
    _resolve_feature_name
)

print("Top 15 Features:")
print(feature_importance.head(15))

plt.figure(figsize=(10, 8))
top_features = feature_importance.head(15)
plt.barh(range(len(top_features)), top_features['importance'])
plt.yticks(range(len(top_features)), top_features['feature_name'])
plt.xlabel('Importance')
plt.title('Top 15 Most Important Features')
# Put the most important feature at the top of the chart.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=150)
plt.show()

## Test on Real Domains

In [None]:
# Spot-check the trained model on a handful of hand-picked domains and print
# the predicted class with its score for each.
test_domains = [
    'googleads.g.doubleclick.net',
    'pagead2.googlesyndication.com',
    'static.ads-twitter.com',
    'an.facebook.com',
    'adservice.google.com',
    'www.google.com',
    'api.github.com',
    'www.wikipedia.org',
    'mail.yahoo.com',
    'www.reddit.com'
]

separator = "=" * 70
print("Testing on example domains:\n")
print(separator)

# Use the training column order so each single-row frame matches the model.
feature_columns = X.columns
for domain in test_domains:
    row = pd.DataFrame([extractor.extract_features(domain)])
    row = row[feature_columns].fillna(0)
    pred_proba = model.predict(xgb.DMatrix(row))[0]

    if pred_proba > 0.5:
        prediction = "AD"
    else:
        prediction = "LEGITIMATE"

    print(f"{domain:45s} -> {prediction:12s} ({pred_proba:.3f})")

print(separator)

## Save Model

In [None]:
import json
import os

# Persist the trained booster in both .ubj and .json formats, plus the
# feature column order that inputs must follow at prediction time.
model_files = ('dns_adblocker_model.ubj', 'dns_adblocker_model.json')
for model_file in model_files:
    model.save_model(model_file)

with open('feature_names.json', 'w') as f:
    json.dump(X.columns.tolist(), f, indent=2)

# File sizes in kilobytes, for the summary below.
ubj_size, json_size = (os.path.getsize(path) / 1024 for path in model_files)

print("Model saved:")
for model_file, size_kb in zip(model_files, (ubj_size, json_size)):
    print(f"  {model_file} ({size_kb:.1f} KB)")
print("  feature_names.json")
print("\nModel ready for deployment")