# Pattern Analysis for Hierarchical Rule Generation

This notebook analyzes the DNS training dataset to extract discriminative patterns for building a hierarchical classifier.

## Goals:
1. Analyze TLD distribution between legitimate and malicious domains
2. Extract domain name patterns (length, special characters, entropy)
3. Identify subdomain characteristics
4. Calculate information gain for each feature
5. Generate candidate rules ordered by performance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from urllib.parse import urlparse
import re

# Read the labelled DNS training set from disk
df = pd.read_csv('../../Data/dns_training_data.csv')

# Report the table size, then the class balance.
print(f"Dataset shape: {df.shape}")
print("\nClass distribution:")
label_series = df['label']
print(label_series.value_counts())
# The mean of the label column is printed as the malicious share
# (assumes labels are 0/1 with 1 = malicious -- TODO confirm against the CSV).
print(f"\nMalicious ratio: {label_series.mean():.2%}")

## 1. TLD Analysis

In [None]:
def extract_tld(domain):
    """Return the top-level domain (the last label) of a domain name.

    The name is normalised before it is split: surrounding whitespace and a
    trailing root dot (``"example.com."``) are removed and the result is
    lower-cased, because DNS names compare case-insensitively.  Without this
    a fully-qualified name would report an empty TLD and ``"COM"`` / ``"com"``
    would be counted as two different TLDs.

    A name with a single label is returned whole.  Anything that is not a
    string (for example a missing value read as NaN) yields ``''``.
    """
    if not isinstance(domain, str):
        return ''
    name = domain.strip().rstrip('.').lower()
    # rpartition gives the text after the last dot, or the whole name when
    # there is no dot at all.
    return name.rpartition('.')[2]

# Tag every row with the TLD of its domain
df['tld'] = df['domain'].map(extract_tld)

# Per-TLD sample count, malicious count and malicious share.
# NOTE(review): values are rounded to 3 decimals *before* the threshold test
# below, so a ratio such as 0.7004 is stored as 0.7 and does not pass "> 0.7".
tld_stats = df.groupby('tld')['label'].agg(['count', 'sum', 'mean']).round(3)
tld_stats.columns = ['total_count', 'malicious_count', 'malicious_ratio']
tld_stats = tld_stats.sort_values('total_count', ascending=False)

print("Top 20 TLDs:")
print(tld_stats.head(20))

# High-risk TLDs (>70% malicious, min 10 samples), most malicious first
mostly_malicious = tld_stats['malicious_ratio'] > 0.7
enough_samples = tld_stats['total_count'] >= 10
high_risk_tlds = tld_stats[mostly_malicious & enough_samples]
high_risk_tlds = high_risk_tlds.sort_values('malicious_ratio', ascending=False)

print(f"\nHigh-risk TLDs (>70% malicious, n>=10): {len(high_risk_tlds)}")
print(high_risk_tlds.head(20))

## 2. Domain Pattern Analysis

In [None]:
def extract_domain_features(domain):
    """Compute lexical features for a single domain name.

    The name is normalised first (whitespace stripped, trailing root dot
    removed, lower-cased) so that ``"Example.COM."`` and ``"example.com"``
    produce identical features.  Without this a fully-qualified name would
    have its TLD mistaken for the main domain and gain a spurious subdomain.
    Non-string input (e.g. NaN from a missing CSV cell) is treated as an
    empty name instead of raising.

    The "main domain" is the label immediately left of the last label, or
    the only label when the name contains no dot.
    NOTE(review): multi-label public suffixes such as ``co.uk`` are not
    recognised, so ``example.co.uk`` has main domain ``co``.

    Returns:
        dict with keys, in this order: ``domain_length``,
        ``main_domain_length``, ``num_subdomains`` (labels left of the main
        domain), ``num_dots``, ``num_hyphens``, ``num_digits``, ``has_www``
        (1 if the name starts with ``www.``), ``digit_ratio`` (digits in the
        main domain divided by its length) and ``entropy`` (Shannon entropy
        in bits of the main domain's characters).
    """
    name = domain.strip().rstrip('.').lower() if isinstance(domain, str) else ''
    labels = name.split('.')

    # str.split always returns at least one element, so labels[0] is safe.
    main_domain = labels[-2] if len(labels) >= 2 else labels[0]
    main_len = len(main_domain)

    features = {
        'domain_length': len(name),
        'main_domain_length': main_len,
        'num_subdomains': max(len(labels) - 2, 0),
        'num_dots': name.count('.'),
        'num_hyphens': name.count('-'),
        'num_digits': sum(c.isdigit() for c in name),
        'has_www': 1 if name.startswith('www.') else 0,
        # Guard against an empty main domain (e.g. ".com") to avoid 0/0.
        'digit_ratio': (
            sum(c.isdigit() for c in main_domain) / main_len if main_len else 0
        ),
    }

    # Shannon entropy over the character frequencies of the main domain.
    if main_len:
        char_freq = Counter(main_domain)
        features['entropy'] = -sum(
            (freq / main_len) * np.log2(freq / main_len)
            for freq in char_freq.values()
        )
    else:
        features['entropy'] = 0

    return features

# Build one feature row per domain, in the same order as df
feature_rows = list(map(extract_domain_features, df['domain']))
feature_df = pd.DataFrame(feature_rows)

# Column-wise concat aligns on the index; feature_df carries a fresh 0..n-1
# index, so this relies on df having the same default index.
df_features = pd.concat([df, feature_df], axis=1)

print("Feature statistics by class:")
per_class_means = df_features.groupby('label')[feature_df.columns].mean()
print(per_class_means)

## 3. Information Gain Analysis

In [None]:
from sklearn.metrics import mutual_info_score
from sklearn.tree import DecisionTreeClassifier

# Information gain of a single (binned) feature with respect to the target
def calculate_information_gain(df, feature_col, target_col='label', bins=10):
    """Score how informative ``feature_col`` is about ``target_col``.

    The feature column is discretised with ``pd.cut`` (``bins`` is passed
    straight through; an integer requests that many equal-width intervals,
    and duplicate bin edges are dropped).  The result is the mutual
    information between the binned feature and the target as computed by
    scikit-learn's ``mutual_info_score``, which reports it in nats.

    Args:
        df: DataFrame holding both columns.
        feature_col: name of the numeric feature column to bin.
        target_col: name of the class-label column.
        bins: bin specification forwarded to ``pd.cut``.

    Returns:
        The mutual information score as a float.
    """
    binned_feature = pd.cut(df[feature_col], bins=bins, duplicates='drop')
    target = df[target_col]
    return mutual_info_score(target, binned_feature)

# Score every engineered feature against the label
ig_scores = {
    col: calculate_information_gain(df_features, col)
    for col in feature_df.columns
}

# One row per feature, most informative first
ig_df = pd.DataFrame(
    list(ig_scores.items()),
    columns=['feature', 'information_gain'],
).sort_values('information_gain', ascending=False)

print("\nInformation Gain for features:")
print(ig_df)

# Horizontal bar chart of the scores (bars are drawn in ig_df row order)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(ig_df['feature'], ig_df['information_gain'])
ax.set_xlabel('Information Gain')
ax.set_title('Feature Importance by Information Gain')
fig.tight_layout()
plt.show()

## 4. Decision Tree for Rule Extraction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.metrics import classification_report, confusion_matrix

# Feature matrix (engineered columns only) and target vector
X = df_features[feature_df.columns]
y = df_features['label']

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# A depth- and leaf-size-limited tree keeps the exported rules readable
tree_params = {
    'max_depth': 10,
    'min_samples_split': 50,
    'min_samples_leaf': 20,
    'random_state': 42,
}
dt = DecisionTreeClassifier(**tree_params)
dt.fit(X_train, y_train)

# Score the tree on the held-out rows
y_pred = dt.predict(X_test)
print("\nDecision Tree Performance:")
report = classification_report(
    y_test, y_pred, target_names=['Legitimate', 'Malicious']
)
print(report)

# Dump the fitted tree as indented if/else text and show its head
tree_rules = export_text(dt, feature_names=feature_df.columns.tolist())
print("\nExtracted Rules (first 50 lines):")
first_rule_lines = tree_rules.split('\n')[:50]
print('\n'.join(first_rule_lines))

## 5. Generate Hierarchical Rules

In [None]:
# Extract simple rules with performance metrics
# Column order of the rule table; also lets an empty result keep its schema.
_RULE_COLUMNS = ['type', 'condition', 'prediction', 'precision', 'coverage', 'score']


def _build_rule(df_features, mask, rule_type, condition, min_precision=None):
    """Return the rule record for the rows selected by ``mask``, or None.

    None is returned when the mask selects no rows, or when ``min_precision``
    is given and the rule's precision does not exceed it.
    """
    matched = int(mask.sum())
    if matched == 0:
        return None
    # Every rule predicts class 1, so precision is the mean label of the
    # matched rows (assumes 0/1 labels).
    precision = df_features.loc[mask, 'label'].mean()
    if min_precision is not None and not precision > min_precision:
        return None
    coverage = matched / len(df_features)
    return {
        'type': rule_type,
        'condition': condition,
        'prediction': 1,
        'precision': precision,
        'coverage': coverage,
        'score': precision * coverage,
    }


def generate_candidate_rules(df_features, tld_stats):
    """Generate candidate "predict malicious" rules with performance metrics.

    Four rule families are evaluated against ``df_features``:

    * one rule per high-risk TLD (``malicious_ratio > 0.7`` and
      ``total_count >= 10`` in ``tld_stats``);
    * ``domain_length > t`` for t in 10, 15, 20, 30, 40;
    * ``num_subdomains >= t`` for t in 2, 3, 4;
    * ``digit_ratio > t`` for t in 0.2, 0.3, 0.4, 0.5.

    Threshold rules are kept only when their precision exceeds 0.6; TLD
    rules are already filtered through ``tld_stats``.  Rules that match no
    rows are skipped.

    Args:
        df_features: DataFrame with columns ``label``, ``tld``,
            ``domain_length``, ``num_subdomains`` and ``digit_ratio``.
        tld_stats: DataFrame indexed by TLD with columns ``malicious_ratio``
            and ``total_count``.

    Returns:
        DataFrame with columns ``type``, ``condition``, ``prediction``,
        ``precision``, ``coverage`` and ``score`` (precision * coverage),
        sorted by ``score`` descending.  When no rule qualifies an empty
        DataFrame with these columns is returned (previously this raised
        ``KeyError`` while sorting).
    """
    candidates = []

    # Rule 1: High-risk TLDs
    high_risk = tld_stats[
        (tld_stats['malicious_ratio'] > 0.7) & (tld_stats['total_count'] >= 10)
    ]
    for tld in high_risk.index:
        candidates.append(_build_rule(
            df_features, df_features['tld'] == tld,
            'tld', f"TLD == '{tld}'",
        ))

    # Rule 2: Domain length based rules
    for threshold in (10, 15, 20, 30, 40):
        candidates.append(_build_rule(
            df_features, df_features['domain_length'] > threshold,
            'domain_length', f'domain_length > {threshold}',
            min_precision=0.6,
        ))

    # Rule 3: Number of subdomains
    for threshold in (2, 3, 4):
        candidates.append(_build_rule(
            df_features, df_features['num_subdomains'] >= threshold,
            'subdomain', f'num_subdomains >= {threshold}',
            min_precision=0.6,
        ))

    # Rule 4: Digit ratio
    for threshold in (0.2, 0.3, 0.4, 0.5):
        candidates.append(_build_rule(
            df_features, df_features['digit_ratio'] > threshold,
            'digit_ratio', f'digit_ratio > {threshold}',
            min_precision=0.6,
        ))

    rules = [rule for rule in candidates if rule is not None]
    # Passing the columns explicitly keeps 'score' present when rules is empty.
    return pd.DataFrame(rules, columns=_RULE_COLUMNS).sort_values(
        'score', ascending=False
    )

# Build the candidate rule table and show its best-scoring entries
rules_df = generate_candidate_rules(df_features, tld_stats)
num_rules = len(rules_df)
print(f"\nGenerated {num_rules} candidate rules")
print("\nTop 20 rules by score:")
top_rules = rules_df.head(20)
print(top_rules)

## 6. Save Results

In [None]:
# Save extracted patterns
from pathlib import Path

output_dir = Path('../data')
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before writing (no-op when it is already there).
output_dir.mkdir(parents=True, exist_ok=True)

# tld_stats keeps its index because the index holds the TLD itself;
# the other two tables carry no information in their index.
tld_stats.to_csv(output_dir / 'tld_analysis.csv')
rules_df.to_csv(output_dir / 'candidate_rules.csv', index=False)
ig_df.to_csv(output_dir / 'feature_importance.csv', index=False)

print("Analysis results saved to data/ directory")