
# Security Data Analysis Workshop
## Blue Team Focus: When to use AI/ML vs Basic Methods

This notebook demonstrates when to use simple statistical methods vs machine learning 
for cybersecurity analysis. Each module compares both approaches with realistic datasets.

## Setup and Imports ##

In [19]:
import random
import time
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

# ML libraries
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import classification_report


In [20]:

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Reset matplotlib to its stock style, then switch seaborn to the "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Set random seeds for reproducibility
# The data generators below draw from the stdlib `random` module; NumPy's
# global RNG is seeded as well. Generated timestamps are still anchored to
# datetime.now(), so they differ from run to run.
random.seed(42)
np.random.seed(42)
print("✅ Libraries imported successfully!")



✅ Libraries imported successfully!


# ---
# Module 1: The Overwhelmed SOC Analyst
# **Persona**: Junior analyst drowning in 50,000 daily log entries
# 
# **Scenario**: You have network traffic logs and need to quickly identify suspicious activity.
# 
# **Question**: Should you use Excel pivot tables or machine learning?

# 1.1 Generate Network Traffic Dataset
# ---

In [21]:
def generate_network_traffic_data(num_records=10000):
    """Generate synthetic network traffic data with hidden anomalies.

    About 90% of the rows are "normal" flows from 192.168.1.x hosts to
    10.0.x.x hosts on common service ports with 64-1500 byte payloads. The
    remaining rows are suspicious and always use an unusual destination port:
    roughly 40% of them originate from three fixed external IPs with large
    transfers, the rest are internal-to-internal flows (lateral movement).

    Timestamps are spread over the 24 hours preceding the call, so they depend
    on the current clock even when the random seed is fixed.

    Args:
        num_records: Total number of rows to generate.

    Returns:
        pandas.DataFrame: shuffled rows with the columns ``timestamp``
        (datetime, second resolution), ``source_ip``, ``destination_ip``,
        ``destination_port``, ``bytes`` and ``protocol``. The frame has
        exactly ``num_records`` rows (an empty frame with the same columns
        when ``num_records`` is 0).
    """
    columns = ['timestamp', 'source_ip', 'destination_ip',
               'destination_port', 'bytes', 'protocol']

    # Normal traffic patterns
    normal_sources = [f"192.168.1.{i}" for i in range(10, 100)]
    normal_destinations = [f"10.0.{i}.{j}" for i in range(1, 5) for j in range(10, 50)]
    common_ports = [80, 443, 22, 25, 53, 110, 143, 993, 995]

    # Generate timestamps over last 24 hours
    start_time = datetime.now() - timedelta(hours=24)

    # Truncating both shares independently can drop a row (e.g. 15 -> 13 + 1),
    # so the suspicious share is whatever remains after the normal share.
    normal_count = int(num_records * 0.90)
    suspicious_count = int(num_records) - normal_count

    data = []

    # Normal traffic (90% of data)
    for _ in range(normal_count):
        timestamp = start_time + timedelta(seconds=random.randint(0, 86400))
        source = random.choice(normal_sources)
        destination = random.choice(normal_destinations)
        port = random.choice(common_ports)
        bytes_sent = random.randint(64, 1500)

        data.append({
            'timestamp': timestamp.strftime('%Y-%m-%d %H:%M:%S'),
            'source_ip': source,
            'destination_ip': destination,
            'destination_port': port,
            'bytes': bytes_sent,
            'protocol': 'TCP'
        })

    # Suspicious traffic (remaining ~10% of data) - More obvious for Excel detection
    suspicious_ips = ['185.220.70.43', '194.233.164.24', '91.234.99.12']
    suspicious_ports = [1337, 4444, 8080, 9999, 31337]

    for _ in range(suspicious_count):
        timestamp = start_time + timedelta(seconds=random.randint(0, 86400))

        if random.random() < 0.4:  # External suspicious IP
            source = random.choice(suspicious_ips)
            destination = random.choice(normal_destinations)
            port = random.choice(suspicious_ports)
            bytes_sent = random.randint(5000, 50000)  # Much larger transfers
        else:  # Internal lateral movement (workstation to workstation)
            source = random.choice(normal_sources)
            destination = random.choice(normal_sources)
            port = random.choice(suspicious_ports)
            bytes_sent = random.randint(2000, 10000)

        data.append({
            'timestamp': timestamp.strftime('%Y-%m-%d %H:%M:%S'),
            'source_ip': source,
            'destination_ip': destination,
            'destination_port': port,
            'bytes': bytes_sent,
            'protocol': 'TCP'
        })

    # Passing the column list keeps the schema intact even when no rows exist.
    df = pd.DataFrame(data, columns=columns)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    if df.empty:
        return df
    return df.sample(frac=1).reset_index(drop=True)  # Shuffle the data

In [45]:
# Generate the dataset
# NOTE(review): the Module 1 persona text mentions 50,000 daily log entries,
# while this call generates 500,000 rows - confirm which size is intended.
traffic_df = generate_network_traffic_data(500000)
print(f"📊 Generated {len(traffic_df):,} network traffic records")
print(f"📅 Time range: {traffic_df['timestamp'].min()} to {traffic_df['timestamp'].max()}")

# Save to CSV for Excel analysis
# The file is written relative to the notebook's current working directory.
traffic_df.to_csv('network_traffic.csv', index=False)
print("💾 Saved to network_traffic.csv")

# Display sample
print("\n🔍 Sample of the data:")
display(traffic_df.head())

📊 Generated 500,000 network traffic records
📅 Time range: 2025-07-16 09:10:55 to 2025-07-17 09:10:55
💾 Saved to network_traffic.csv

🔍 Sample of the data:


Unnamed: 0,timestamp,source_ip,destination_ip,destination_port,bytes,protocol
0,2025-07-16 14:00:45,192.168.1.23,10.0.4.24,53,1481,TCP
1,2025-07-16 22:08:13,192.168.1.27,10.0.1.21,53,1129,TCP
2,2025-07-16 13:43:28,192.168.1.16,10.0.1.38,25,1268,TCP
3,2025-07-17 07:31:23,192.168.1.82,10.0.2.23,25,1498,TCP
4,2025-07-17 06:31:47,192.168.1.79,10.0.2.10,25,472,TCP


## 1.2 Basic Analysis (Excel-Style in Python)
**Let's generate some basic statistics! You could use a tool like Excel for this**
**type of analysis, but you can do the same thing with Python and pandas.**

Let's look for:
- Top talkers by bytes sent
- Unusual ports (simple frequency analysis)
- Large transfers (95th percentile threshold)
- Quick suspicious IP identification based on a pre-determined list of "suspicious" IP prefixes

Pros:
- Very quick analysis
- Summary statistics
- Explainable!

Cons:
- No insight into complex behavior patterns
- Must pre-determine statistics you want to examine


In [46]:
def basic_traffic_analysis(df):
    """Basic analysis that mirrors Excel pivot tables and filters.

    Prints four summaries and returns the underlying tables:
    top talkers by bytes, rarely used destination ports, transfers above the
    95th percentile of ``bytes``, and traffic whose source IP starts with one
    of a fixed set of "suspicious" prefixes.

    Args:
        df: DataFrame with at least the columns ``source_ip``,
            ``destination_port`` and ``bytes``.

    Returns:
        dict with the keys ``top_sources`` (per-source totals sorted by bytes),
        ``unusual_ports`` (ports seen at most 5 times), ``large_transfers``
        (rows above the 95th percentile) and ``external_ips`` (rows whose
        source IP matches a suspicious prefix).
    """

    print("🔎 BASIC ANALYSIS (Excel-style approach)")
    print("=" * 50)

    # Analysis 1: Top talkers by bytes sent
    print("\n📈 TOP 10 SOURCES BY TOTAL BYTES:")
    top_sources = df.groupby('source_ip')['bytes'].agg(['sum', 'count', 'mean']).round(2)
    top_sources.columns = ['total_bytes', 'connections', 'avg_bytes']
    top_sources = top_sources.sort_values('total_bytes', ascending=False)
    print(top_sources.head(10))

    # Analysis 2: Unusual ports (simple frequency analysis)
    print("\n🔍 UNUSUAL PORTS (used ≤ 5 times):")
    port_counts = df['destination_port'].value_counts()
    unusual_ports = port_counts[port_counts <= 5]
    print(f"Found {len(unusual_ports)} unusual ports:")
    print(unusual_ports.head(10))

    # Analysis 3: Large transfers (95th percentile threshold)
    threshold = df['bytes'].quantile(0.95)
    large_transfers = df[df['bytes'] > threshold]
    print(f"\n📊 LARGE TRANSFERS (> {threshold:,.0f} bytes):")
    print(f"Found {len(large_transfers)} large transfers")

    large_by_source = large_transfers.groupby('source_ip').agg({
        'bytes': ['count', 'sum', 'max'],
        'destination_port': lambda x: list(x.unique())
    })
    large_by_source.columns = ['transfer_count', 'total_bytes', 'max_bytes', 'ports_used']
    print(large_by_source.head())

    # Quick suspicious IP identification.
    # Match the prefixes literally and only at the start of the address: a
    # regex such as '91.' would also hit the middle of e.g. '10.0.191.4',
    # because '.' matches any character and the search is unanchored.
    suspicious_prefixes = ('185.', '194.', '91.', '203.')
    is_external = df['source_ip'].str.startswith(suspicious_prefixes, na=False)
    external_ips = df[is_external]

    print(f"\n🚨 EXTERNAL SOURCE IPs:")
    if len(external_ips) > 0:
        ext_summary = external_ips.groupby('source_ip').agg({
            'bytes': ['count', 'sum'],
            'destination_port': lambda x: list(x.unique())
        })
        ext_summary.columns = ['connections', 'total_bytes', 'ports_used']
        print(ext_summary)
    else:
        print("No external IPs detected with basic pattern matching")

    return {
        'top_sources': top_sources,
        'unusual_ports': unusual_ports,
        'large_transfers': large_transfers,
        'external_ips': external_ips
    }

In [47]:
# Time the analysis
# perf_counter() is a monotonic high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
basic_results = basic_traffic_analysis(traffic_df)
basic_time = time.perf_counter() - start_time

print(f"\n⏱️ Basic analysis completed in {basic_time:.2f} seconds")

🔎 BASIC ANALYSIS (Excel-style approach)

📈 TOP 10 SOURCES BY TOTAL BYTES:
                total_bytes  connections  avg_bytes
source_ip                                          
185.220.70.43     184993788         6677   27706.12
194.233.164.24    183768616         6696   27444.54
91.234.99.12      181807859         6669   27261.64
192.168.1.68        6348101         5496    1155.04
192.168.1.67        6156459         5380    1144.32
192.168.1.37        6151591         5438    1131.22
192.168.1.13        6148111         5345    1150.25
192.168.1.77        6137689         5474    1121.24
192.168.1.85        6113239         5401    1131.87
192.168.1.11        6110646         5416    1128.26

🔍 UNUSUAL PORTS (used ≤ 5 times):
Found 0 unusual ports:
Series([], Name: count, dtype: int64)

📊 LARGE TRANSFERS (> 8,310 bytes):
Found 24997 large transfers
               transfer_count  total_bytes  max_bytes  \
source_ip                                               
185.220.70.43            623

## 1.3 Machine Learning Analysis
**We will use an isolation forest to identify anomalies in this dataset.**

**What is an isolation forest, and why would we use this type of algorithm?**
An isolation forest is an unsupervised ML algorithm that answers the question,
"Which data points are anomalous?" It is relatively fast and has a low memory
footprint, which makes it a good fit for large datasets.

Pros:
- Allows you to discover and examine complex behavior patterns
- Incorporates context into your analysis
- Depending on the features extracted, can be explainable.

Cons:
- The performance of the Isolation Forest algorithm is highly dependent on the selection of its parameters.  
- More complex algorithm-- caution should be used if you don't know how it works!
- Depending on the features, can be hard to explain!
- May produce false positives/false negatives

In [48]:
def ml_traffic_analysis(df):
    """Flag anomalous source IPs with an Isolation Forest.

    Builds one behavioural profile per source IP (byte statistics, number of
    distinct destinations and ports, time span between first and last record,
    and whether any port is on a fixed watch list), standardises the numeric
    profile and fits an Isolation Forest with a contamination of 0.1.

    Args:
        df: traffic DataFrame with the columns ``source_ip``,
            ``destination_ip``, ``destination_port``, ``bytes`` and
            ``timestamp`` (datetime).

    Returns:
        dict with ``features`` (per-IP profile including ``anomaly_label`` and
        ``anomaly_score``), ``anomalies`` (profiles labelled -1, most
        anomalous first), ``model`` (the fitted forest) and ``scaler``.
    """

    def _as_list(values):
        # Keep every port seen for the IP so it can be checked against the watch list.
        return list(values)

    def _span_hours(stamps):
        # session duration in hours
        return (stamps.max() - stamps.min()).total_seconds() / 3600

    print("🤖 MACHINE LEARNING ANALYSIS")
    print("=" * 50)

    # Feature engineering: one row per source IP
    print("\n🔧 Creating behavioral features...")

    per_source = df.groupby('source_ip')
    features = per_source.agg({
        'bytes': ['sum', 'mean', 'std', 'max', 'count'],
        'destination_ip': 'nunique',
        'destination_port': ['nunique', _as_list],
        'timestamp': _span_hours,
    })

    # Replace the two-level column index with flat names
    features.columns = [
        'bytes_sum', 'bytes_mean', 'bytes_std', 'bytes_max', 'connection_count',
        'unique_destinations', 'unique_ports', 'ports_list', 'session_duration_hours',
    ]

    # std is NaN for an IP with a single record
    features['bytes_std'] = features['bytes_std'].fillna(0)

    # Ratios derived from the raw aggregates
    connection_count = features['connection_count']
    features['avg_bytes_per_connection'] = features['bytes_sum'] / connection_count
    features['port_diversity'] = features['unique_ports'] / connection_count

    # Flag IPs that touched any port on the pre-determined watch list
    suspicious_ports = [1337, 4444, 8080, 9999, 31337, 6666]

    def _touches_watch_list(ports):
        for port in ports:
            if port in suspicious_ports:
                return True
        return False

    features['has_suspicious_ports'] = features['ports_list'].apply(_touches_watch_list).astype(int)

    print(f"✅ Created features for {len(features)} unique source IPs")
    print("\n📊 Feature summary:")
    summary_columns = ['bytes_sum', 'connection_count', 'unique_destinations',
                       'unique_ports', 'has_suspicious_ports']
    display(features[summary_columns].describe())

    # The raw port list is not numeric, so it is left out of the model input
    ml_features = features.drop(['ports_list'], axis=1)

    # Put every feature on the same scale
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(ml_features)

    # Fit the forest, then read labels (-1 = anomaly) and scores
    print("\n🔍 Applying Isolation Forest anomaly detection...")
    iso_forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
    features['anomaly_label'] = iso_forest.fit_predict(features_scaled)
    features['anomaly_score'] = iso_forest.decision_function(features_scaled)

    # Lowest scores first, so the most anomalous IPs lead the table
    flagged = features['anomaly_label'] == -1
    anomalies = features[flagged].sort_values('anomaly_score')

    print(f"\n🚨 ANOMALOUS SOURCE IPs DETECTED: {len(anomalies)}")
    print("\nTop 5 most anomalous IPs:")

    for rank, (ip, row) in enumerate(anomalies.head().iterrows(), start=1):
        suspicious_answer = 'Yes' if row['has_suspicious_ports'] else 'No'
        print(f"\n{rank}. {ip} (anomaly score: {row['anomaly_score']:.3f})")
        print(f"   📊 Total bytes: {row['bytes_sum']:,}")
        print(f"   🔗 Connections: {row['connection_count']}")
        print(f"   🎯 Unique destinations: {row['unique_destinations']}")
        print(f"   🔌 Unique ports: {row['unique_ports']}")
        print(f"   ⚠️  Has suspicious ports: {suspicious_answer}")
        print(f"   🕐 Session duration: {row['session_duration_hours']:.1f} hours")

        # Five most frequent destination ports for this IP in the raw data
        ip_rows = df[df['source_ip'] == ip]
        ports_used = ip_rows['destination_port'].value_counts().head(5)
        print(f"   🔌 Top ports: {dict(ports_used)}")

    return {
        'features': features,
        'anomalies': anomalies,
        'model': iso_forest,
        'scaler': scaler
    }

In [49]:
# Time the ML analysis
# perf_counter() is a monotonic high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
ml_results = ml_traffic_analysis(traffic_df)
ml_time = time.perf_counter() - start_time

print(f"\n⏱️ ML analysis completed in {ml_time:.2f} seconds")

🤖 MACHINE LEARNING ANALYSIS

🔧 Creating behavioral features...
✅ Created features for 93 unique source IPs

📊 Feature summary:


Unnamed: 0,bytes_sum,connection_count,unique_destinations,unique_ports,has_suspicious_ports
count,93.0,93.0,93.0,93.0,93.0
mean,11638450.0,5376.344086,244.978495,13.709677,1.0
std,31553050.0,249.659616,15.659404,1.598781,0.0
min,5589818.0,5118.0,160.0,5.0,1.0
25%,5819111.0,5290.0,247.0,14.0,1.0
50%,5910347.0,5337.0,248.0,14.0,1.0
75%,6026578.0,5389.0,249.0,14.0,1.0
max,184993800.0,6696.0,250.0,14.0,1.0



🔍 Applying Isolation Forest anomaly detection...

🚨 ANOMALOUS SOURCE IPs DETECTED: 10

Top 5 most anomalous IPs:

1. 194.233.164.24 (anomaly score: -0.282)
   📊 Total bytes: 183,768,616
   🔗 Connections: 6696
   🎯 Unique destinations: 160
   🔌 Unique ports: 5
   ⚠️  Has suspicious ports: Yes
   🕐 Session duration: 24.0 hours
   🔌 Top ports: {8080: np.int64(1380), 4444: np.int64(1374), 31337: np.int64(1339), 9999: np.int64(1310), 1337: np.int64(1293)}

2. 91.234.99.12 (anomaly score: -0.266)
   📊 Total bytes: 181,807,859
   🔗 Connections: 6669
   🎯 Unique destinations: 160
   🔌 Unique ports: 5
   ⚠️  Has suspicious ports: Yes
   🕐 Session duration: 24.0 hours
   🔌 Top ports: {1337: np.int64(1397), 8080: np.int64(1361), 31337: np.int64(1341), 9999: np.int64(1287), 4444: np.int64(1283)}

3. 185.220.70.43 (anomaly score: -0.262)
   📊 Total bytes: 184,993,788
   🔗 Connections: 6677
   🎯 Unique destinations: 160
   🔌 Unique ports: 5
   ⚠️  Has suspicious ports: Yes
   🕐 Session duration: 24

## 1.4 Comparison and Analysis

In [52]:
def compare_methods_module1(basic=None, ml=None, basic_seconds=None, ml_seconds=None):
    """Compare the effectiveness of basic vs ML methods.

    Prints which of the three injected external IPs each method found, a
    timing comparison, and guidance on when to use each approach.

    Args:
        basic: result dict of ``basic_traffic_analysis`` (needs the key
            ``external_ips``). Defaults to the notebook global
            ``basic_results``.
        ml: result dict of ``ml_traffic_analysis`` (needs the key
            ``anomalies``). Defaults to the notebook global ``ml_results``.
        basic_seconds: run time of the basic analysis in seconds. Defaults to
            the notebook global ``basic_time``.
        ml_seconds: run time of the ML analysis in seconds. Defaults to the
            notebook global ``ml_time``.

    Returns:
        None. All results are printed.
    """
    # Fall back to the notebook-level variables so the zero-argument call keeps working.
    basic = basic_results if basic is None else basic
    ml = ml_results if ml is None else ml
    basic_seconds = basic_time if basic_seconds is None else basic_seconds
    ml_seconds = ml_time if ml_seconds is None else ml_seconds

    print("📊 METHOD COMPARISON - MODULE 1")
    print("=" * 50)

    # The external IPs injected by the data generator
    actual_suspicious = ['185.220.70.43', '194.233.164.24', '91.234.99.12']

    # Basic method results
    basic_suspicious = basic['external_ips']['source_ip'].unique() if len(basic['external_ips']) > 0 else []

    # ML method results
    ml_suspicious = ml['anomalies'].index.tolist()

    print(f"🎯 ACTUAL SUSPICIOUS IPs: {actual_suspicious}")
    print(f"🔍 BASIC METHOD found: {list(basic_suspicious)}")
    print(f"🤖 ML METHOD found: {ml_suspicious[:5]}...")  # Show top 5

    # Calculate detection rates
    basic_detected = len(set(basic_suspicious) & set(actual_suspicious))
    ml_detected = len(set(ml_suspicious) & set(actual_suspicious))

    print(f"\n📈 DETECTION RESULTS:")
    print(f"   Basic method detected: {basic_detected}/{len(actual_suspicious)} suspicious IPs")
    print(f"   ML method detected: {ml_detected}/{len(actual_suspicious)} suspicious IPs")

    # Show timing comparison
    print(f"\n⏱️ TIMING COMPARISON:")
    print(f"   Basic method: {basic_seconds:.2f} seconds")
    print(f"   ML method: {ml_seconds:.2f} seconds")
    if basic_seconds <= 0 or ml_seconds <= 0:
        # A zero reading (coarse clock, tiny input) would divide by zero.
        print("   Speed difference: n/a (a measured time was zero)")
    elif ml_seconds >= basic_seconds:
        print(f"   Speed difference: {ml_seconds/basic_seconds:.1f}x slower")
    else:
        # Report the ratio the right way round when ML is the faster method.
        print(f"   Speed difference: {basic_seconds/ml_seconds:.1f}x faster")

    # When to use each method
    print(f"\n💡 RECOMMENDATIONS:")
    print("✅ Use BASIC METHOD when:")
    print("   • You need results very quickly")
    print("   • Looking for known bad indicators")
    print("   • Working with smaller dataset")
    print("   • Need explainable results for management")

    print("\n✅ Use ML METHOD when:")
    print("   • Unknown threat patterns")
    print("   • Large datasets")
    print("   • Building automated detection")
    print("   • Complex behavioral analysis needed")

# Relies on basic_results, ml_results, basic_time and ml_time from the cells above.
compare_methods_module1()

📊 METHOD COMPARISON - MODULE 1
🎯 ACTUAL SUSPICIOUS IPs: ['185.220.70.43', '194.233.164.24', '91.234.99.12']
🔍 BASIC METHOD found: ['185.220.70.43', '194.233.164.24', '91.234.99.12']
🤖 ML METHOD found: ['194.233.164.24', '91.234.99.12', '185.220.70.43', '192.168.1.68', '192.168.1.87']...

📈 DETECTION RESULTS:
   Basic method detected: 3/3 suspicious IPs
   ML method detected: 3/3 suspicious IPs

⏱️ TIMING COMPARISON:
   Basic method: 0.38 seconds
   ML method: 0.66 seconds
   Speed difference: 1.7x slower

💡 RECOMMENDATIONS:
✅ Use BASIC METHOD when:
   • You need results very quickly
   • Looking for known bad indicators
   • Working with smaller dataset
   • Need explainable results for management

✅ Use ML METHOD when:
   • Unknown threat patterns
   • Large datasets
   • Building automated detection
   • Complex behavioral analysis needed


# ---
# Module 2: The Incident Responder Under Pressure
# **Persona**: IR team member with 30 minutes to determine if an alert is real
# 
# **Scenario**: Authentication logs showing potential brute force attacks
# 
# **Question**: Pivot tables or behavioral modeling?

# 2.1 Generate Authentication Dataset

In [17]:
def generate_auth_logs(num_records=10000):
    """Build a synthetic SSH authentication log containing brute-force bursts.

    The log covers the six hours before the call and has three parts:
    ``int(num_records * 0.8)`` successful logins and
    ``int(num_records * 0.15)`` failed logins from internal users, plus one
    burst of 50-200 failed logins from each of three attacker IPs. The burst
    sizes are random and independent of ``num_records``.

    Note: a later cell in this notebook defines ``generate_auth_logs`` again;
    once that cell has run, it replaces this version.

    Args:
        num_records: base size used for the normal success/failure shares.

    Returns:
        pandas.DataFrame with the columns ``timestamp``, ``username``,
        ``source_ip``, ``event_type`` and ``service``, in generation order.
    """
    window_seconds = 21600  # six hours

    # Internal population
    normal_users = [f"user{n:03d}" for n in range(1, 201)]
    normal_sources = [f"192.168.{net}.{host}" for net in range(1, 5) for host in range(10, 50)]

    # Attackers and the account names they try
    attack_sources = ['203.0.113.15', '198.51.100.42', '192.0.2.123']
    attack_targets = ['admin', 'administrator', 'root', 'service', 'guest']
    attack_usernames = attack_targets + normal_users[:20]

    log_start = datetime.now() - timedelta(hours=6)
    records = []

    def add_event(when, user, source_ip, event_type):
        records.append({
            'timestamp': when,
            'username': user,
            'source_ip': source_ip,
            'event_type': event_type,
            'service': 'ssh'
        })

    # Normal traffic: successes first (80%), then ordinary failures (15%)
    for event_type, share in (('login_success', 0.8), ('login_failure', 0.15)):
        for _ in range(int(num_records * share)):
            when = log_start + timedelta(seconds=random.randint(0, window_seconds))
            user = random.choice(normal_users)
            source_ip = random.choice(normal_sources)
            add_event(when, user, source_ip, event_type)

    # Brute force: each attacker fires one burst starting 1-4 hours into the log
    for attacker_ip in attack_sources:
        burst_start = log_start + timedelta(hours=random.randint(1, 4))
        burst_size = random.randint(50, 200)

        for attempt in range(burst_size):
            when = burst_start + timedelta(seconds=attempt * random.randint(1, 5))
            user = random.choice(attack_usernames)
            add_event(when, user, attacker_ip, 'login_failure')

    return pd.DataFrame(records)

def basic_auth_analysis(df):
    """Threshold-based brute-force check on authentication events.

    Counts failed logins per source IP, prints the ten busiest IPs, and then
    prints details for every IP with at least 20 failures.

    Note: a later cell in this notebook defines ``basic_auth_analysis`` again;
    once that cell has run, it replaces this version.

    Args:
        df: DataFrame with the columns ``event_type``, ``source_ip``,
            ``username`` and ``timestamp`` (datetime).

    Returns:
        pandas.Series of failure counts indexed by source IP, limited to the
        IPs at or above the threshold, in descending order.
    """
    print("=== BASIC AUTH ANALYSIS ===")

    # Keep only the failed attempts and count them per source
    is_failure = df['event_type'] == 'login_failure'
    failed_logins = df[is_failure]
    failures_by_ip = failed_logins.groupby('source_ip').size()
    ip_failures = failures_by_ip.sort_values(ascending=False)

    print("\nTop IPs by failed login attempts:")
    print(ip_failures.head(10))

    # Anything at or above the fixed threshold counts as a candidate attacker
    brute_force_threshold = 20
    over_threshold = ip_failures >= brute_force_threshold
    potential_attackers = ip_failures[over_threshold]

    print(f"\nPotential brute force attackers (>={brute_force_threshold} failures):")
    for ip in potential_attackers.index:
        count = potential_attackers[ip]
        attempts = failed_logins[failed_logins['source_ip'] == ip]
        unique_users = attempts['username'].nunique()
        first_seen = attempts['timestamp'].min()
        last_seen = attempts['timestamp'].max()
        time_span = (last_seen - first_seen).total_seconds() / 60
        print(f"  {ip}: {count} failures, {unique_users} users, {time_span:.1f} minutes")

    return potential_attackers

def ml_auth_analysis(df):
    """ML-based authentication analysis using DBSCAN on 5-minute windows.

    Events are bucketed per source IP into 5-minute windows. Each window is
    described by the number of distinct usernames, the number of failures, the
    total attempts and the failure rate; the standardised features are
    clustered with DBSCAN and the noise points (label -1) are reported.

    The input frame is not modified; the window column is added to a copy.

    Note: a later cell in this notebook defines ``ml_auth_analysis`` again;
    once that cell has run, it replaces this version.

    Args:
        df: DataFrame with the columns ``timestamp`` (datetime),
            ``source_ip``, ``username`` and ``event_type``.

    Returns:
        pandas.DataFrame of the outlier windows, indexed by
        (``source_ip``, ``time_window``).
    """

    print("\n=== ML AUTH ANALYSIS ===")

    # Aggregate by IP and time window (5-minute windows). assign() returns a
    # new frame, so the caller's DataFrame does not gain helper columns.
    windowed = df.assign(time_window=df['timestamp'].dt.floor('5min'))

    features = windowed.groupby(['source_ip', 'time_window']).agg({
        'username': 'nunique',
        'event_type': lambda x: (x == 'login_failure').sum(),
        'timestamp': 'count'
    }).rename(columns={
        'username': 'unique_users',
        'event_type': 'failures',
        'timestamp': 'total_attempts'
    })

    # Calculate failure rate
    features['failure_rate'] = features['failures'] / features['total_attempts']
    features = features.fillna(0)

    # Apply clustering to find unusual patterns
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    # Use DBSCAN to find clusters
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    clusters = dbscan.fit_predict(features_scaled)

    # Analyze outliers (cluster -1 is DBSCAN's noise label)
    outliers = features[clusters == -1]

    print(f"\nOutlier authentication patterns detected: {len(outliers)}")

    # Group outliers by IP (first five IPs only)
    for ip in outliers.index.get_level_values('source_ip').unique()[:5]:
        ip_outliers = outliers[outliers.index.get_level_values('source_ip') == ip]
        print(f"\n{ip}:")
        print(f"  Outlier time windows: {len(ip_outliers)}")
        print(f"  Max failures in 5min: {ip_outliers['failures'].max()}")
        print(f"  Max unique users in 5min: {ip_outliers['unique_users'].max()}")
        print(f"  Max failure rate: {ip_outliers['failure_rate'].max():.2f}")

    return outliers

In [18]:
def generate_auth_logs(num_records=8000):
    """Build a synthetic multi-service authentication log with brute-force bursts.

    The log covers the eight hours before the call and has three parts:
    ``int(num_records * 0.70)`` successful logins (users and service
    accounts), ``int(num_records * 0.20)`` ordinary failed logins, plus one
    burst of 30-120 failed SSH logins from each of four attacker IPs. The
    burst sizes are random and independent of ``num_records``.

    Args:
        num_records: base size used for the normal success/failure shares.

    Returns:
        pandas.DataFrame sorted by ``timestamp`` (datetime, second
        resolution) with the columns ``timestamp``, ``username``,
        ``source_ip``, ``event_type`` and ``service``.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    window_seconds = 28800  # eight hours
    services = ['ssh', 'rdp', 'web']

    # Normal users and systems
    normal_users = [f"user{n:03d}" for n in range(1, 150)]
    service_accounts = ['backup_svc', 'monitoring_svc', 'db_service', 'web_service']
    normal_sources = [f"192.168.{net}.{host}" for net in range(1, 5) for host in range(10, 50)]

    # Attack sources and targets
    attack_sources = ['203.0.113.15', '198.51.100.42', '192.0.2.123', '185.199.108.153']
    common_targets = ['admin', 'administrator', 'root', 'guest', 'test', 'user']
    targeted_real_users = normal_users[:30]

    log_start = datetime.now() - timedelta(hours=8)
    records = []

    # Normal successful logins (70%): regular users and service accounts
    success_accounts = normal_users + service_accounts
    for _ in range(int(num_records * 0.70)):
        when = log_start + timedelta(seconds=random.randint(0, window_seconds))
        user = random.choice(success_accounts)
        source = random.choice(normal_sources)
        service = random.choice(services)
        records.append({
            'timestamp': when.strftime(stamp_format),
            'username': user,
            'source_ip': source,
            'event_type': 'login_success',
            'service': service
        })

    # Normal failed logins (20% - typos, expired passwords, etc.)
    for _ in range(int(num_records * 0.20)):
        when = log_start + timedelta(seconds=random.randint(0, window_seconds))
        user = random.choice(normal_users)
        source = random.choice(normal_sources)
        service = random.choice(services)
        records.append({
            'timestamp': when.strftime(stamp_format),
            'username': user,
            'source_ip': source,
            'event_type': 'login_failure',
            'service': service
        })

    # Brute force attacks: one burst per attacker, starting 1-6 hours into the log
    for attacker_ip in attack_sources:
        burst_start = log_start + timedelta(hours=random.randint(1, 6))
        burst_size = random.randint(30, 120)

        for attempt in range(burst_size):
            when = burst_start + timedelta(seconds=attempt * random.randint(1, 10))

            # 60% well-known account names, otherwise one of the first 30 real users
            use_common_name = random.random() < 0.6
            user = random.choice(common_targets if use_common_name else targeted_real_users)

            records.append({
                'timestamp': when.strftime(stamp_format),
                'username': user,
                'source_ip': attacker_ip,
                'event_type': 'login_failure',
                'service': 'ssh'
            })

    auth_log = pd.DataFrame(records)
    auth_log['timestamp'] = pd.to_datetime(auth_log['timestamp'])
    auth_log = auth_log.sort_values('timestamp')
    return auth_log.reset_index(drop=True)

In [None]:
# Build the Module 2 authentication dataset.
# The row count will not be exactly 4000: the generator emits 70% + 20% of the
# requested size as normal events and then adds randomly sized attack bursts.
auth_df = generate_auth_logs(4000)
print(f"📊 Generated {len(auth_df):,} authentication records")
print(f"📅 Time range: {auth_df['timestamp'].min()} to {auth_df['timestamp'].max()}")

# Save to CSV
# The file is written relative to the notebook's current working directory.
auth_df.to_csv('auth_logs.csv', index=False)
print("💾 Saved to auth_logs.csv")

# Show sample
print("\n🔍 Sample of the data:")
display(auth_df.head())

# Quick overview
print(f"\n📈 Event breakdown:")
print(auth_df['event_type'].value_counts())


In [None]:
# ## 2.2 Basic Analysis (Excel-Style)
# **⏱️ Time Limit: 5 minutes**

# %%
def basic_auth_analysis(df):
    """Basic authentication analysis using simple aggregations.

    Summarises failed logins per source IP, flags every IP with at least 20
    failures as a potential brute-force source, and reports the hour of day
    with the most failures.

    Args:
        df: DataFrame with the columns ``event_type``, ``source_ip``,
            ``username`` and ``timestamp`` (datetime). It is not modified.

    Returns:
        dict with ``ip_failures`` (per-IP failure summary sorted by count),
        ``potential_attackers`` (rows of that summary at or above the
        threshold), ``hourly_failures`` (failure count per hour of day) and
        ``failed_logins`` (the failed events with an added ``hour`` column).
    """

    print("🔎 BASIC AUTH ANALYSIS (Excel-style)")
    print("=" * 50)

    # Analysis 1: Failed login attempts by source IP.
    # Take an explicit copy: an 'hour' column is added further down, and
    # assigning onto a filtered slice triggers SettingWithCopyWarning.
    failed_logins = df[df['event_type'] == 'login_failure'].copy()

    print(f"\n📊 FAILED LOGIN SUMMARY:")
    print(f"Total failed logins: {len(failed_logins):,}")
    print(f"Unique source IPs: {failed_logins['source_ip'].nunique()}")
    print(f"Unique usernames targeted: {failed_logins['username'].nunique()}")

    # Top IPs by failed attempts
    ip_failures = failed_logins.groupby('source_ip').agg({
        'username': ['count', 'nunique'],
        'timestamp': ['min', 'max']
    })
    ip_failures.columns = ['total_failures', 'unique_users_targeted', 'first_attempt', 'last_attempt']
    ip_failures['attack_duration_minutes'] = (
        ip_failures['last_attempt'] - ip_failures['first_attempt']
    ).dt.total_seconds() / 60

    ip_failures = ip_failures.sort_values('total_failures', ascending=False)

    print(f"\n🎯 TOP 10 SOURCE IPs BY FAILED ATTEMPTS:")
    display(ip_failures.head(10))

    # Simple brute force detection (threshold-based)
    brute_force_threshold = 20
    potential_attackers = ip_failures[ip_failures['total_failures'] >= brute_force_threshold]

    print(f"\n🚨 POTENTIAL BRUTE FORCE ATTACKERS (≥{brute_force_threshold} failures):")
    if len(potential_attackers) > 0:
        for ip, row in potential_attackers.iterrows():
            print(f"\n📍 {ip}:")
            print(f"   💥 Total failures: {row['total_failures']}")
            print(f"   👤 Users targeted: {row['unique_users_targeted']}")
            print(f"   ⏱️  Attack duration: {row['attack_duration_minutes']:.1f} minutes")

            # Show most targeted usernames for this IP
            ip_targets = failed_logins[failed_logins['source_ip'] == ip]['username'].value_counts().head(5)
            print(f"   🎯 Top targets: {dict(ip_targets)}")
    else:
        print("No IPs exceed the brute force threshold")

    # Time-based analysis
    failed_logins['hour'] = failed_logins['timestamp'].dt.hour
    hourly_failures = failed_logins.groupby('hour').size()

    print(f"\n🕐 FAILED LOGINS BY HOUR:")
    if hourly_failures.empty:
        # idxmax() raises on an empty Series, i.e. when there are no failures at all.
        print("No failed logins recorded")
    else:
        peak_hour = hourly_failures.idxmax()
        print(f"Peak hour: {peak_hour}:00 with {hourly_failures[peak_hour]} failures")

    return {
        'ip_failures': ip_failures,
        'potential_attackers': potential_attackers,
        'hourly_failures': hourly_failures,
        'failed_logins': failed_logins
    }

# Time the basic analysis
# perf_counter() is a monotonic high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
basic_auth_results = basic_auth_analysis(auth_df)
basic_auth_time = time.perf_counter() - start_time

print(f"\n⏱️ Basic analysis completed in {basic_auth_time:.2f} seconds")

# %% [markdown]
# ## 2.3 Machine Learning Analysis

# %%
def ml_auth_analysis(df):
    """ML-based authentication analysis using behavioral modeling.

    Events are bucketed per source IP into 10-minute windows. Each window is
    described by attempt volume, failure rate and username diversity; the
    standardised window features are clustered with DBSCAN and the noise
    points (label -1) are treated as anomalous windows. Anomalous windows are
    then rolled up per IP and ranked by a weighted severity score.

    The input frame is not modified; the window column is added to a copy.

    Args:
        df: DataFrame with the columns ``timestamp`` (datetime),
            ``source_ip``, ``username``, ``event_type`` and ``service``.

    Returns:
        dict with ``window_features`` (per IP/window features plus the
        ``cluster`` label), ``ip_features`` (per-IP totals; returned for
        inspection, not used in the clustering), ``outliers`` (windows
        labelled -1), ``outlier_ips`` (per-IP roll-up sorted by
        ``severity_score``) and ``clusters`` (the raw label array).
    """

    print("🤖 ML AUTHENTICATION ANALYSIS")
    print("=" * 50)

    # Create time windows for behavioral analysis
    print("\n🔧 Creating behavioral features with time windows...")

    # Use 10-minute time windows. The length is defined once so that the
    # bucketing and the attempts-per-minute rate cannot drift apart.
    window_minutes = 10

    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # 'time_window' column as a side effect.
    windowed = df.assign(time_window=df['timestamp'].dt.floor(f'{window_minutes}min'))

    # Create features per IP per time window
    window_features = windowed.groupby(['source_ip', 'time_window']).agg({
        'username': ['nunique', 'count'],
        'event_type': [lambda x: (x == 'login_failure').sum(),
                      lambda x: (x == 'login_success').sum()],
        'service': 'nunique'
    })

    window_features.columns = ['unique_users', 'total_attempts', 'failures', 'successes', 'unique_services']

    # Calculate rates and ratios
    window_features['failure_rate'] = window_features['failures'] / window_features['total_attempts']
    window_features['attempts_per_minute'] = window_features['total_attempts'] / window_minutes
    window_features['user_diversity'] = window_features['unique_users'] / window_features['total_attempts']

    # Fill NaN values
    window_features = window_features.fillna(0)

    # Add IP-level behavioral features
    ip_features = df.groupby('source_ip').agg({
        'username': 'nunique',
        'timestamp': ['count', lambda x: (x.max() - x.min()).total_seconds() / 3600],
        'event_type': [lambda x: (x == 'login_failure').sum(),
                      lambda x: (x == 'login_success').sum()]
    })

    ip_features.columns = ['total_unique_users', 'total_attempts', 'session_duration_hours',
                          'total_failures', 'total_successes']
    ip_features['overall_failure_rate'] = ip_features['total_failures'] / ip_features['total_attempts']

    # Check if IP is external (simple heuristic: anything outside 192.168.x.x)
    ip_features['is_external'] = ~ip_features.index.str.startswith('192.168.')
    ip_features['is_external'] = ip_features['is_external'].astype(int)

    print(f"✅ Created features for {len(window_features)} time windows across {len(ip_features)} IPs")

    # Apply clustering to find unusual patterns in time windows
    print("\n🔍 Applying DBSCAN clustering on time window features...")

    # Select features for clustering
    cluster_features = window_features[['unique_users', 'total_attempts', 'failure_rate',
                                      'attempts_per_minute', 'user_diversity']].copy()

    # Standardize features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(cluster_features)

    # Apply DBSCAN
    dbscan = DBSCAN(eps=0.5, min_samples=3)
    clusters = dbscan.fit_predict(features_scaled)

    # Add cluster labels
    window_features['cluster'] = clusters

    # Analyze outliers (cluster -1 is DBSCAN's noise label)
    outliers = window_features[window_features['cluster'] == -1]

    print(f"\n🚨 ANOMALOUS TIME WINDOWS DETECTED: {len(outliers)}")

    # Group outliers by IP for analysis
    outlier_ips = outliers.groupby(level='source_ip').agg({
        'total_attempts': ['sum', 'max'],
        'failures': ['sum', 'max'],
        'failure_rate': 'mean',
        'attempts_per_minute': 'max',
        'unique_users': 'max'
    })

    outlier_ips.columns = ['total_attempts', 'max_attempts_per_window', 'total_failures',
                          'max_failures_per_window', 'avg_failure_rate',
                          'max_attempts_per_minute', 'max_users_per_window']

    # Sort by severity: a weighted mix of burst rate, failure rate and the
    # number of usernames tried within a single window
    outlier_ips['severity_score'] = (
        outlier_ips['max_attempts_per_minute'] * 0.4 +
        outlier_ips['avg_failure_rate'] * 0.3 +
        outlier_ips['max_users_per_window'] * 0.3
    )

    outlier_ips = outlier_ips.sort_values('severity_score', ascending=False)

    print(f"\n🎯 TOP SUSPICIOUS IPs BY ML ANALYSIS:")

    for i, (ip, row) in enumerate(outlier_ips.head().iterrows()):
        print(f"\n{i+1}. {ip} (severity score: {row['severity_score']:.2f})")
        print(f"   💥 Total attempts: {row['total_attempts']}")
        print(f"   📈 Max attempts/minute: {row['max_attempts_per_minute']:.1f}")
        print(f"   ❌ Average failure rate: {row['avg_failure_rate']:.2%}")
        print(f"   👥 Max users targeted in one window: {row['max_users_per_window']}")

        # Show timeline for this IP
        ip_data = df[df['source_ip'] == ip]
        print(f"   📅 Active period: {ip_data['timestamp'].min()} to {ip_data['timestamp'].max()}")

        # Show most common usernames
        top_users = ip_data['username'].value_counts().head(3)
        print(f"   🎯 Top targets: {dict(top_users)}")

    return {
        'window_features': window_features,
        'ip_features': ip_features,
        'outliers': outliers,
        'outlier_ips': outlier_ips,
        'clusters': clusters
    }

In [None]:
# Time the ML analysis
# perf_counter() is a monotonic high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
ml_auth_results = ml_auth_analysis(auth_df)
ml_auth_time = time.perf_counter() - start_time

print(f"\n⏱️ ML analysis completed in {ml_auth_time:.2f} seconds")