# This notebook generates fake (but realistic) data for the BTV AI workshop modules.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import warnings
import time
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import classification_report


Module 1: The Overwhelmed SOC Analyst

In [None]:
def generate_network_traffic_data(num_records=10000, seed=None):
    """Generate synthetic network traffic records with hidden anomalies.

    About 90% of the rows are benign: a 192.168.1.x client talking to a
    10.0.x.x server on a common service port with a 64-1500 byte payload.
    The remaining rows are deliberately conspicuous and come in two flavours:

    * external: one of three fixed outside IPs hitting an internal server on
      an unusual port with a 5000-50000 byte payload (about 40% of them);
    * lateral movement: one 192.168.1.x host talking to a *different*
      192.168.1.x host on an unusual port with a 2000-10000 byte payload.

    Args:
        num_records: Total number of rows to return. Must not be negative.
        seed: Optional integer seed. When given, every column except
            ``timestamp`` and the final row order are reproducible; the
            timestamps still depend on the current clock because the window
            is "the last 24 hours". When ``None`` the module-level ``random``
            generator and pandas' default sampling state are used, exactly
            as before.

    Returns:
        pandas.DataFrame with columns ``timestamp``, ``source_ip``,
        ``destination_ip``, ``destination_port``, ``bytes`` and ``protocol``,
        shuffled, with a fresh 0..n-1 index and exactly ``num_records`` rows.

    Raises:
        ValueError: If ``num_records`` is negative.
    """
    num_records = int(num_records)
    if num_records < 0:
        raise ValueError("num_records must be non-negative")

    # Without a seed, keep drawing from the shared module-level generator so
    # a random.seed() call made earlier in the notebook is still honoured.
    rng = random if seed is None else random.Random(seed)

    columns = ['timestamp', 'source_ip', 'destination_ip',
               'destination_port', 'bytes', 'protocol']

    # Normal traffic patterns
    normal_sources = [f"192.168.1.{i}" for i in range(10, 100)]
    normal_destinations = [f"10.0.{i}.{j}" for i in range(1, 5) for j in range(10, 50)]
    common_ports = [80, 443, 22, 25, 53, 110, 143, 993, 995]

    # Suspicious traffic patterns - kept obvious so they can be found in Excel
    suspicious_ips = ['185.220.70.43', '194.233.164.24', '91.234.99.12']
    suspicious_ports = [1337, 4444, 8080, 9999, 31337]

    # Timestamps cover the last 24 hours at whole-second resolution.
    window_seconds = 24 * 60 * 60
    start_time = (datetime.now() - timedelta(seconds=window_seconds)).replace(microsecond=0)

    # Give the suspicious share whatever the 90% truncation left over, so the
    # two parts always add up to exactly num_records.
    normal_count = int(num_records * 0.90)
    suspicious_count = num_records - normal_count

    data = []

    for _ in range(normal_count):
        data.append({
            'timestamp': start_time + timedelta(seconds=rng.randint(0, window_seconds)),
            'source_ip': rng.choice(normal_sources),
            'destination_ip': rng.choice(normal_destinations),
            'destination_port': rng.choice(common_ports),
            'bytes': rng.randint(64, 1500),
            'protocol': 'TCP'
        })

    for _ in range(suspicious_count):
        timestamp = start_time + timedelta(seconds=rng.randint(0, window_seconds))

        if rng.random() < 0.4:  # External suspicious IP
            source = rng.choice(suspicious_ips)
            destination = rng.choice(normal_destinations)
            bytes_sent = rng.randint(5000, 50000)  # Much larger transfers
        else:  # Internal lateral movement
            # sample() picks two distinct hosts, so a machine never appears
            # to be attacking itself.
            source, destination = rng.sample(normal_sources, 2)
            bytes_sent = rng.randint(2000, 10000)

        data.append({
            'timestamp': timestamp,
            'source_ip': source,
            'destination_ip': destination,
            'destination_port': rng.choice(suspicious_ports),
            'bytes': bytes_sent,
            'protocol': 'TCP'
        })

    # Naming the columns keeps the schema intact even when there are no rows.
    df = pd.DataFrame(data, columns=columns)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # Shuffle so the anomalies are not grouped at the end of the file.
    return df.sample(frac=1, random_state=seed).reset_index(drop=True)

In [None]:
# Build the Module 1 dataset: 500,000 shuffled traffic records
traffic_df = generate_network_traffic_data(num_records=500_000)
print(
    f"📊 Generated {len(traffic_df):,} network traffic records",
    f"📅 Time range: {traffic_df['timestamp'].min()} to {traffic_df['timestamp'].max()}",
    sep="\n",
)

# Write the CSV used for the Excel analysis (row index left out)
traffic_df.to_csv('network_traffic.csv', index=False)
print("💾 Saved to network_traffic.csv")

# Preview the first five rows
print("\n🔍 Sample of the data:")
display(traffic_df.head(n=5))

📊 Generated 500,000 network traffic records
📅 Time range: 2025-07-20 12:03:27 to 2025-07-21 12:03:27
💾 Saved to network_traffic.csv

🔍 Sample of the data:


Unnamed: 0,timestamp,source_ip,destination_ip,destination_port,bytes,protocol
0,2025-07-20 19:37:14,192.168.1.83,10.0.1.41,25,295,TCP
1,2025-07-20 12:28:21,192.168.1.93,10.0.2.38,25,136,TCP
2,2025-07-20 12:28:14,192.168.1.56,192.168.1.45,9999,7217,TCP
3,2025-07-20 19:57:58,192.168.1.58,10.0.2.42,110,575,TCP
4,2025-07-21 08:20:15,192.168.1.59,10.0.4.41,993,1164,TCP


Module 2: The Incident Responder Under Pressure

In [None]:
def generate_auth_logs(num_records=8000, seed=None):
    """Generate synthetic authentication logs containing brute-force attacks.

    The rows are split three ways:

    * about 70% successful logins by regular users and service accounts;
    * about 20% ordinary failed logins by regular users (typos, expired
      passwords, ...);
    * the remainder (about 10%) failed SSH logins from four fixed attacker
      IPs, shared as evenly as possible between them. Each attacker starts
      1-6 hours into the window and targets a mix of common account names
      and the first 30 real usernames.

    Args:
        num_records: Total number of rows to return. Must not be negative.
        seed: Optional seed for the random draws. When given, every column
            except ``timestamp`` is reproducible; timestamps still depend on
            the current clock because the window is "the last 8 hours". When
            ``None`` the module-level ``random`` generator is used, exactly
            as before.

    Returns:
        pandas.DataFrame with columns ``timestamp``, ``username``,
        ``source_ip``, ``event_type`` and ``service``, sorted by timestamp,
        with a fresh 0..n-1 index and exactly ``num_records`` rows.

    Raises:
        ValueError: If ``num_records`` is negative.
    """
    num_records = int(num_records)
    if num_records < 0:
        raise ValueError("num_records must be non-negative")

    # Without a seed, keep drawing from the shared module-level generator so
    # a random.seed() call made earlier in the notebook is still honoured.
    rng = random if seed is None else random.Random(seed)

    columns = ['timestamp', 'username', 'source_ip', 'event_type', 'service']

    # Normal users and systems
    normal_users = [f"user{i:03d}" for i in range(1, 150)]
    service_accounts = ['backup_svc', 'monitoring_svc', 'db_service', 'web_service']
    normal_sources = [f"192.168.{i}.{j}" for i in range(1, 5) for j in range(10, 50)]
    login_accounts = normal_users + service_accounts  # built once, not per row
    services = ['ssh', 'rdp', 'web']

    # Attack sources and targets
    attack_sources = ['203.0.113.15', '198.51.100.42', '192.0.2.123', '185.199.108.153']
    common_targets = ['admin', 'administrator', 'root', 'guest', 'test', 'user']

    # Timestamps cover the last 8 hours at whole-second resolution.
    window_seconds = 8 * 60 * 60
    start_time = (datetime.now() - timedelta(seconds=window_seconds)).replace(microsecond=0)

    # The attack share absorbs the truncation of the other two, so the three
    # parts always add up to exactly num_records.
    success_count = int(num_records * 0.70)
    failure_count = int(num_records * 0.20)
    attack_count = num_records - success_count - failure_count

    data = []

    # Normal successful logins
    for _ in range(success_count):
        data.append({
            'timestamp': start_time + timedelta(seconds=rng.randint(0, window_seconds)),
            'username': rng.choice(login_accounts),
            'source_ip': rng.choice(normal_sources),
            'event_type': 'login_success',
            'service': rng.choice(services)
        })

    # Normal failed logins (typos, expired passwords, etc.)
    for _ in range(failure_count):
        data.append({
            'timestamp': start_time + timedelta(seconds=rng.randint(0, window_seconds)),
            'username': rng.choice(normal_users),
            'source_ip': rng.choice(normal_sources),
            'event_type': 'login_failure',
            'service': rng.choice(services)
        })

    # Brute force attacks: split the remaining rows across the attackers,
    # handing any leftover rows to the first few.
    per_attacker, leftover = divmod(attack_count, len(attack_sources))
    for position, attacker_ip in enumerate(attack_sources):
        attempts = per_attacker + (1 if position < leftover else 0)
        start_hour = rng.randint(1, 6)
        attack_start = start_time + timedelta(hours=start_hour)
        seconds_left = window_seconds - start_hour * 3600

        # Rapid-fire attempts a few seconds apart. The offset is capped so a
        # very large attack cannot produce timestamps past the window's end.
        for i in range(attempts):
            offset = min(i * rng.randint(1, 10), seconds_left)

            # Mix of common targets and real usernames
            if rng.random() < 0.6:
                user = rng.choice(common_targets)
            else:
                user = rng.choice(normal_users[:30])  # Target real users too

            data.append({
                'timestamp': attack_start + timedelta(seconds=offset),
                'username': user,
                'source_ip': attacker_ip,
                'event_type': 'login_failure',
                'service': 'ssh'
            })

    # Naming the columns keeps the schema intact even when there are no rows.
    df = pd.DataFrame(data, columns=columns)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df.sort_values('timestamp').reset_index(drop=True)

In [None]:
# Build the Module 2 dataset of authentication events
auth_df = generate_auth_logs(num_records=4000)
print(
    f"📊 Generated {len(auth_df):,} authentication records",
    f"📅 Time range: {auth_df['timestamp'].min()} to {auth_df['timestamp'].max()}",
    sep="\n",
)

# Write the CSV (row index left out)
auth_df.to_csv('auth_logs.csv', index=False)
print("💾 Saved to auth_logs.csv")

# Preview the first five rows
print("\n🔍 Sample of the data:")
display(auth_df.head(n=5))

# Count of rows per event type
print(f"\n📈 Event breakdown:")
print(auth_df['event_type'].value_counts())


📊 Generated 3,781 authentication records
📅 Time range: 2025-07-21 04:08:01 to 2025-07-21 12:07:53
💾 Saved to auth_logs.csv

🔍 Sample of the data:


Unnamed: 0,timestamp,username,source_ip,event_type,service
0,2025-07-21 04:08:01,user138,192.168.1.42,login_success,web
1,2025-07-21 04:08:09,user135,192.168.2.17,login_success,ssh
2,2025-07-21 04:08:16,user054,192.168.2.44,login_failure,ssh
3,2025-07-21 04:08:20,user121,192.168.4.35,login_success,web
4,2025-07-21 04:08:31,user101,192.168.4.40,login_success,rdp



📈 Event breakdown:
event_type
login_success    2800
login_failure     981
Name: count, dtype: int64
