In [None]:
import sys
import os
from pathlib import Path

# Make the project's ``src`` directory importable for the local imports that follow.
# The relative path is resolved against the current working directory, so the entry
# that lands on sys.path depends on where the kernel was started.
src_path = str(Path('../src').resolve())
# Membership guard keeps the cell idempotent: re-running it must not stack
# duplicate entries onto sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

from data.preprocessing import ThreatIntelligencePreprocessor
import json
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("✅ Imports successful")
print(f"📂 Working directory: {os.getcwd()}")


In [None]:
# Build the preprocessor from a named settings mapping so the tunables sit in one place.
preprocessor_settings = dict(
    enable_multiprocessing=True,  # run work in parallel
    max_workers=4,                # worker process count
    chunk_size=50,                # documents handed over per batch
    memory_limit_mb=256,          # per-process memory cap, in MB
)
preprocessor = ThreatIntelligencePreprocessor(**preprocessor_settings)

print("✅ Preprocessor initialized")
print("🔧 Configuration:")
# Echo the values back from the instance (not from the mapping) so the output shows
# what the object actually holds. Attributes are read lazily, one line at a time.
for label, attr, suffix in (
    ("Max workers", "max_workers", ""),
    ("Chunk size", "chunk_size", ""),
    ("Memory limit", "memory_limit_mb", "MB"),
):
    print(f"   - {label}: {getattr(preprocessor, attr)}{suffix}")
print(f"   - IoC patterns: {len(preprocessor.ioc_patterns)}")


In [None]:
# Synthetic threat-intelligence advisory used as demo input for the preprocessor.
# The text embeds one example of each indicator kind: a CVE ID, an MD5 hash, a
# registry key, an IP:port pair, a bare IP, a domain, a SHA256 hash, an email
# address, a URL and a Bitcoin address. It also has labelled sections
# (Executive Summary, Technical Details, Indicators of Compromise, Mitigations,
# Attribution).
# The backslashes in the registry path are doubled because this is a regular
# (non-raw) string; the resulting text contains single backslashes.
sample_report = """
SECURITY ADVISORY: APT29 Campaign Analysis
Date: 2024-01-15
Severity: High

Executive Summary:
Our threat intelligence team has identified a sophisticated campaign attributed to APT29 (Cozy Bear)
targeting government organizations. The campaign exploits CVE-2024-1234 to gain initial access.

Technical Details:
The malware sample (MD5: d41d8cd98f00b204e9800998ecf8427e) establishes persistence through
registry modification at HKEY_LOCAL_MACHINE\\Software\\Microsoft\\Windows\\CurrentVersion\\Run.
Communication with C2 infrastructure occurs via encrypted channels to 203.0.113.45:443.

Indicators of Compromise:
- IP Address: 203.0.113.45
- Domain: malicious-c2-server.com  
- File Hash (SHA256): 5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91b46729d73a27fb57e9
- Email: phishing@attacker-domain.net
- URL: https://malicious-c2-server.com/backdoor.php
- Bitcoin: 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa

Mitigations:
1. Apply security patches for CVE-2024-1234
2. Block communication to identified C2 servers
3. Monitor for registry modifications
4. Implement email filtering for suspicious domains

Attribution:
This campaign shows TTPs consistent with APT29 operations.
"""

# Run the sample advisory through the preprocessor and print a summary of the result.
# NOTE(review): the lookups below assume `result` is a dict carrying 'quality_scores',
# 'passes_quality', 'iocs' and 'sections'. Whether those keys are still present when
# an 'error' key is returned is not visible from this cell — confirm against
# process_document.
print("🔍 Processing sample threat intelligence report...")
result = preprocessor.process_document(
    text=sample_report,
    source="Demo Source",
    file_path="/demo/apt29_analysis.txt"
)

# Longest section preview shown before truncating with an ellipsis.
PREVIEW_CHARS = 80

# "\n" (single backslash) emits a blank line before each header.
print("\n📊 PROCESSING RESULTS:")
print("=" * 50)
print(f"✅ Processing successful: {'error' not in result}")
print(f"🏆 Quality score: {result['quality_scores']['overall']:.3f}")
print(f"✔️  Passes quality: {result['passes_quality']}")

print("\n🔍 EXTRACTED IoCs:")
total_iocs = 0
for ioc_type, ioc_list in result['iocs'].items():
    # Skip indicator types for which nothing was extracted.
    if ioc_list:
        print(f"   {ioc_type}: {ioc_list}")
        total_iocs += len(ioc_list)
print(f"   Total IoCs: {total_iocs}")

print("\n📋 IDENTIFIED SECTIONS:")
for section_name, content in result['sections'].items():
    # Truncate long section bodies so each section stays on one short line.
    if len(content) > PREVIEW_CHARS:
        preview = content[:PREVIEW_CHARS] + "..."
    else:
        preview = content
    print(f"   {section_name}: {preview}")

print("\n📊 QUALITY BREAKDOWN:")
for metric, score in result['quality_scores'].items():
    print(f"   {metric}: {score:.3f}")
