Environment Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from pathlib import Path
import json
import re
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

In [2]:
PROJECT_ROOT = Path(r"C:\Computer Science\AIMLDL\log-anomaly-detection")
DATA_PATH = PROJECT_ROOT / "dataset" / "structured_data"
OUTPUT_PATH = PROJECT_ROOT / "dataset" / "labeled_data"
OUTPUT_PATH.mkdir(exist_ok=True)

In [3]:
LOG_SOURCES = [
    'Android_2k', 'Apache_2k', 'BGL_2k', 'Hadoop_2k', 'HDFS_2k', 
    'HealthApp_2k', 'HPC_2k', 'Linux_2k', 'Mac_2k', 'OpenSSH_2k',
    'OpenStack_2k', 'Proxifier_2k', 'Spark_2k', 'Thunderbird_2k',
    'Windows_2k', 'Zookeeper_2k'
]

In [4]:
LABELS = {
    0: "normal",
    1: "security_anomaly", 
    2: "system_failure",
    3: "performance_issue",
    4: "network_anomaly", 
    5: "config_error",
    6: "hardware_issue",
    7: "unknown_anomaly"
}

In [5]:
ANOMALY_PATTERNS = {
    'security': ['authentication failure', 'invalid user', 'break-in attempt', 
                'failed password', 'unauthorized', 'access denied'],
    'system': ['error', 'critical', 'fatal', 'exception', 'crash', 'abort'],
    'network': ['timeout', 'connection refused', 'host unreachable'],
    'performance': ['slow', 'overload', 'resource exhausted', 'quota exceeded'],
    'hardware': ['hardware error', 'disk error', 'i/o error', 'device error']
}

Load Data

In [6]:
def load_data():
    datasets = {}
    for source in LOG_SOURCES:
        try:
            file_path = DATA_PATH / f"{source}.log_structured.csv"
            df = pd.read_csv(file_path)
            datasets[source] = df
            print(f"✓ Loaded {source}: {len(df):,} logs")
        except Exception as e:
            print(f"✗ Failed {source}: {e}")
    
    total = sum(len(df) for df in datasets.values())
    print(f"\nLoaded {len(datasets)} sources, {total:,} total logs")
    return datasets


Analyze Templates

In [7]:
def analyze_templates(datasets):
    stats = {}
    for source, df in datasets.items():
        if 'EventTemplate' not in df.columns:
            continue
            
        templates = df['EventTemplate'].value_counts()
        stats[source] = {
            'total_logs': len(df),
            'unique_templates': len(templates),
            'efficiency': len(df) / len(templates),
            'top_templates': templates.head(5)
        }
    
    return stats

Find Anomalies

In [8]:
def find_anomalies(datasets):
    results = {}
    
    for source, df in datasets.items():
        if 'Content' not in df.columns:
            continue
            
        content_lower = df['Content'].str.lower()
        anomaly_counts = {}
        
        for category, keywords in ANOMALY_PATTERNS.items():
            pattern = '|'.join(re.escape(kw) for kw in keywords)
            matches = content_lower.str.contains(pattern, na=False, regex=True)
            anomaly_counts[category] = matches.sum()
        
        total_anomalies = sum(anomaly_counts.values())
        results[source] = {
            'total_logs': len(df),
            'anomalies': total_anomalies,
            'anomaly_rate': (total_anomalies / len(df)) * 100,
            'categories': anomaly_counts
        }
    
    return results

Rank Sources

In [9]:
def rank_sources(template_stats, anomaly_stats):
    rankings = []
    
    for source in template_stats.keys():
        if source not in anomaly_stats:
            continue
            
        t_stats = template_stats[source] 
        a_stats = anomaly_stats[source]
        
        anomaly_score = a_stats['anomaly_rate']
        template_score = 100 - (t_stats['unique_templates'] / 10)  # Fewer templates = better
        efficiency_score = t_stats['efficiency'] / 10
        
        priority = (anomaly_score * 0.4 + template_score * 0.3 + efficiency_score * 0.3)
        
        rankings.append({
            'source': source,
            'priority_score': priority,
            'anomaly_rate': a_stats['anomaly_rate'],
            'templates': t_stats['unique_templates'],
            'efficiency': t_stats['efficiency']
        })
    
    return sorted(rankings, key=lambda x: x['priority_score'], reverse=True)

Prepare Templates

In [10]:
def prep_templates(df, source_name):
    """Fixed version of prep_templates"""
    if 'EventTemplate' not in df.columns or 'Content' not in df.columns:
        print(f"Missing required columns in {source_name}")
        print(f"Available columns: {list(df.columns)}")
        return []
    
    templates = df['EventTemplate'].value_counts()
    labeling_data = []
    
    print(f"Processing {len(templates)} unique templates...")
    
    for template, count in templates.items():
        # Get sample content - make sure we get actual strings
        matching_rows = df[df['EventTemplate'] == template]
        samples = matching_rows['Content'].head(3).tolist()
        
        # Debug: check what we got
        if len(samples) == 0:
            samples = ['No samples found']
        
        # Verify samples are strings
        samples = [str(s) if s is not None else 'Empty content' for s in samples]
        
        # Auto-suggest label
        suggested_label = 0
        confidence = "low"
        
        # Check all sample content together
        content_text = ' '.join(samples).lower()
        
        # Check for anomaly patterns
        for category, keywords in ANOMALY_PATTERNS.items():
            if any(kw in content_text for kw in keywords):
                if category == 'security':
                    suggested_label = 1
                elif category == 'system': 
                    suggested_label = 2
                elif category == 'performance':
                    suggested_label = 3
                elif category == 'network':
                    suggested_label = 4
                elif category == 'hardware':
                    suggested_label = 6
                confidence = "high"
                break
        
        labeling_data.append({
            'template': template,
            'count': count,
            'percentage': (count / len(df)) * 100,
            'samples': samples,  # This should now be proper strings
            'suggested': suggested_label,
            'confidence': confidence,
            'label': None,
            'notes': ''
        })
    
    return sorted(labeling_data, key=lambda x: x['count'], reverse=True)

Auto-label High Confidence

In [11]:
def auto_label(labeling_data):
    auto_count = 0
    for item in labeling_data:
        if item['confidence'] == 'high' and item['label'] is None:
            item['label'] = item['suggested']
            item['notes'] = 'Auto-labeled (high confidence)'
            auto_count += 1
    
    print(f"Auto-labeled {auto_count} templates")
    return auto_count

Interactive Labeling

In [12]:
def label_batch(labeling_data, source_name, start=0, count=5):
    """Fixed version with proper source name handling"""
    print("\n" + "="*60)
    print("TEMPLATE LABELING")
    print("Labels:", ", ".join(f"{k}:{v}" for k, v in LABELS.items()))
    print("Commands: 0-7 (label), 'skip', 'quit', 'save'")
    print("="*60)
    
    end = min(start + count, len(labeling_data))
    labeled = 0
    
    for i in range(start, end):
        item = labeling_data[i]
        
        print(f"\n[{i+1}/{len(labeling_data)}] Template")
        print(f"Frequency: {item['count']:,} logs ({item['percentage']:.1f}%)")
        print(f"Template: {item['template']}")
        print("Sample logs:")
        
        samples = item.get('samples', [])
        if samples:
            for j, sample in enumerate(samples[:3], 1):
                if isinstance(sample, str) and len(sample.strip()) > 0:
                    display_sample = sample[:150] + "..." if len(sample) > 150 else sample
                    print(f"  {j}. {display_sample}")
                else:
                    print(f"  {j}. [No content available]")
        else:
            print("  [No samples available]")
            
        print(f"Suggested: {item['suggested']} ({LABELS[item['suggested']]}) - {item['confidence']}")
        
        while True:
            response = input(f"\nLabel (suggested {item['suggested']}): ").strip().lower()
            
            if response == 'quit':
                return i, labeled
            elif response == 'skip':
                break
            elif response == 'save':
                save_progress(labeling_data, source_name)  # Fixed
                continue
            elif response.isdigit() and 0 <= int(response) <= 7:
                item['label'] = int(response) 
                notes = input("Notes (optional): ").strip()
                if notes:
                    item['notes'] = notes
                labeled += 1
                break
            else:
                print("Enter 0-7, 'skip', 'save', or 'quit'")
    
    print(f"\nLabeled {labeled} templates in this batch")
    return end, labeled

Quick Label Multiple

In [13]:
def quick_label(labeling_data, indices, labels):
    for idx, label in zip(indices, labels):
        if 0 <= idx < len(labeling_data) and 0 <= label <= 7:
            labeling_data[idx]['label'] = label
            labeling_data[idx]['notes'] = 'Quick label'
            print(f"Labeled template {idx}: {LABELS[label]}")

Save Progress

In [14]:
def save_progress(labeling_data, source_name):
    data_to_save = []
    for item in labeling_data:
        item_copy = item.copy()
        item_copy['samples'] = json.dumps(item['samples'])
        data_to_save.append(item_copy)
    
    df = pd.DataFrame(data_to_save)
    df.to_csv(OUTPUT_PATH / f"{source_name}_progress.csv", index=False)
    
    labeled = sum(1 for item in labeling_data if item['label'] is not None)
    print(f"Saved progress: {labeled}/{len(labeling_data)} templates")

Load Progress

In [15]:
def load_progress(source_name):
    file_path = OUTPUT_PATH / f"{source_name}_progress.csv"
    if file_path.exists():
        df = pd.read_csv(file_path)
        data = df.to_dict('records')
        
        for item in data:
            if 'samples' in item and isinstance(item['samples'], str):
                try:
                    item['samples'] = json.loads(item['samples'])
                except:
                    item['samples'] = ['Error loading samples']
        
        labeled = sum(1 for item in data if pd.notna(item.get('label')))
        print(f"Loaded progress: {labeled}/{len(data)} templates")
        return data
    return None

Show Progress

In [16]:
def show_progress(labeling_data):
    total = len(labeling_data)
    labeled = sum(1 for item in labeling_data if item['label'] is not None)
    
    print(f"\nProgress: {labeled}/{total} templates ({labeled/total*100:.1f}%)")
    
    if labeled > 0:
        dist = defaultdict(int)
        for item in labeling_data:
            if item['label'] is not None and not pd.isna(item['label']):
                dist[int(item['label'])] += item['count']
        
        print("Label distribution:")
        for label in sorted(dist.keys()):
            count = dist[label]
            if label in LABELS:
                print(f"  {label} ({LABELS[label]}): {count:,} logs")
            else:
                print(f"  {label} (unknown): {count:,} logs")

Export Final Dataset

In [17]:
def export_final(df, labeling_data, source_name):
    template_labels = {item['template']: item['label'] 
                      for item in labeling_data if item['label'] is not None}
    
    result_df = df.copy()
    result_df['AnomalyLabel'] = result_df['EventTemplate'].map(template_labels).fillna(-1).astype(int)
    result_df['AnomalyLabelName'] = result_df['AnomalyLabel'].map(lambda x: LABELS.get(x, 'unlabeled'))
    
    output_file = OUTPUT_PATH / f"{source_name}_labeled.csv"
    result_df.to_csv(output_file, index=False)
    
    total = len(result_df)
    labeled_count = (result_df['AnomalyLabel'] >= 0).sum()
    anomaly_count = (result_df['AnomalyLabel'] > 0).sum()
    
    print(f"\nFinal dataset: {output_file}")
    print(f"Total logs: {total:,}")
    print(f"Labeled: {labeled_count:,} ({labeled_count/total*100:.1f}%)")
    print(f"Anomalies: {anomaly_count:,} ({anomaly_count/labeled_count*100:.1f}% of labeled)")
    
    return result_df

Main Workflow

In [18]:
def run_workflow():
    print("LOG ANOMALY LABELING WORKFLOW")
    print("="*50)
    
    print("1. Loading datasets...")
    datasets = load_data()
    
    print("\n2. Analyzing templates...")
    template_stats = analyze_templates(datasets)
    
    print("\n3. Finding anomalies...")
    anomaly_stats = find_anomalies(datasets)
    
    print("\n4. Ranking sources...")
    rankings = rank_sources(template_stats, anomaly_stats)
    
    print("\nTop 3 sources for labeling:")
    for i, rank in enumerate(rankings[:3], 1):
        print(f"{i}. {rank['source']} (score: {rank['priority_score']:.1f})")
        print(f"   Anomaly rate: {rank['anomaly_rate']:.1f}%")
        print(f"   Templates: {rank['templates']}")
    
    best_source = rankings[0]['source']
    print(f"\n5. Preparing {best_source} for labeling...")
    
    labeling_data = load_progress(best_source)
    if labeling_data is None:
        labeling_data = prep_templates(datasets[best_source], best_source)
        auto_label(labeling_data)
    
    show_progress(labeling_data)
    
    print(f"\n6. Ready for labeling!")
    print("Next steps:")
    print(f"- label_batch(labeling_data) - Interactive labeling")
    print(f"- quick_label(labeling_data, [0,1,2], [0,1,2]) - Quick labeling")
    print(f"- show_progress(labeling_data) - Check progress")
    print(f"- export_final(datasets['{best_source}'], labeling_data, '{best_source}') - Export final dataset")
    
    return datasets, labeling_data, best_source

In [19]:
datasets = None
labeling_data = None 
best_source = None

Quality Check

In [20]:
def check_quality(labeling_data):
    labeled = [item for item in labeling_data if item['label'] is not None]
    
    if not labeled:
        print("No labeled data to check")
        return {}
    
    total_templates = len(labeling_data)
    labeled_count = len(labeled)
    total_logs = sum(item['count'] for item in labeling_data)
    labeled_logs = sum(item['count'] for item in labeled)
    
    template_coverage = (labeled_count / total_templates) * 100
    log_coverage = (labeled_logs / total_logs) * 100
    
    print(f"Coverage Report:")
    print(f"Templates: {labeled_count}/{total_templates} ({template_coverage:.1f}%)")
    print(f"Logs: {labeled_logs:,}/{total_logs:,} ({log_coverage:.1f}%)")
    
    label_counts = defaultdict(int)
    log_counts = defaultdict(int)
    
    for item in labeled:
        label = item['label']
        label_counts[label] += 1
        log_counts[label] += item['count']
    
    print(f"\nLabel Distribution:")
    for label in sorted(label_counts.keys()):
        templates = label_counts[label]
        logs = log_counts[label]
        print(f"{label} ({LABELS[label]}): {templates} templates, {logs:,} logs")
    
    return {
        'template_coverage': template_coverage,
        'log_coverage': log_coverage,
        'label_counts': dict(label_counts),
        'log_counts': dict(log_counts)
    }

Find Issues

In [21]:
def find_issues(labeling_data):
    issues = []
    
    for i, item in enumerate(labeling_data):
        if item['label'] is None:
            continue
        
        template = item['template'].lower()
        content = ' '.join(item['samples']).lower()
        label = item['label']
        
        if label == 1:
            security_words = ['auth', 'login', 'password', 'user', 'invalid', 'fail']
            if not any(word in content for word in security_words):
                issues.append(f"Template {i}: Security label without security keywords")
        
        elif label == 0:
            error_words = ['error', 'fail', 'critical', 'exception']
            if any(word in content for word in error_words):
                issues.append(f"Template {i}: Normal label with error keywords")
        
        elif label > 0 and item['percentage'] > 10:
            issues.append(f"Template {i}: High-frequency anomaly ({item['percentage']:.1f}%)")
    
    if issues:
        print(f"Found {len(issues)} potential issues:")
        for issue in issues[:10]:
            print(f"  {issue}")
    else:
        print("No issues found - labeling looks good!")
    
    return issues

Group Similar

In [22]:
def group_similar(labeling_data):
    unlabeled = [item for item in labeling_data if item['label'] is None]
    
    groups = {
        'connection': [],
        'auth': [],
        'error': [],
        'timeout': [],
        'success': []
    }
    
    for item in unlabeled:
        text = (item['template'] + ' ' + ' '.join(item['samples'])).lower()
        
        if 'connect' in text or 'connection' in text:
            groups['connection'].append(item)
        elif 'auth' in text or 'login' in text or 'user' in text:
            groups['auth'].append(item)
        elif 'error' in text or 'fail' in text or 'critical' in text:
            groups['error'].append(item)
        elif 'timeout' in text:
            groups['timeout'].append(item)
        elif 'success' in text or 'ok' in text or 'complete' in text:
            groups['success'].append(item)
    
    print(f"Similar template groups:")
    for name, items in groups.items():
        if items:
            total_logs = sum(item['count'] for item in items)
            print(f"{name}: {len(items)} templates, {total_logs:,} logs")
    
    return groups

Smart Suggest

In [23]:
def smart_suggest(labeling_data):
    labeled = [item for item in labeling_data if item['label'] is not None]
    
    label_words = defaultdict(set)
    for item in labeled:
        text = item['template'].lower() + ' ' + ' '.join(item['samples']).lower()
        words = set(text.split())
        label_words[item['label']].update(words)
    
    updated = 0
    for item in labeling_data:
        if item['label'] is not None:
            continue
        
        text = item['template'].lower() + ' ' + ' '.join(item['samples']).lower()
        words = set(text.split())
        
        best_label = 0
        best_score = 0
        
        for label_id, pattern_words in label_words.items():
            overlap = len(words.intersection(pattern_words))
            score = overlap / len(pattern_words) if pattern_words else 0
            
            if score > best_score and score > 0.2:
                best_score = score
                best_label = label_id
        
        if best_label != item['suggested']:
            item['suggested'] = best_label
            item['confidence'] = "medium" if best_score > 0.4 else "low"
            updated += 1
    
    print(f"Updated {updated} suggestions based on learned patterns")

Save Report

In [24]:
def save_report(labeling_data, source_name):
    report_file = OUTPUT_PATH / f"{source_name}_report.txt"
    
    with open(report_file, 'w') as f:
        f.write(f"LABELING REPORT: {source_name}\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
        f.write("="*50 + "\n\n")
        
        total = len(labeling_data)
        labeled = sum(1 for item in labeling_data if item['label'] is not None)
        total_logs = sum(item['count'] for item in labeling_data)
        labeled_logs = sum(item['count'] for item in labeling_data if item['label'] is not None)
        
        f.write(f"Templates: {labeled}/{total} ({labeled/total*100:.1f}%)\n")
        f.write(f"Logs: {labeled_logs:,}/{total_logs:,} ({labeled_logs/total_logs*100:.1f}%)\n\n")
        
        label_dist = defaultdict(int)
        for item in labeling_data:
            if item['label'] is not None:
                label_dist[item['label']] += item['count']
        
        f.write("Label Distribution:\n")
        for label in sorted(label_dist.keys()):
            count = label_dist[label]
            f.write(f"{label} ({LABELS[label]}): {count:,} logs\n")
    
    print(f"Report saved: {report_file}")

Find Next Source

In [25]:
def find_next_source(datasets, labeling_data, current_source):
    patterns = defaultdict(set)
    for item in labeling_data:
        if item['label'] is not None:
            words = item['template'].lower().split()
            patterns[item['label']].update(words)
    
    remaining = [src for src in LOG_SOURCES if src != current_source]
    scores = []
    
    for source in remaining:
        try:
            df = datasets[source]
            if 'EventTemplate' not in df.columns:
                continue
            
            templates = df['EventTemplate'].head(100)
            matches = 0
            
            for template in templates:
                template_words = set(template.lower().split())
                for pattern_words in patterns.values():
                    if len(template_words.intersection(pattern_words)) > 0:
                        matches += 1
                        break
            
            score = matches / len(templates) if templates is not None and len(templates) > 0 else 0
            scores.append((source, score, len(df)))
            
        except Exception:
            continue
    
    scores.sort(key=lambda x: x[1], reverse=True)
    
    print(f"Next source recommendations:")
    for i, (source, score, logs) in enumerate(scores[:3], 1):
        print(f"{i}. {source} (similarity: {score:.1f}, logs: {logs:,})")
    
    return scores[0][0] if scores else None

Export for ML

In [26]:
def export_ml_data(final_dataset, source_name, test_size=0.2):
    labeled = final_dataset[final_dataset['AnomalyLabel'] >= 0].copy()
    
    if len(labeled) == 0:
        print("No labeled data to export")
        return
    
    labeled['ContentLength'] = labeled['Content'].str.len()
    labeled['HasError'] = labeled['Content'].str.lower().str.contains('error|fail|critical')
    labeled['HasAuth'] = labeled['Content'].str.lower().str.contains('auth|login|user')
    
    X = labeled[['Content', 'EventTemplate', 'ContentLength', 'HasError', 'HasAuth']]
    y = labeled['AnomalyLabel']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42
    )
    
    train_file = OUTPUT_PATH / f"{source_name}_train.csv"
    test_file = OUTPUT_PATH / f"{source_name}_test.csv"
    
    pd.concat([X_train, y_train], axis=1).to_csv(train_file, index=False)
    pd.concat([X_test, y_test], axis=1).to_csv(test_file, index=False)
    
    print(f"ML data exported:")
    print(f"Train: {len(X_train):,} samples -> {train_file}")
    print(f"Test: {len(X_test):,} samples -> {test_file}")
    
    print(f"Training labels:")
    for label, count in y_train.value_counts().sort_index().items():
        pct = count/len(y_train)*100
        print(f"  {label} ({LABELS[label]}): {count:,} ({pct:.1f}%)")

Quick Evaluation

In [27]:
def quick_eval(labeling_data, source_name):
    print(f"QUICK EVALUATION: {source_name}")
    print("="*40)
    
    quality = check_quality(labeling_data)
    
    issues = find_issues(labeling_data)
    
    groups = group_similar(labeling_data)
    
    smart_suggest(labeling_data)
    
    save_report(labeling_data, source_name)
    
    return quality, issues, groups

Setup Next Source

In [28]:
def setup_next(datasets, current_labeling_data, current_source):
    next_source = find_next_source(datasets, current_labeling_data, current_source)
    
    if not next_source:
        print("No suitable next source found")
        return None, None
    
    print(f"Setting up {next_source} for labeling...")
    
    next_data = prep_templates(datasets[next_source], next_source)
    
    labeled_items = [item for item in current_labeling_data if item['label'] is not None]
    
    if labeled_items:
        keyword_labels = {}
        for item in labeled_items:
            words = item['template'].lower().split()
            for word in words:
                if word not in keyword_labels:
                    keyword_labels[word] = []
                keyword_labels[word].append(item['label'])
        
        for item in next_data:
            words = item['template'].lower().split()
            label_votes = []
            for word in words:
                if word in keyword_labels:
                    label_votes.extend(keyword_labels[word])
            
            if label_votes:
                from collections import Counter
                most_common = Counter(label_votes).most_common(1)[0][0]
                item['suggested'] = most_common
                item['confidence'] = "medium"
    
    auto_label(next_data)
    
    print(f"Next source ready: {next_source}")
    print(f"Templates to label: {len([item for item in next_data if item['label'] is None])}")
    
    return next_source, next_data

Complete Workflow

In [29]:
def complete_workflow(datasets, labeling_data, source_name, final_dataset):
    print("COMPLETE WORKFLOW EVALUATION")
    print("="*50)
    
    print("1. Evaluating current labeling...")
    quality, issues, groups = quick_eval(labeling_data, source_name)
    
    print("\n2. Exporting ML-ready data...")
    export_ml_data(final_dataset, source_name)
    
    print("\n3. Setting up next source...")
    next_source, next_data = setup_next(datasets, labeling_data, source_name)
    
    print("\n4. RECOMMENDATIONS:")
    
    if quality['template_coverage'] < 80:
        print(f"- Complete current source (only {quality['template_coverage']:.1f}% done)")
    elif next_source:
        print(f"- Start labeling {next_source} using learned patterns")
        print(f"- Focus on templates that don't match existing patterns")
    
    if quality['log_coverage'] > 90:
        print(f"- Ready for ML model training")
        print(f"- Consider ensemble methods for better accuracy")
    
    if len(issues) > 5:
        print(f"- Review and fix {len(issues)} potential labeling issues")
    
    print(f"\n5. NEXT ACTIONS:")
    print(f"- Review issues found in quality check")
    print(f"- Continue labeling current source or start next source")
    print(f"- Begin ML model experiments with exported data")
    
    return quality, issues, groups, next_source, next_data

Execution

In [30]:
datasets, labeling_data, best_source = run_workflow()

LOG ANOMALY LABELING WORKFLOW
1. Loading datasets...
✓ Loaded Android_2k: 2,000 logs
✓ Loaded Apache_2k: 2,000 logs
✓ Loaded BGL_2k: 2,000 logs
✓ Loaded Hadoop_2k: 2,000 logs
✓ Loaded HDFS_2k: 2,000 logs
✓ Loaded HealthApp_2k: 2,000 logs
✓ Loaded HPC_2k: 2,000 logs
✓ Loaded Linux_2k: 2,000 logs
✓ Loaded Mac_2k: 2,000 logs
✓ Loaded OpenSSH_2k: 2,000 logs
✓ Loaded OpenStack_2k: 2,000 logs
✓ Loaded Proxifier_2k: 2,000 logs
✓ Loaded Spark_2k: 2,000 logs
✓ Loaded Thunderbird_2k: 2,000 logs
✓ Loaded Windows_2k: 2,000 logs
✓ Loaded Zookeeper_2k: 2,000 logs

Loaded 16 sources, 32,000 total logs

2. Analyzing templates...

3. Finding anomalies...

4. Ranking sources...

Top 3 sources for labeling:
1. OpenSSH_2k (score: 59.2)
   Anomaly rate: 69.5%
   Templates: 27
2. Apache_2k (score: 50.6)
   Anomaly rate: 27.0%
   Templates: 6
3. HPC_2k (score: 39.7)
   Anomaly rate: 24.5%
   Templates: 46

5. Preparing OpenSSH_2k for labeling...
Processing 27 unique templates...
Auto-labeled 17 templates

Pr

In [31]:
current_pos, labeled_count = label_batch(labeling_data, best_source, start=0, count=10)


TEMPLATE LABELING
Labels: 0:normal, 1:security_anomaly, 2:system_failure, 3:performance_issue, 4:network_anomaly, 5:config_error, 6:hardware_issue, 7:unknown_anomaly
Commands: 0-7 (label), 'skip', 'quit', 'save'

[1/27] Template
Frequency: 413 logs (20.6%)
Template: Received disconnect from <*>: <*>: Bye Bye [preauth]
Sample logs:
  1. Received disconnect from 52.80.34.196: 11: Bye Bye [preauth]
  2. Received disconnect from 202.100.179.208: 11: Bye Bye [preauth]
  3. Received disconnect from 112.95.230.3: 11: Bye Bye [preauth]
Suggested: 0 (normal) - low

[2/27] Template
Frequency: 384 logs (19.2%)
Template: pam_unix(sshd:auth): authentication failure; logname= uid=<*> euid=<*> tty=ssh ruser= rhost=<*> user=<*>
Sample logs:
  1. pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=5.36.59.76.dynamic-dsl-ip.omantel.net.om  user=root
  2. pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=112.95.230.3  user=root
  

In [37]:
current_pos, labeled_count = label_batch(labeling_data, best_source, start=current_pos, count=10)


TEMPLATE LABELING
Labels: 0:normal, 1:security_anomaly, 2:system_failure, 3:performance_issue, 4:network_anomaly, 5:config_error, 6:hardware_issue, 7:unknown_anomaly
Commands: 0-7 (label), 'skip', 'quit', 'save'

[21/27] Template
Frequency: 2 logs (0.1%)
Template: PAM <*> more authentication failure; logname= uid=<*> euid=<*> tty=ssh ruser= rhost=<*>
Sample logs:
  1. PAM 1 more authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=5.188.10.180
  2. PAM 1 more authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=185.190.58.151
Suggested: 1 (security_anomaly) - high

[22/27] Template
Frequency: 1 logs (0.1%)
Template: Accepted password for <*> from <*> port <*> ssh2
Sample logs:
  1. Accepted password for fztu from 119.137.62.142 port 49116 ssh2
Suggested: 0 (normal) - low

[23/27] Template
Frequency: 1 logs (0.1%)
Template: pam_unix(sshd:session): session opened for user <*> by (uid=<*>)
Sample logs:
  1. pam_unix(sshd:session): session opened for user fztu 

In [38]:
show_progress(labeling_data)


Progress: 27/27 templates (100.0%)
Label distribution:
  0 (normal): 609 logs
  1 (security_anomaly): 1,342 logs
  2 (system_failure): 49 logs


In [39]:
save_progress(labeling_data, best_source)

Saved progress: 27/27 templates


In [40]:
quality, issues, groups = quick_eval(labeling_data, best_source)

QUICK EVALUATION: OpenSSH_2k
Coverage Report:
Templates: 27/27 (100.0%)
Logs: 2,000/2,000 (100.0%)

Label Distribution:
0 (normal): 9 templates, 609 logs
1 (security_anomaly): 14 templates, 1,342 logs
2 (system_failure): 4 templates, 49 logs
No issues found - labeling looks good!
Similar template groups:
Updated 0 suggestions based on learned patterns
Report saved: C:\Computer Science\AIMLDL\log-anomaly-detection\dataset\labeled_data\OpenSSH_2k_report.txt


In [41]:
final_dataset = export_final(datasets[best_source], labeling_data, best_source)
print(f"Labeled coverage: {(final_dataset['AnomalyLabel'] >= 0).sum()}/{len(final_dataset)} logs")


Final dataset: C:\Computer Science\AIMLDL\log-anomaly-detection\dataset\labeled_data\OpenSSH_2k_labeled.csv
Total logs: 2,000
Labeled: 2,000 (100.0%)
Anomalies: 1,391 (69.5% of labeled)
Labeled coverage: 2000/2000 logs


Next Source

In [42]:
next_source, next_data = setup_next(datasets, labeling_data, best_source)

Next source recommendations:
1. Apache_2k (similarity: 1.0, logs: 2,000)
2. OpenStack_2k (similarity: 1.0, logs: 2,000)
3. Linux_2k (similarity: 1.0, logs: 2,000)
Setting up Apache_2k for labeling...
Processing 6 unique templates...
Auto-labeled 0 templates
Next source ready: Apache_2k
Templates to label: 6


In [None]:
current_pos, labeled_count = label_batch(next_data, next_source, start=0, count=10)

OpenSSH Pattern Analysis

In [None]:
def analyze_patterns(labeling_data):

    patterns = {
        0: {'keywords': set(), 'templates': []},  # normal
        1: {'keywords': set(), 'templates': []},  # security_anomaly
        2: {'keywords': set(), 'templates': []}   # system_failure
    }

    for item in labeling_data:
        if item['label'] is not None:
            label = item['label']
            template = item['template'].lower()
            content = ' '.join(item['samples']).lower()

            words = set(template.split() + content.split())
            patterns[label]['keywords'].update(words)
            patterns[label]['templates'].append(item['template'])

    print("LEARNED PATTERNS FROM OPENSSH:")
    print("=" * 50)

    for label, data in patterns.items():
        print(f"\n{LABELS[label].upper()}:")
        print(f"  Templates: {len(data['templates'])}")
        print(f"  Key indicators: {list(data['keywords'])[:10]}")  # Show top 10 keywords

    return patterns

In [None]:
def find_next_source(datasets, openssh_patterns, current_source='OpenSSH_2k'):

    remaining_sources = [src for src in LOG_SOURCES if src != current_source]
    similarity_scores = []

    print(f"\nANALYZING REMAINING SOURCES FOR PATTERN SIMILARITY:")
    print("=" * 60)

    for source in remaining_sources:
        try:
            df = datasets[source]
            if 'EventTemplate' not in df.columns or 'Content' not in df.columns:
                continue

            sample_templates = df['EventTemplate'].head(50)
            sample_content = df['Content'].head(50)

            security_matches = 0
            total_samples = len(sample_templates)

            for template, content in zip(sample_templates, sample_content):
                text = (str(template) + ' ' + str(content)).lower()
                text_words = set(text.split())

                security_overlap = len(text_words.intersection(openssh_patterns[1]['keywords']))
                if security_overlap > 2: 
                    security_matches += 1

            similarity_score = security_matches / total_samples if total_samples > 0 else 0

            unique_templates = df['EventTemplate'].nunique()
            total_logs = len(df)

            similarity_scores.append({
                'source': source,
                'similarity_score': similarity_score,
                'unique_templates': unique_templates,
                'total_logs': total_logs,
                'efficiency': total_logs / unique_templates
            })

            print(f"{source:15}: similarity={similarity_score:.2f}, templates={unique_templates:3d}, logs={total_logs:,}")

        except Exception as e:
            print(f"{source:15}: Error - {e}")
            continue

    similarity_scores.sort(key=lambda x: x['similarity_score'], reverse=True)

    print(f"\nTOP RECOMMENDATIONS:")
    print("-" * 40)
    for i, score in enumerate(similarity_scores[:3], 1):
        print(f"{i}. {score['source']}")
        print(f"   Pattern similarity: {score['similarity_score']:.2f}")
        print(f"   Templates: {score['unique_templates']} (efficiency: {score['efficiency']:.1f})")
        print(f"   Total logs: {score['total_logs']:,}")
        print()

    return similarity_scores

In [None]:
def setup_source(datasets, openssh_patterns, next_source):
    print(f"SETTING UP {next_source} WITH PATTERN TRANSFER:")
    print("=" * 60)

    next_labeling_data = prep_templates(datasets[next_source], next_source)
    if not next_labeling_data:
        print(f"Failed to prepare {next_source}")
        return None

    pattern_matches = 0

    for item in next_labeling_data:
        template = item['template'].lower()
        content = ' '.join(item['samples']).lower()
        text_words = set((template + ' ' + content).split())

        best_match_score = 0
        best_match_label = 0

        for label_id, pattern_data in openssh_patterns.items():
            overlap = len(text_words.intersection(pattern_data['keywords']))
            score = overlap / len(pattern_data['keywords']) if pattern_data['keywords'] else 0

            if score > best_match_score and score > 0.1:
                best_match_score = score
                best_match_label = label_id

        if best_match_score > 0.2: 
            old_suggestion = item['suggested']
            item['suggested'] = best_match_label
            item['confidence'] = "high" if best_match_score > 0.4 else "medium"

            if old_suggestion != best_match_label:
                pattern_matches += 1

    print(f"Pattern transfer results:")
    print(f"  Total templates: {len(next_labeling_data)}")
    print(f"  Pattern-based suggestions: {pattern_matches}")

    auto_labeled = auto_label(next_labeling_data)
    print(f"  Auto-labeled: {auto_labeled}")

    remaining_to_label = len([item for item in next_labeling_data if item['label'] is None])
    print(f"  Remaining for manual labeling: {remaining_to_label}")

    print(f"\nEXAMPLE PATTERN MATCHES:")
    print("-" * 30)

    pattern_examples = [item for item in next_labeling_data
                        if item['confidence'] in ['high', 'medium'] and item['label'] is None][:3]

    for i, item in enumerate(pattern_examples, 1):
        print(f"{i}. Template: {item['template'][:60]}...")
        print(f"   Sample: {item['samples'][0][:80]}...")
        print(f"   Suggested: {item['suggested']} ({LABELS[item['suggested']]}) - {item['confidence']}")
        print()

    return next_labeling_data

In [None]:
def compare_sources(openssh_data, second_source_data, second_source_name):

    print(f"CROSS-SOURCE PATTERN COMPARISON:")
    print("=" * 50)
    print(f"OpenSSH_2k vs {second_source_name}:")

    openssh_templates = len(openssh_data)
    second_templates = len(second_source_data)

    print(f"Templates: OpenSSH={openssh_templates}, {second_source_name}={second_templates}")

    openssh_auto = sum(1 for item in openssh_data if 'Auto-labeled' in item.get('notes', ''))
    second_auto = sum(1 for item in second_source_data if item['label'] is not None)

    print(f"Auto-labeled: OpenSSH={openssh_auto}, {second_source_name}={second_auto}")

    if second_auto > 0:
        transfer_rate = (second_auto / second_templates) * 100
        print(f"Pattern transfer success: {transfer_rate:.1f}%")

        if transfer_rate > 50:
            print("✓ High pattern similarity - continue with this source")
        elif transfer_rate > 25:
            print("⚠ Moderate similarity - some manual work needed")
        else:
            print("⚠ Low similarity - consider different source")

    return {
        'openssh_templates': openssh_templates,
        'second_templates': second_templates,
        'openssh_auto': openssh_auto,
        'second_auto': second_auto
    }

In [None]:
def continue_source():
    if 'second_source_data' not in globals() or second_source_data is None:
        print("Please run the expansion setup first")
        return

    print(f"CONTINUING WITH: {second_source_name}")
    print("=" * 50)

    show_progress(second_source_data)

    unlabeled_count = sum(1 for item in second_source_data if item['label'] is None)
    auto_labeled_count = sum(1 for item in second_source_data if item['label'] is not None)

    print(f"\nSecond source status:")
    print(f"Auto-labeled from patterns: {auto_labeled_count}")
    print(f"Remaining for manual review: {unlabeled_count}")

    if auto_labeled_count > 0:
        print(f"Pattern transfer working well!")

    if unlabeled_count > 0:
        print(f"\nReady for manual labeling:")
        print(f"Run: label_batch(second_source_data, '{second_source_name}', start=0, count=10)")
    else:
        print(f"\nAll templates labeled! Ready for export:")
        print(f"Run: export_final(datasets['{second_source_name}'], second_source_data, '{second_source_name}')")


In [None]:
def batch_label(source_data, source_name, batch_size=10):
    unlabeled_indices = [i for i, item in enumerate(source_data) if item['label'] is None]

    if not unlabeled_indices:
        print("All templates already labeled!")
        return 0, 0

    print(f"Found {len(unlabeled_indices)} unlabeled templates")

    print(f"\nNext {min(batch_size, len(unlabeled_indices))} templates to review:")
    for i, idx in enumerate(unlabeled_indices[:batch_size]):
        item = source_data[idx]
        print(f"{i+1}. [{item['count']:4,} logs] {item['template'][:60]}...")
        print(f"   Suggested: {item['suggested']} ({LABELS[item['suggested']]}) - {item['confidence']}")
        print(f"   Sample: {item['samples'][0][:80]}...")
        print()

    response = input(f"Continue with interactive labeling? (y/n): ").strip().lower()
    if response == 'y':
        return label_batch(source_data, source_name, start=0, count=batch_size)
    else:
        print("Skipping interactive labeling")
        return 0, 0

In [None]:
def validate_source():
    if 'second_source_data' not in globals():
        print("Second source not set up yet")
        return

    print(f"VALIDATING {second_source_name}")
    print("=" * 40)

    labeled_items = [item for item in second_source_data if item['label'] is not None]
    if not labeled_items:
        print("No labeled items to validate")
        return

    label_dist = defaultdict(int)
    for item in labeled_items:
        label_dist[item['label']] += item['count']

    print("Label distribution:")
    total_logs = sum(label_dist.values())
    for label in sorted(label_dist.keys()):
        count = label_dist[label]
        percentage = (count / total_logs) * 100
        print(f"  {label} ({LABELS[label]}): {count:,} logs ({percentage:.1f}%)")

    openssh_dist = defaultdict(int)
    for item in labeling_data:
        if item['label'] is not None:
            openssh_dist[item['label']] += item['count']

    print(f"\nComparison with OpenSSH:")
    print(f"{'Label':<15} {'OpenSSH %':<12} {second_source_name + ' %':<12} {'Difference':<10}")
    print("-" * 50)

    openssh_total = sum(openssh_dist.values())
    second_total = sum(label_dist.values())

    for label in sorted(set(list(openssh_dist.keys()) + list(label_dist.keys()))):
        openssh_pct = (openssh_dist[label] / openssh_total) * 100 if openssh_total > 0 else 0
        second_pct = (label_dist[label] / second_total) * 100 if second_total > 0 else 0
        diff = second_pct - openssh_pct

        print(f"{LABELS[label]:<15} {openssh_pct:>10.1f}% {second_pct:>10.1f}% {diff:>+8.1f}%")

    return label_dist

In [None]:
def combine_datasets():
    print("CREATING COMBINED DATASET")
    print("=" * 40)

    if 'second_source_name' in globals() and second_source_data:
        second_final = export_final(datasets[second_source_name], second_source_data, second_source_name)

        openssh_labeled = final_dataset.copy()
        openssh_labeled['Source'] = 'OpenSSH_2k'

        second_labeled = second_final.copy()
        second_labeled['Source'] = second_source_name

        combined = pd.concat([openssh_labeled, second_labeled], ignore_index=True)

        combined_file = OUTPUT_PATH / "combined_labeled_dataset.csv"
        combined.to_csv(combined_file, index=False)

        total_logs = len(combined)
        labeled_logs = (combined['AnomalyLabel'] >= 0).sum()
        anomaly_logs = (combined['AnomalyLabel'] > 0).sum()

        print(f"Combined dataset saved: {combined_file}")
        print(f"Total logs: {total_logs:,}")
        print(f"Labeled logs: {labeled_logs:,} ({labeled_logs/total_logs*100:.1f}%)")
        print(f"Anomaly logs: {anomaly_logs:,} ({anomaly_logs/labeled_logs*100:.1f}% of labeled)")

        for source in combined['Source'].unique():
            source_data = combined[combined['Source'] == source]
            total = len(source_data)
            labeled = (source_data['AnomalyLabel'] >= 0).sum()
            anomalies = (source_data['AnomalyLabel'] > 0).sum()
            print(f"  {source}: {total:,} logs, {labeled:,} labeled, {anomalies:,} anomalies")

        return combined
    else:
        print("Second source not ready yet")
        return None


In [None]:
def next_steps():
    print(f"\n" + "=" * 60)
    print("NEXT STEPS RECOMMENDATION")
    print("=" * 60)

    if 'second_source_data' not in globals() or second_source_data is None:
        print("IMMEDIATE ACTION: Set up second source")
        print("Run the expansion setup code above")
        return

    second_labeled = sum(1 for item in second_source_data if item['label'] is not None)
    second_total = len(second_source_data)
    second_progress = (second_labeled / second_total) * 100

    print(f"Current status:")
    print(f"  Source 1 (OpenSSH_2k): 100% complete (2,000 logs)")
    print(f"  Source 2 ({second_source_name}): {second_progress:.1f}% complete")

    if second_progress < 50:
        print(f"\nIMMEDIATE ACTION: Complete second source labeling")
        print(f"1. Run: batch_label(second_source_data, '{second_source_name}')")
        print(f"2. Or manual: label_batch(second_source_data, '{second_source_name}', count=10)")
    elif second_progress < 100:
        print(f"\nNEAR COMPLETION: Finish second source")
        print(f"1. Complete remaining templates")
        print(f"2. Run quality validation")
    else:
        print(f"\nREADY FOR NEXT PHASE: Model development")
        print(f"1. Create combined dataset: combine_datasets()")
        print(f"2. Export ML data: export_ml_data(combined_dataset, 'combined')")
        print(f"3. Start model training experiments")

    remaining_templates = sum(1 for item in second_source_data if item['label'] is None)
    if remaining_templates > 0:
        estimated_minutes = remaining_templates * 0.5
        print(f"\nEstimated time to complete: {estimated_minutes:.0f} minutes")

    print(f"\nOverall progress: 2 sources completed out of 16 total")
    print(f"Ready to scale to additional sources using learned patterns")


In [None]:
print("EXPANDING TO SECOND SOURCE")
print("=" * 40)

openssh_patterns = analyze_patterns(labeling_data)

similarity_scores = find_next_source(datasets, openssh_patterns)

# Step 3: Get the top recommendation
if similarity_scores:
    best_next = similarity_scores[0]['source']
    print(f"\nPROCEEDING WITH: {best_next}")
    print(f"Expected pattern transfer rate: {similarity_scores[0]['similarity_score']*100:.0f}%")

    # Step 4: Set up the second source
    second_source_data = setup_source(datasets, openssh_patterns, best_next)

    if second_source_data:
        # Step 5: Quick comparison
        comparison = compare_sources(labeling_data, second_source_data, best_next)

        print(f"\n" + "=" * 60)
        print("READY FOR SECOND SOURCE LABELING!")
        print("=" * 60)
        print(f"Next steps:")
        print(f"1. Review auto-labeled templates: show_progress(second_source_data)")
        print(f"2. Start labeling: label_batch(second_source_data, '{best_next}', start=0, count=10)")
        print(f"3. Save progress: save_progress(second_source_data, '{best_next}')")
        print(f"4. Export when done: export_final(datasets['{best_next}'], second_source_data, '{best_next}')")

        second_source_name = best_next
    else:
        print("Failed to set up second source")
        second_source_data = None
        second_source_name = None
else:
    print("No suitable sources found")
    second_source_data = None
    second_source_name = None

EXPANDING TO SECOND SOURCE
LEARNED PATTERNS FROM OPENSSH:

NORMAL:
  Templates: 9
  Key indicators: ['pam_unix(sshd:session):', 'fztu', '52.80.34.196:', 'identification', '>', '103.207.39.165:', 'closed', '6', '(uid=<*>)', '123.235.32.19']

SECURITY_ANOMALY:
  Templates: 14
  Key indicators: ['5.188.10.180', 'times:', 'none', 'rhost=106.5.5.195', 'uid=<*>', '5', 'rhost=<*>', 'rhost=173.234.31.186', 'ssh2', 'reverse']

SYSTEM_FAILURE:
  Templates: 4
  Key indicators: ['received', '[preauth]', 'reset', 'peer', 'available.', 'from', '11:', 'methods', 'authentication', 'error:']

ANALYZING REMAINING SOURCES FOR PATTERN SIMILARITY:
Android_2k     : similarity=0.02, templates=166, logs=2,000
Apache_2k      : similarity=0.00, templates=  6, logs=2,000
BGL_2k         : similarity=0.00, templates=120, logs=2,000
Hadoop_2k      : similarity=0.04, templates=114, logs=2,000
HDFS_2k        : similarity=0.16, templates= 14, logs=2,000
HealthApp_2k   : similarity=0.16, templates= 75, logs=2,000
HPC_2

In [73]:
if 'labeling_data' in locals() and 'datasets' in locals():
    print("CONTINUATION WORKFLOW")
    print("=" * 30)

    if 'second_source_data' in globals() and second_source_data is not None:
        continue_source()
        validate_source()
        next_steps()
    else:
        print("Run the expansion setup first to identify and prepare the second source")
        print("Then come back to this continuation workflow")
else:
    print("Please ensure your main workflow variables are loaded first")

CONTINUATION WORKFLOW
CONTINUING WITH: Linux_2k

Progress: 7/118 templates (5.9%)
Label distribution:
  1 (security_anomaly): 490 logs
  2 (system_failure): 16 logs

Second source status:
Auto-labeled from patterns: 7
Remaining for manual review: 111
Pattern transfer working well!

Ready for manual labeling:
Run: label_batch(second_source_data, 'Linux_2k', start=0, count=10)
VALIDATING Linux_2k
Label distribution:
  1 (security_anomaly): 490 logs (96.8%)
  2 (system_failure): 16 logs (3.2%)

Comparison with OpenSSH:
Label           OpenSSH %    Linux_2k %   Difference
--------------------------------------------------
normal                30.4%        0.0%    -30.4%
security_anomaly       67.1%       96.8%    +29.7%
system_failure         2.5%        3.2%     +0.7%

NEXT STEPS RECOMMENDATION
Current status:
  Source 1 (OpenSSH_2k): 100% complete (2,000 logs)
  Source 2 (Linux_2k): 5.9% complete

IMMEDIATE ACTION: Complete second source labeling
1. Run: batch_label(second_source_data, 

In [74]:
continue_source()

CONTINUING WITH: Linux_2k

Progress: 7/118 templates (5.9%)
Label distribution:
  1 (security_anomaly): 490 logs
  2 (system_failure): 16 logs

Second source status:
Auto-labeled from patterns: 7
Remaining for manual review: 111
Pattern transfer working well!

Ready for manual labeling:
Run: label_batch(second_source_data, 'Linux_2k', start=0, count=10)


In [75]:
validate_source()

VALIDATING Linux_2k
Label distribution:
  1 (security_anomaly): 490 logs (96.8%)
  2 (system_failure): 16 logs (3.2%)

Comparison with OpenSSH:
Label           OpenSSH %    Linux_2k %   Difference
--------------------------------------------------
normal                30.4%        0.0%    -30.4%
security_anomaly       67.1%       96.8%    +29.7%
system_failure         2.5%        3.2%     +0.7%


defaultdict(int, {1: 490, 2: 16, 0: 0})

In [76]:
next_steps()


NEXT STEPS RECOMMENDATION
Current status:
  Source 1 (OpenSSH_2k): 100% complete (2,000 logs)
  Source 2 (Linux_2k): 5.9% complete

IMMEDIATE ACTION: Complete second source labeling
1. Run: batch_label(second_source_data, 'Linux_2k')
2. Or manual: label_batch(second_source_data, 'Linux_2k', count=10)

Estimated time to complete: 56 minutes

Overall progress: 2 sources completed out of 16 total
Ready to scale to additional sources using learned patterns


In [90]:
unlabeled_indices = [i for i, item in enumerate(second_source_data) if item['label'] is None]

In [91]:
if unlabeled_indices:
    next_start = unlabeled_indices[0] 
    label_batch(second_source_data, 'Linux_2k', start=next_start, count=100)

In [92]:
validate_source()

VALIDATING Linux_2k
Label distribution:
  0 (normal): 1,486 logs (74.3%)
  1 (security_anomaly): 499 logs (24.9%)
  2 (system_failure): 15 logs (0.8%)

Comparison with OpenSSH:
Label           OpenSSH %    Linux_2k %   Difference
--------------------------------------------------
normal                30.4%       74.3%    +43.8%
security_anomaly       67.1%       24.9%    -42.2%
system_failure         2.5%        0.8%     -1.7%


defaultdict(int, {0: 1486, 1: 499, 2: 15})

In [93]:
second_final = export_final(datasets[second_source_name], second_source_data, second_source_name)


Final dataset: C:\Computer Science\AIMLDL\log-anomaly-detection\dataset\labeled_data\Linux_2k_labeled.csv
Total logs: 2,000
Labeled: 2,000 (100.0%)
Anomalies: 514 (25.7% of labeled)


In [94]:
combined_dataset = combine_datasets()

CREATING COMBINED DATASET

Final dataset: C:\Computer Science\AIMLDL\log-anomaly-detection\dataset\labeled_data\Linux_2k_labeled.csv
Total logs: 2,000
Labeled: 2,000 (100.0%)
Anomalies: 514 (25.7% of labeled)
Combined dataset saved: C:\Computer Science\AIMLDL\log-anomaly-detection\dataset\labeled_data\combined_labeled_dataset.csv
Total logs: 4,000
Labeled logs: 4,000 (100.0%)
Anomaly logs: 1,905 (47.6% of labeled)
  OpenSSH_2k: 2,000 logs, 2,000 labeled, 1,391 anomalies
  Linux_2k: 2,000 logs, 2,000 labeled, 514 anomalies
