Environment Setup

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from pathlib import Path
import json
import re
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

In [27]:
PROJECT_ROOT = Path(r"C:\Computer Science\AIMLDL\log-anomaly-detection")
DATA_PATH = PROJECT_ROOT / "dataset" / "structured_data"
OUTPUT_PATH = PROJECT_ROOT / "dataset" / "labeled_data"
OUTPUT_PATH.mkdir(exist_ok=True)

In [28]:
LOG_SOURCES = [
    'Android_2k', 'Apache_2k', 'BGL_2k', 'Hadoop_2k', 'HDFS_2k', 
    'HealthApp_2k', 'HPC_2k', 'Linux_2k', 'Mac_2k', 'OpenSSH_2k',
    'OpenStack_2k', 'Proxifier_2k', 'Spark_2k', 'Thunderbird_2k',
    'Windows_2k', 'Zookeeper_2k'
]

In [29]:
LABELS = {
    0: "normal",
    1: "security_anomaly", 
    2: "system_failure",
    3: "performance_issue",
    4: "network_anomaly", 
    5: "config_error",
    6: "hardware_issue",
    7: "unknown_anomaly"
}

In [30]:
ANOMALY_PATTERNS = {
    'security': ['authentication failure', 'invalid user', 'break-in attempt', 
                'failed password', 'unauthorized', 'access denied'],
    'system': ['error', 'critical', 'fatal', 'exception', 'crash', 'abort'],
    'network': ['timeout', 'connection refused', 'host unreachable'],
    'performance': ['slow', 'overload', 'resource exhausted', 'quota exceeded'],
    'hardware': ['hardware error', 'disk error', 'i/o error', 'device error']
}

Load Data

In [31]:
def load_data():
    datasets = {}
    for source in LOG_SOURCES:
        try:
            file_path = DATA_PATH / f"{source}.log_structured.csv"
            df = pd.read_csv(file_path)
            datasets[source] = df
            print(f"✓ Loaded {source}: {len(df):,} logs")
        except Exception as e:
            print(f"✗ Failed {source}: {e}")
    
    total = sum(len(df) for df in datasets.values())
    print(f"\nLoaded {len(datasets)} sources, {total:,} total logs")
    return datasets


Analyze Templates

In [32]:
def analyze_templates(datasets):
    stats = {}
    for source, df in datasets.items():
        if 'EventTemplate' not in df.columns:
            continue
            
        templates = df['EventTemplate'].value_counts()
        stats[source] = {
            'total_logs': len(df),
            'unique_templates': len(templates),
            'efficiency': len(df) / len(templates),
            'top_templates': templates.head(5)
        }
    
    return stats

Find Anomalies

In [33]:
def find_anomalies(datasets):
    results = {}
    
    for source, df in datasets.items():
        if 'Content' not in df.columns:
            continue
            
        content_lower = df['Content'].str.lower()
        anomaly_counts = {}
        
        for category, keywords in ANOMALY_PATTERNS.items():
            pattern = '|'.join(re.escape(kw) for kw in keywords)
            matches = content_lower.str.contains(pattern, na=False, regex=True)
            anomaly_counts[category] = matches.sum()
        
        total_anomalies = sum(anomaly_counts.values())
        results[source] = {
            'total_logs': len(df),
            'anomalies': total_anomalies,
            'anomaly_rate': (total_anomalies / len(df)) * 100,
            'categories': anomaly_counts
        }
    
    return results

Rank Sources

In [34]:
def rank_sources(template_stats, anomaly_stats):
    rankings = []
    
    for source in template_stats.keys():
        if source not in anomaly_stats:
            continue
            
        t_stats = template_stats[source] 
        a_stats = anomaly_stats[source]
        
        anomaly_score = a_stats['anomaly_rate']
        template_score = 100 - (t_stats['unique_templates'] / 10)  # Fewer templates = better
        efficiency_score = t_stats['efficiency'] / 10
        
        priority = (anomaly_score * 0.4 + template_score * 0.3 + efficiency_score * 0.3)
        
        rankings.append({
            'source': source,
            'priority_score': priority,
            'anomaly_rate': a_stats['anomaly_rate'],
            'templates': t_stats['unique_templates'],
            'efficiency': t_stats['efficiency']
        })
    
    return sorted(rankings, key=lambda x: x['priority_score'], reverse=True)

Prepare Templates

In [35]:
def prep_templates(df, source_name):
    if 'EventTemplate' not in df.columns:
        return []
    
    templates = df['EventTemplate'].value_counts()
    labeling_data = []
    
    for template, count in templates.items():
        samples = df[df['EventTemplate'] == template]['Content'].head(3).tolist()
        
        suggested_label = 0  
        confidence = "low"
        
        content_text = ' '.join(samples).lower()
        
        for category, keywords in ANOMALY_PATTERNS.items():
            if any(kw in content_text for kw in keywords):
                if category == 'security':
                    suggested_label = 1
                elif category == 'system': 
                    suggested_label = 2
                elif category == 'performance':
                    suggested_label = 3
                elif category == 'network':
                    suggested_label = 4
                elif category == 'hardware':
                    suggested_label = 6
                confidence = "high"
                break
        
        labeling_data.append({
            'template': template,
            'count': count,
            'percentage': (count / len(df)) * 100,
            'samples': samples,
            'suggested': suggested_label,
            'confidence': confidence,
            'label': None,  
            'notes': ''
        })
    
    return sorted(labeling_data, key=lambda x: x['count'], reverse=True)

Auto-label High Confidence

In [36]:
def auto_label(labeling_data):
    auto_count = 0
    for item in labeling_data:
        if item['confidence'] == 'high' and item['label'] is None:
            item['label'] = item['suggested']
            item['notes'] = 'Auto-labeled (high confidence)'
            auto_count += 1
    
    print(f"Auto-labeled {auto_count} templates")
    return auto_count

Interactive Labeling

In [38]:
def label_batch(labeling_data, start=0, count=5):
    print("\n" + "="*60)
    print("TEMPLATE LABELING")
    print("Labels:", ", ".join(f"{k}:{v}" for k, v in LABELS.items()))
    print("Commands: 0-7 (label), 'skip', 'quit', 'save'")
    print("="*60)
    
    end = min(start + count, len(labeling_data))
    labeled = 0
    
    for i in range(start, end):
        item = labeling_data[i]
        
        print(f"\n[{i+1}/{len(labeling_data)}] Template")
        print(f"Frequency: {item['count']:,} logs ({item['percentage']:.1f}%)")
        print(f"Template: {item['template']}")
        print("Samples:")
        for j, sample in enumerate(item['samples'][:2], 1):
            print(f"  {j}. {sample}")
        print(f"Suggested: {item['suggested']} ({LABELS[item['suggested']]}) - {item['confidence']}")
        
        while True:
            response = input(f"\nLabel (suggested {item['suggested']}): ").strip().lower()
            
            if response == 'quit':
                return i, labeled
            elif response == 'skip':
                break
            elif response == 'save':
                save_progress(labeling_data, source_name)
                continue
            elif response.isdigit() and 0 <= int(response) <= 7:
                item['label'] = int(response) 
                notes = input("Notes (optional): ").strip()
                if notes:
                    item['notes'] = notes
                labeled += 1
                break
            else:
                print("Enter 0-7, 'skip', 'save', or 'quit'")
    
    print(f"\nLabeled {labeled} templates in this batch")
    return end, labeled

Quick Label Multiple

In [39]:
def quick_label(labeling_data, indices, labels):
    for idx, label in zip(indices, labels):
        if 0 <= idx < len(labeling_data) and 0 <= label <= 7:
            labeling_data[idx]['label'] = label
            labeling_data[idx]['notes'] = 'Quick label'
            print(f"Labeled template {idx}: {LABELS[label]}")

Save Progress

In [40]:
def save_progress(labeling_data, source_name):
    df = pd.DataFrame(labeling_data)
    df.to_csv(OUTPUT_PATH / f"{source_name}_progress.csv", index=False)
    
    labeled = sum(1 for item in labeling_data if item['label'] is not None)
    print(f"Saved progress: {labeled}/{len(labeling_data)} templates")

Load Progress

In [41]:
def load_progress(source_name):
    file_path = OUTPUT_PATH / f"{source_name}_progress.csv"
    if file_path.exists():
        df = pd.read_csv(file_path)
        data = df.to_dict('records')
        labeled = sum(1 for item in data if pd.notna(item.get('label')))
        print(f"Loaded progress: {labeled}/{len(data)} templates")
        return data
    return None

Show Progress

In [42]:
def show_progress(labeling_data):
    total = len(labeling_data)
    labeled = sum(1 for item in labeling_data if item['label'] is not None)
    
    print(f"\nProgress: {labeled}/{total} templates ({labeled/total*100:.1f}%)")
    
    if labeled > 0:
        dist = defaultdict(int)
        for item in labeling_data:
            if item['label'] is not None and not pd.isna(item['label']):
                dist[int(item['label'])] += item['count']
        
        print("Label distribution:")
        for label in sorted(dist.keys()):
            count = dist[label]
            if label in LABELS:
                print(f"  {label} ({LABELS[label]}): {count:,} logs")
            else:
                print(f"  {label} (unknown): {count:,} logs")

Export Final Dataset

In [43]:
def export_final(df, labeling_data, source_name):
    template_labels = {item['template']: item['label'] 
                      for item in labeling_data if item['label'] is not None}
    
    result_df = df.copy()
    result_df['AnomalyLabel'] = result_df['EventTemplate'].map(template_labels).fillna(-1).astype(int)
    result_df['AnomalyLabelName'] = result_df['AnomalyLabel'].map(lambda x: LABELS.get(x, 'unlabeled'))
    
    output_file = OUTPUT_PATH / f"{source_name}_labeled.csv"
    result_df.to_csv(output_file, index=False)
    
    total = len(result_df)
    labeled_count = (result_df['AnomalyLabel'] >= 0).sum()
    anomaly_count = (result_df['AnomalyLabel'] > 0).sum()
    
    print(f"\nFinal dataset: {output_file}")
    print(f"Total logs: {total:,}")
    print(f"Labeled: {labeled_count:,} ({labeled_count/total*100:.1f}%)")
    print(f"Anomalies: {anomaly_count:,} ({anomaly_count/labeled_count*100:.1f}% of labeled)")
    
    return result_df

Main Workflow

In [44]:
def run_workflow():
    print("LOG ANOMALY LABELING WORKFLOW")
    print("="*50)
    
    print("1. Loading datasets...")
    datasets = load_data()
    
    print("\n2. Analyzing templates...")
    template_stats = analyze_templates(datasets)
    
    print("\n3. Finding anomalies...")
    anomaly_stats = find_anomalies(datasets)
    
    print("\n4. Ranking sources...")
    rankings = rank_sources(template_stats, anomaly_stats)
    
    print("\nTop 3 sources for labeling:")
    for i, rank in enumerate(rankings[:3], 1):
        print(f"{i}. {rank['source']} (score: {rank['priority_score']:.1f})")
        print(f"   Anomaly rate: {rank['anomaly_rate']:.1f}%")
        print(f"   Templates: {rank['templates']}")
    
    best_source = rankings[0]['source']
    print(f"\n5. Preparing {best_source} for labeling...")
    
    labeling_data = load_progress(best_source)
    if labeling_data is None:
        labeling_data = prep_templates(datasets[best_source], best_source)
        auto_label(labeling_data)
    
    show_progress(labeling_data)
    
    print(f"\n6. Ready for labeling!")
    print("Next steps:")
    print(f"- label_batch(labeling_data) - Interactive labeling")
    print(f"- quick_label(labeling_data, [0,1,2], [0,1,2]) - Quick labeling")
    print(f"- show_progress(labeling_data) - Check progress")
    print(f"- export_final(datasets['{best_source}'], labeling_data, '{best_source}') - Export final dataset")
    
    return datasets, labeling_data, best_source

In [45]:
datasets = None
labeling_data = None 
best_source = None

Labeling

In [46]:
datasets, labeling_data, best_source = run_workflow()

LOG ANOMALY LABELING WORKFLOW
1. Loading datasets...
✓ Loaded Android_2k: 2,000 logs
✓ Loaded Apache_2k: 2,000 logs
✓ Loaded BGL_2k: 2,000 logs
✓ Loaded Hadoop_2k: 2,000 logs
✓ Loaded HDFS_2k: 2,000 logs
✓ Loaded HealthApp_2k: 2,000 logs
✓ Loaded HPC_2k: 2,000 logs
✓ Loaded Linux_2k: 2,000 logs
✓ Loaded Mac_2k: 2,000 logs
✓ Loaded OpenSSH_2k: 2,000 logs
✓ Loaded OpenStack_2k: 2,000 logs
✓ Loaded Proxifier_2k: 2,000 logs
✓ Loaded Spark_2k: 2,000 logs
✓ Loaded Thunderbird_2k: 2,000 logs
✓ Loaded Windows_2k: 2,000 logs
✓ Loaded Zookeeper_2k: 2,000 logs

Loaded 16 sources, 32,000 total logs

2. Analyzing templates...

3. Finding anomalies...

4. Ranking sources...

Top 3 sources for labeling:
1. OpenSSH_2k (score: 59.2)
   Anomaly rate: 69.5%
   Templates: 27
2. Apache_2k (score: 50.6)
   Anomaly rate: 27.0%
   Templates: 6
3. HPC_2k (score: 39.7)
   Anomaly rate: 24.5%
   Templates: 46

5. Preparing OpenSSH_2k for labeling...
Loaded progress: 19/27 templates

Progress: 27/27 templates (10

In [76]:
current_pos, labeled_count = label_batch(labeling_data, start=0, count=5)


TEMPLATE LABELING
Labels: 0:normal, 1:security_anomaly, 2:system_failure, 3:performance_issue, 4:network_anomaly, 5:config_error, 6:hardware_issue, 7:unknown_anomaly
Commands: 0-7 (label), 'skip', 'quit', 'save'

[1/27] Template
Frequency: 413 logs (20.6%)
Template: Received disconnect from <*>: <*>: Bye Bye [preauth]
Samples:
  1. Received disconnect from 52.80.34.196: 11: Bye Bye [preauth]
  2. Received disconnect from 202.100.179.208: 11: Bye Bye [preauth]
Suggested: 0 (normal) - low

[2/27] Template
Frequency: 384 logs (19.2%)
Template: pam_unix(sshd:auth): authentication failure; logname= uid=<*> euid=<*> tty=ssh ruser= rhost=<*> user=<*>
Samples:
  1. pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=5.36.59.76.dynamic-dsl-ip.omantel.net.om  user=root
  2. pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=112.95.230.3  user=root
Suggested: 1 (security_anomaly) - high

[3/27] Template
Frequency: 383 logs 

In [77]:
current_pos, labeled_count = label_batch(labeling_data, start=current_pos, count=5)


TEMPLATE LABELING
Labels: 0:normal, 1:security_anomaly, 2:system_failure, 3:performance_issue, 4:network_anomaly, 5:config_error, 6:hardware_issue, 7:unknown_anomaly
Commands: 0-7 (label), 'skip', 'quit', 'save'

[6/27] Template
Frequency: 113 logs (5.7%)
Template: Invalid user <*> from <*>
Samples:
  1. Invalid user webmaster from 173.234.31.186
  2. Invalid user test9 from 52.80.34.196
Suggested: 1 (security_anomaly) - high

[7/27] Template
Frequency: 113 logs (5.7%)
Template: input_userauth_request: invalid user <*> [preauth]
Samples:
  1. input_userauth_request: invalid user webmaster [preauth]
  2. input_userauth_request: invalid user test9 [preauth]
Suggested: 1 (security_anomaly) - high

[8/27] Template
Frequency: 110 logs (5.5%)
Template: pam_unix(sshd:auth): authentication failure; logname= uid=<*> euid=<*> tty=ssh ruser= rhost=<*>
Samples:
  1. pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=173.234.31.186
  2. pam_unix(sshd:auth): aut

In [78]:
show_progress(labeling_data)


Progress: 19/27 templates (70.4%)
Label distribution:
  1 (security_anomaly): 1,890 logs
  2 (system_failure): 48 logs


In [79]:
save_progress(labeling_data, best_source)

Saved progress: 19/27 templates


In [80]:
final_dataset = export_final(datasets[best_source], labeling_data, best_source)


Final dataset: C:\Computer Science\AIMLDL\log-anomaly-detection\dataset\labeled_data\OpenSSH_2k_labeled.csv
Total logs: 2,000
Labeled: 1,938 (96.9%)
Anomalies: 1,938 (100.0% of labeled)


Quality Check

In [None]:
def check_quality(labeling_data):
    labeled = [item for item in labeling_data if item['label'] is not None]
    
    if not labeled:
        print("No labeled data to check")
        return {}
    
    total_templates = len(labeling_data)
    labeled_count = len(labeled)
    total_logs = sum(item['count'] for item in labeling_data)
    labeled_logs = sum(item['count'] for item in labeled)
    
    template_coverage = (labeled_count / total_templates) * 100
    log_coverage = (labeled_logs / total_logs) * 100
    
    print(f"Coverage Report:")
    print(f"Templates: {labeled_count}/{total_templates} ({template_coverage:.1f}%)")
    print(f"Logs: {labeled_logs:,}/{total_logs:,} ({log_coverage:.1f}%)")
    
    label_counts = defaultdict(int)
    log_counts = defaultdict(int)
    
    for item in labeled:
        label = item['label']
        label_counts[label] += 1
        log_counts[label] += item['count']
    
    print(f"\nLabel Distribution:")
    for label in sorted(label_counts.keys()):
        templates = label_counts[label]
        logs = log_counts[label]
        print(f"{label} ({LABELS[label]}): {templates} templates, {logs:,} logs")
    
    return {
        'template_coverage': template_coverage,
        'log_coverage': log_coverage,
        'label_counts': dict(label_counts),
        'log_counts': dict(log_counts)
    }

Find Issues

In [None]:
def find_issues(labeling_data):
    issues = []
    
    for i, item in enumerate(labeling_data):
        if item['label'] is None:
            continue
        
        template = item['template'].lower()
        content = ' '.join(item['samples']).lower()
        label = item['label']
        
        if label == 1:
            security_words = ['auth', 'login', 'password', 'user', 'invalid', 'fail']
            if not any(word in content for word in security_words):
                issues.append(f"Template {i}: Security label without security keywords")
        
        elif label == 0:
            error_words = ['error', 'fail', 'critical', 'exception']
            if any(word in content for word in error_words):
                issues.append(f"Template {i}: Normal label with error keywords")
        
        elif label > 0 and item['percentage'] > 10:
            issues.append(f"Template {i}: High-frequency anomaly ({item['percentage']:.1f}%)")
    
    if issues:
        print(f"Found {len(issues)} potential issues:")
        for issue in issues[:10]:
            print(f"  {issue}")
    else:
        print("No issues found - labeling looks good!")
    
    return issues

Group Similar

In [None]:
def group_similar(labeling_data):
    unlabeled = [item for item in labeling_data if item['label'] is None]
    
    groups = {
        'connection': [],
        'auth': [],
        'error': [],
        'timeout': [],
        'success': []
    }
    
    for item in unlabeled:
        text = (item['template'] + ' ' + ' '.join(item['samples'])).lower()
        
        if 'connect' in text or 'connection' in text:
            groups['connection'].append(item)
        elif 'auth' in text or 'login' in text or 'user' in text:
            groups['auth'].append(item)
        elif 'error' in text or 'fail' in text or 'critical' in text:
            groups['error'].append(item)
        elif 'timeout' in text:
            groups['timeout'].append(item)
        elif 'success' in text or 'ok' in text or 'complete' in text:
            groups['success'].append(item)
    
    print(f"Similar template groups:")
    for name, items in groups.items():
        if items:
            total_logs = sum(item['count'] for item in items)
            print(f"{name}: {len(items)} templates, {total_logs:,} logs")
    
    return groups

Smart Suggest

In [None]:
def smart_suggest(labeling_data):
    labeled = [item for item in labeling_data if item['label'] is not None]
    
    label_words = defaultdict(set)
    for item in labeled:
        text = item['template'].lower() + ' ' + ' '.join(item['samples']).lower()
        words = set(text.split())
        label_words[item['label']].update(words)
    
    updated = 0
    for item in labeling_data:
        if item['label'] is not None:
            continue
        
        text = item['template'].lower() + ' ' + ' '.join(item['samples']).lower()
        words = set(text.split())
        
        best_label = 0
        best_score = 0
        
        for label_id, pattern_words in label_words.items():
            overlap = len(words.intersection(pattern_words))
            score = overlap / len(pattern_words) if pattern_words else 0
            
            if score > best_score and score > 0.2:
                best_score = score
                best_label = label_id
        
        if best_label != item['suggested']:
            item['suggested'] = best_label
            item['confidence'] = "medium" if best_score > 0.4 else "low"
            updated += 1
    
    print(f"Updated {updated} suggestions based on learned patterns")

Save Report

In [None]:
def save_report(labeling_data, source_name):
    report_file = OUTPUT_PATH / f"{source_name}_report.txt"
    
    with open(report_file, 'w') as f:
        f.write(f"LABELING REPORT: {source_name}\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
        f.write("="*50 + "\n\n")
        
        total = len(labeling_data)
        labeled = sum(1 for item in labeling_data if item['label'] is not None)
        total_logs = sum(item['count'] for item in labeling_data)
        labeled_logs = sum(item['count'] for item in labeling_data if item['label'] is not None)
        
        f.write(f"Templates: {labeled}/{total} ({labeled/total*100:.1f}%)\n")
        f.write(f"Logs: {labeled_logs:,}/{total_logs:,} ({labeled_logs/total_logs*100:.1f}%)\n\n")
        
        label_dist = defaultdict(int)
        for item in labeling_data:
            if item['label'] is not None:
                label_dist[item['label']] += item['count']
        
        f.write("Label Distribution:\n")
        for label in sorted(label_dist.keys()):
            count = label_dist[label]
            f.write(f"{label} ({LABELS[label]}): {count:,} logs\n")
    
    print(f"Report saved: {report_file}")

Find Next Source

In [None]:
def find_next_source(datasets, labeling_data, current_source):
    patterns = defaultdict(set)
    for item in labeling_data:
        if item['label'] is not None:
            words = item['template'].lower().split()
            patterns[item['label']].update(words)
    
    remaining = [src for src in LOG_SOURCES if src != current_source]
    scores = []
    
    for source in remaining:
        try:
            df = datasets[source]
            if 'EventTemplate' not in df.columns:
                continue
            
            templates = df['EventTemplate'].head(100)
            matches = 0
            
            for template in templates:
                template_words = set(template.lower().split())
                for pattern_words in patterns.values():
                    if len(template_words.intersection(pattern_words)) > 0:
                        matches += 1
                        break
            
            score = matches / len(templates) if templates is not None and len(templates) > 0 else 0
            scores.append((source, score, len(df)))
            
        except Exception:
            continue
    
    scores.sort(key=lambda x: x[1], reverse=True)
    
    print(f"Next source recommendations:")
    for i, (source, score, logs) in enumerate(scores[:3], 1):
        print(f"{i}. {source} (similarity: {score:.1f}, logs: {logs:,})")
    
    return scores[0][0] if scores else None

Export for ML

In [None]:
def export_ml_data(final_dataset, source_name, test_size=0.2):
    labeled = final_dataset[final_dataset['AnomalyLabel'] >= 0].copy()
    
    if len(labeled) == 0:
        print("No labeled data to export")
        return
    
    labeled['ContentLength'] = labeled['Content'].str.len()
    labeled['HasError'] = labeled['Content'].str.lower().str.contains('error|fail|critical')
    labeled['HasAuth'] = labeled['Content'].str.lower().str.contains('auth|login|user')
    
    X = labeled[['Content', 'EventTemplate', 'ContentLength', 'HasError', 'HasAuth']]
    y = labeled['AnomalyLabel']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42
    )
    
    train_file = OUTPUT_PATH / f"{source_name}_train.csv"
    test_file = OUTPUT_PATH / f"{source_name}_test.csv"
    
    pd.concat([X_train, y_train], axis=1).to_csv(train_file, index=False)
    pd.concat([X_test, y_test], axis=1).to_csv(test_file, index=False)
    
    print(f"ML data exported:")
    print(f"Train: {len(X_train):,} samples -> {train_file}")
    print(f"Test: {len(X_test):,} samples -> {test_file}")
    
    print(f"Training labels:")
    for label, count in y_train.value_counts().sort_index().items():
        pct = count/len(y_train)*100
        print(f"  {label} ({LABELS[label]}): {count:,} ({pct:.1f}%)")

Quick Evaluation

In [None]:
def quick_eval(labeling_data, source_name):
    print(f"QUICK EVALUATION: {source_name}")
    print("="*40)
    
    quality = check_quality(labeling_data)
    
    issues = find_issues(labeling_data)
    
    groups = group_similar(labeling_data)
    
    smart_suggest(labeling_data)
    
    save_report(labeling_data, source_name)
    
    return quality, issues, groups

Setup Next Source

In [None]:
def setup_next(datasets, current_labeling_data, current_source):
    next_source = find_next_source(datasets, current_labeling_data, current_source)
    
    if not next_source:
        print("No suitable next source found")
        return None, None
    
    print(f"Setting up {next_source} for labeling...")
    
    next_data = prep_templates(datasets[next_source], next_source)
    
    labeled_items = [item for item in current_labeling_data if item['label'] is not None]
    
    if labeled_items:
        keyword_labels = {}
        for item in labeled_items:
            words = item['template'].lower().split()
            for word in words:
                if word not in keyword_labels:
                    keyword_labels[word] = []
                keyword_labels[word].append(item['label'])
        
        for item in next_data:
            words = item['template'].lower().split()
            label_votes = []
            for word in words:
                if word in keyword_labels:
                    label_votes.extend(keyword_labels[word])
            
            if label_votes:
                from collections import Counter
                most_common = Counter(label_votes).most_common(1)[0][0]
                item['suggested'] = most_common
                item['confidence'] = "medium"
    
    auto_label(next_data)
    
    print(f"Next source ready: {next_source}")
    print(f"Templates to label: {len([item for item in next_data if item['label'] is None])}")
    
    return next_source, next_data

Complete Workflow

In [None]:
def complete_workflow(datasets, labeling_data, source_name, final_dataset):
    print("COMPLETE WORKFLOW EVALUATION")
    print("="*50)
    
    print("1. Evaluating current labeling...")
    quality, issues, groups = quick_eval(labeling_data, source_name)
    
    print("\n2. Exporting ML-ready data...")
    export_ml_data(final_dataset, source_name)
    
    print("\n3. Setting up next source...")
    next_source, next_data = setup_next(datasets, labeling_data, source_name)
    
    print("\n4. RECOMMENDATIONS:")
    
    if quality['template_coverage'] < 80:
        print(f"- Complete current source (only {quality['template_coverage']:.1f}% done)")
    elif next_source:
        print(f"- Start labeling {next_source} using learned patterns")
        print(f"- Focus on templates that don't match existing patterns")
    
    if quality['log_coverage'] > 90:
        print(f"- Ready for ML model training")
        print(f"- Consider ensemble methods for better accuracy")
    
    if len(issues) > 5:
        print(f"- Review and fix {len(issues)} potential labeling issues")
    
    print(f"\n5. NEXT ACTIONS:")
    print(f"- Review issues found in quality check")
    print(f"- Continue labeling current source or start next source")
    print(f"- Begin ML model experiments with exported data")
    
    return quality, issues, groups, next_source, next_data