In [2]:
import pandas as pd
import re
import collections

# Load your clustered dataset
df = pd.read_csv('../data/nova_logs_clustered.csv')


In [5]:
len(df)

54646

In [10]:
cluster_sizes = df['cluster_id'].value_counts()
print("Cluster sizes (largest first):")
for cluster_id, size in cluster_sizes.items():
    print(f"Cluster {cluster_id}: {size} logs")


Cluster sizes (largest first):
Cluster 1: 5268 logs
Cluster 8: 4507 logs
Cluster 18: 4405 logs
Cluster 4: 3643 logs
Cluster 2: 3285 logs
Cluster 3: 2967 logs
Cluster 0: 2887 logs
Cluster 11: 2693 logs
Cluster 7: 2500 logs
Cluster 5: 2467 logs
Cluster 6: 2462 logs
Cluster 12: 2241 logs
Cluster 20: 2021 logs
Cluster 15: 1941 logs
Cluster 10: 1658 logs
Cluster 17: 1352 logs
Cluster 14: 1188 logs
Cluster 19: 1130 logs
Cluster 9: 1100 logs
Cluster 21: 1058 logs
Cluster 13: 937 logs
Cluster 22: 864 logs
Cluster 16: 807 logs
Cluster 24: 659 logs
Cluster 23: 606 logs


In [12]:
# Focus on the 5 large clusters for regex
large_clusters = [1, 8, 18, 4, 2]
regex_target_logs = df[df['cluster_id'].isin(large_clusters)]
print(f"Target logs for regex: {len(regex_target_logs)} ({len(regex_target_logs)/len(df)*100:.1f}%)")

Target logs for regex: 21108 (38.6%)


In [15]:

cluster_samples = {}
for cluster_id in large_clusters:
    cluster_logs = df[df['cluster_id'] == cluster_id]['raw_log_text']
    samples = cluster_logs.head(10).tolist()
    cluster_samples[cluster_id] = samples
    
    print(f"\n--- CLUSTER {cluster_id} SAMPLES ---")
    for i, log in enumerate(samples[:5], 1):
        print(f"{i}. {log[:120]}...")



--- CLUSTER 1 SAMPLES ---
1. INFO nova.virt.libvirt.driver [req-297e6cd3-84b7-43af-982e-af5be68422dd] [instance: 5897334a-6797-4ccf-b299-cbbcbe6bfb14...
2. INFO nova.virt.libvirt.driver [req-65c799f8-e7f0-4c7f-ad2e-db39859caa15] [instance: 9f4a73a0-9f9c-44d1-b34a-3469314b78c7...
3. INFO nova.virt.libvirt.driver [req-b54a3755-1572-459d-82f9-4b7c57ae3e80] [instance: 8191edca-f070-403f-857a-887e7484f0e1...
4. INFO nova.virt.libvirt.driver [None req-dea5d08f-a057-4ad1-9f55-907cf758a673 admin admin] [instance: e4c3e087-ee75-40a3-...
5. INFO nova.virt.libvirt.driver [None req-beedf9d2-7e75-4c0a-ae44-57bc5cc76fea admin admin] [instance: 95883e27-b0f0-41ec-...

--- CLUSTER 8 SAMPLES ---
1. INFO nova.compute.manager [req-46123093-5f3c-4ff2-b9a3-b013b39f3f26] [instance: 26fa461c-54bc-4aaf-a0c1-9dcdbc13d7a9] Du...
2. INFO nova.compute.manager [None req-46123093-5f3c-4ff2-b9a3-b013b39f3f26 None None] [instance: 92de537e-8f8c-4474-a38d-e...
3. INFO nova.compute.manager [None req-eb337492-dfed-408c

In [17]:
# Define regex patterns for broader categories with cluster context
REGEX_PATTERNS = {
    # Cluster 1: LibVirt Driver Operations
    "System_Operations_LibVirt": [
        r"INFO nova\.virt\.libvirt\.driver \[.*?\] \[instance: [a-f0-9\-]+\] .*",
        r"INFO nova\.virt\.libvirt\.driver \[req-[a-f0-9\-]+ .*?\] \[instance: [a-f0-9\-]+\] .*"
    ],
    
    # Clusters 8, 18, 2: Compute Manager Operations  
    "Instance_Management_Compute": [
        r"INFO nova\.compute\.manager \[req-[a-f0-9\-]+ .*?\] \[instance: [a-f0-9\-]+\] .*",
        r"INFO nova\.compute\.manager \[.*?\] \[instance: [a-f0-9\-]+\] VM (Started|Stopped|Paused|Resumed|Suspended).*",
        r"INFO nova\.compute\.manager \[.*?\] \[instance: [a-f0-9\-]+\] Took [0-9\.]+ seconds.*"
    ],
    
    # Cluster 4: Compute Manager (None req pattern)
    "Instance_Management_System": [
        r"INFO nova\.compute\.manager \[None req-[a-f0-9\-]+ .*?\] \[instance: [a-f0-9\-]+\] .*"
    ]
}

print("regex patterns for broader categories:")
for category, patterns in REGEX_PATTERNS.items():
    print(f"\n{category}:")
    for i, pattern in enumerate(patterns, 1):
        print(f"  {i}. {pattern}")


regex patterns for broader categories:

System_Operations_LibVirt:
  1. INFO nova\.virt\.libvirt\.driver \[.*?\] \[instance: [a-f0-9\-]+\] .*
  2. INFO nova\.virt\.libvirt\.driver \[req-[a-f0-9\-]+ .*?\] \[instance: [a-f0-9\-]+\] .*

Instance_Management_Compute:
  1. INFO nova\.compute\.manager \[req-[a-f0-9\-]+ .*?\] \[instance: [a-f0-9\-]+\] .*
  2. INFO nova\.compute\.manager \[.*?\] \[instance: [a-f0-9\-]+\] VM (Started|Stopped|Paused|Resumed|Suspended).*
  3. INFO nova\.compute\.manager \[.*?\] \[instance: [a-f0-9\-]+\] Took [0-9\.]+ seconds.*

Instance_Management_System:
  1. INFO nova\.compute\.manager \[None req-[a-f0-9\-]+ .*?\] \[instance: [a-f0-9\-]+\] .*


In [18]:
def test_regex_on_cluster(cluster_id, patterns_dict):
    """Test regex patterns on a specific cluster's logs"""
    cluster_logs = df[df['cluster_id'] == cluster_id]['raw_log_text'].tolist()
    
    results = {}
    for category, patterns in patterns_dict.items():
        matches = 0
        for log in cluster_logs:
            for pattern in patterns:
                if re.search(pattern, log, re.IGNORECASE):
                    matches += 1
                    break
        
        match_rate = matches / len(cluster_logs) * 100
        results[category] = {
            'matches': matches,
            'total': len(cluster_logs),
            'rate': match_rate
        }
    
    return results

# Test regex on each large cluster
print("\n=== REGEX VALIDATION ON SAMPLES ===")
for cluster_id in large_clusters:
    print(f"\n--- CLUSTER {cluster_id} VALIDATION ---")
    results = test_regex_on_cluster(cluster_id, REGEX_PATTERNS)
    
    for category, result in results.items():
        if result['rate'] > 0:
            print(f"{category}: {result['matches']}/{result['total']} ({result['rate']:.1f}%)")



=== REGEX VALIDATION ON SAMPLES ===

--- CLUSTER 1 VALIDATION ---
System_Operations_LibVirt: 5156/5268 (97.9%)

--- CLUSTER 8 VALIDATION ---
Instance_Management_System: 2237/4507 (49.6%)

--- CLUSTER 18 VALIDATION ---
Instance_Management_Compute: 4405/4405 (100.0%)
Instance_Management_System: 1534/4405 (34.8%)

--- CLUSTER 4 VALIDATION ---
Instance_Management_Compute: 3643/3643 (100.0%)
Instance_Management_System: 1551/3643 (42.6%)

--- CLUSTER 2 VALIDATION ---
Instance_Management_Compute: 3161/3285 (96.2%)
Instance_Management_System: 1536/3285 (46.8%)


In [20]:
# Based on validation, refine patterns for better coverage
REFINED_REGEX_PATTERNS = {
    "System_Operations_LibVirt": [
        r"INFO nova\.virt\.libvirt\.driver.*?\[instance: [a-f0-9\-]+\].*",
    ],
    
    "Instance_Management_Compute": [
        r"INFO nova\.compute\.manager.*?\[instance: [a-f0-9\-]+\].*",
    ],
    
    "Instance_Management_System": [
        r"INFO nova\.compute\.manager \[None req-.*?\].*?\[instance: [a-f0-9\-]+\].*",
    ]
}

print("=== REFINED REGEX PATTERNS ===")
for category, patterns in REFINED_REGEX_PATTERNS.items():
    print(f"\n{category}:")
    for pattern in patterns:
        print(f"  {pattern}")


=== REFINED REGEX PATTERNS ===

System_Operations_LibVirt:
  INFO nova\.virt\.libvirt\.driver.*?\[instance: [a-f0-9\-]+\].*

Instance_Management_Compute:
  INFO nova\.compute\.manager.*?\[instance: [a-f0-9\-]+\].*

Instance_Management_System:
  INFO nova\.compute\.manager \[None req-.*?\].*?\[instance: [a-f0-9\-]+\].*


In [22]:
def classify_with_regex(log_text, patterns_dict):
    """Apply regex patterns to classify a log"""
    for category, patterns in patterns_dict.items():
        for pattern in patterns:
            if re.search(pattern, log_text, re.IGNORECASE):
                return category, pattern
    return None, None

# Apply regex classification to all logs
print("\n=== APPLYING REGEX TO ENTIRE DATASET ===")

df['regex_label'] = None
df['regex_rule'] = None

for idx, row in df.iterrows():
    label, rule = classify_with_regex(row['raw_log_text'], REFINED_REGEX_PATTERNS)
    if label:
        df.at[idx, 'regex_label'] = label
        df.at[idx, 'regex_rule'] = rule

# Count regex classification results
regex_classified = df['regex_label'].notnull().sum()
print(f"Regex classified: {regex_classified} logs ({regex_classified/len(df)*100:.1f}%)")



=== APPLYING REGEX TO ENTIRE DATASET ===
Regex classified: 35241 logs (64.5%)


In [23]:
# Analyze regex performance by category
print("\n=== REGEX PERFORMANCE ANALYSIS ===")

regex_stats = df[df['regex_label'].notnull()]['regex_label'].value_counts()
print("Logs classified by category:")
for category, count in regex_stats.items():
    print(f"  {category}: {count} logs")

# Check coverage on target large clusters
print("\n=== COVERAGE ON TARGET CLUSTERS ===")
for cluster_id in large_clusters:
    cluster_logs = df[df['cluster_id'] == cluster_id]
    classified = cluster_logs['regex_label'].notnull().sum()
    total = len(cluster_logs)
    print(f"Cluster {cluster_id}: {classified}/{total} ({classified/total*100:.1f}%)")



=== REGEX PERFORMANCE ANALYSIS ===
Logs classified by category:
  Instance_Management_Compute: 25378 logs
  System_Operations_LibVirt: 9863 logs

=== COVERAGE ON TARGET CLUSTERS ===
Cluster 1: 5156/5268 (97.9%)
Cluster 8: 4444/4507 (98.6%)
Cluster 18: 4405/4405 (100.0%)
Cluster 4: 3643/3643 (100.0%)
Cluster 2: 3285/3285 (100.0%)


In [24]:
# Save dataset with regex classifications
df.to_csv('../data/nova_logs_with_regex.csv', index=False)
print(f"\nSaved dataset with regex classifications to 'nova_logs_with_regex.csv'")

# Summary statistics
unclassified_logs = df['regex_label'].isnull().sum()
print(f"\nSTAGE 3 SUMMARY:")
print(f"Total logs: {len(df)}")
print(f"Regex classified: {regex_classified} ({regex_classified/len(df)*100:.1f}%)")
print(f"Remaining for BERT/LLM: {unclassified_logs} ({unclassified_logs/len(df)*100:.1f}%)")



Saved dataset with regex classifications to 'nova_logs_with_regex.csv'

STAGE 3 SUMMARY:
Total logs: 54646
Regex classified: 35241 (64.5%)
Remaining for BERT/LLM: 19405 (35.5%)


In [25]:
# Show sample unclassified logs that will go to BERT/LLM stages
print("\n=== SAMPLE UNCLASSIFIED LOGS (for BERT/LLM stages) ===")
unclassified = df[df['regex_label'].isnull()]

# Group by cluster to see what's left
unclassified_by_cluster = unclassified['cluster_id'].value_counts().sort_index()
print("Unclassified logs by cluster:")
for cluster_id, count in unclassified_by_cluster.head(10).items():
    print(f"  Cluster {cluster_id}: {count} logs")

# Show samples
print("\nSample unclassified logs:")
for i, log in enumerate(unclassified['raw_log_text'].head(5), 1):
    print(f"{i}. {log[:100]}...")



=== SAMPLE UNCLASSIFIED LOGS (for BERT/LLM stages) ===
Unclassified logs by cluster:
  Cluster 0: 184 logs
  Cluster 1: 112 logs
  Cluster 3: 2967 logs
  Cluster 5: 2467 logs
  Cluster 6: 2462 logs
  Cluster 7: 29 logs
  Cluster 8: 63 logs
  Cluster 9: 1100 logs
  Cluster 12: 7 logs
  Cluster 13: 937 logs

Sample unclassified logs:
2. ERROR nova.compute.manager [instance: c265f382-e5d8-44fb-98c8-84abd4592037]     self.force_reraise()...
3. INFO os_vif [None req-7c6fa9c3-a70a-42f3-bc33-96544dea14ed admin admin] Successfully unplugged vif V...
4. <entry name='serial'>f41265c7-0cc0-4212-8ab4-89626d362895</entry>...
5. INFO nova.scheduler.client.report [req-a322d5f4-1dcb-4709-9348-af4334ee24dc] Deleted allocation for ...
