In [10]:
import pandas as pd
import json
import time
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Load environment variables
from dotenv import load_dotenv
load_dotenv()


False

In [None]:
import os
from dotenv import load_dotenv

dotenv_path = os.path.abspath(os.path.join(os.getcwd(), '../.env'))
print(f"Loading .env from: {dotenv_path}")

load_dotenv(dotenv_path)

api_key = os.getenv('GROQ_API_KEY')
if api_key and api_key != 'your_groq_api_key_here':
    print("Groq API key loaded successfully")
else:
    print("Please set your GROQ_API_KEY in the .env file")


python-dotenv could not parse statement starting at line 1


Loading .env from: /Users/kxshrx/dev/log-classification/.env
Groq API key loaded successfully


In [27]:
from langchain_groq import ChatGroq

# Initialize the chat LLM
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0.0,
    max_retries=2,
)

# Prepare chat messages as list of tuples (role, content)
messages = [
    ("system", "You are a helpful translator. Translate the user sentence to French."),
    ("human", "I love programming."),
]

# Invoke the model
response = llm.invoke(messages)

print("Response:", response.content)


Response: The translation of "I love programming" to French is:

"J'adore le programmation."


In [None]:
# Load your BERT-processed dataset
df = pd.read_csv('../results/nova_logs_with_bert.csv')

unclassified_logs = df[
    (df['regex_label'].isnull()) & 
    (df['bert_label'].isnull())
].copy()

print(f"Data Analysis:")
print(f"Total logs: {len(df)}")
print(f"Regex classified: {df['regex_label'].notnull().sum()}")
print(f"BERT classified: {df['bert_label'].notnull().sum()}")
print(f"Remaining for LLM: {len(unclassified_logs)}")

print(f"\nSample logs for LLM classification:")
for i, log in enumerate(unclassified_logs['raw_log_text'].head(3), 1):
    print(f"{i}. {log[:100]}...")


Data Analysis:
Total logs: 61950
Regex classified: 36537
BERT classified: 14166
Remaining for LLM: 14972

Sample logs for LLM classification:
2. ERROR nova.compute.manager [instance: c265f382-e5d8-44fb-98c8-84abd4592037]     self.force_reraise()...
3. <entry name='serial'>f41265c7-0cc0-4212-8ab4-89626d362895</entry>...


In [None]:
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage, SystemMessage
from langchain.prompts import PromptTemplate

try:
    llm = ChatGroq(
        groq_api_key=os.getenv('GROQ_API_KEY'),
        model_name=os.getenv('GROQ_MODEL', 'llama-3.3-70b-versatile'),
        temperature=0.3,
        max_tokens=200
    )
    
    print("LangChain Groq client initialized successfully")
    print(f"Using model: {os.getenv('GROQ_MODEL', 'llama-3.3-70b-versatile')}")
    
    test_response = llm.invoke([HumanMessage(content="Hello, respond with 'Connection successful'")])
    print(f"Connection test: {test_response.content}")
    
except Exception as e:
    print(f"Error initializing LangChain Groq client: {e}")
    print("Please check your API key in .env file")


LangChain Groq client initialized successfully
Using model: llama-3.3-70b-versatile
Connection test: Connection successful


# Define Pydantic Model

In [31]:
from pydantic import BaseModel, Field

# Define Pydantic output structure
class LogClassification(BaseModel):
    category: str = Field(..., description="The classification category")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence score between 0 and 1")
    reasoning: str = Field(..., description="Brief explanation for the classification")

print("Pydantic LogClassification model defined")
print(f"Model fields: {list(LogClassification.model_fields.keys())}")


Pydantic LogClassification model defined
Model fields: ['category', 'confidence', 'reasoning']


In [37]:
def get_examples_by_category(df):
    """Get real examples from your dataset by category"""
    
    # Get regex examples
    regex_examples = df[df['regex_label'].notnull()]
    bert_examples = df[df['bert_label'].notnull()]
    
    category_examples = {
        'System_Operations': [],
        'Instance_Management': [],
        'Network_Operations': [],
        'Resource_Management': [],
        'Scheduler_Operations': [],
        'Error_Handling': []
    }
    
    # Add regex examples
    for _, row in regex_examples.head(100).iterrows():  # Limit for speed
        if 'System_Operations' in str(row['regex_label']):
            category_examples['System_Operations'].append(row['raw_log_text'])
        elif 'Instance_Management' in str(row['regex_label']):
            category_examples['Instance_Management'].append(row['raw_log_text'])
    
    # Add BERT examples
    for _, row in bert_examples.head(100).iterrows():  # Limit for speed
        category = row['bert_label']
        if category in category_examples:
            category_examples[category].append(row['raw_log_text'])
    
    return category_examples

# Extract examples
examples_by_category = get_examples_by_category(df)

print("Examples extracted by category")
for category, examples in examples_by_category.items():
    print(f"  {category}: {len(examples)} examples")


Examples extracted by category
  System_Operations: 27 examples
  Instance_Management: 75 examples
  Network_Operations: 68 examples
  Resource_Management: 0 examples
  Scheduler_Operations: 30 examples
  Error_Handling: 0 examples


In [42]:
import random

def select_random_examples(category_examples, n_per_category=2):
    """Randomly select examples for each category"""
    
    selected = {}
    
    for category, examples in category_examples.items():
        if len(examples) > 0:
            # Select random examples
            n_select = min(n_per_category, len(examples))
            selected[category] = random.sample(examples, n_select)
        else:
            # Fallback if no examples
            fallback = {
                'System_Operations': ["INFO nova.virt.libvirt.driver [req-abc] Creating image"],
                'Instance_Management': ["INFO nova.compute.manager [req-def] VM Started"],
                'Network_Operations': ["INFO os_vif [req-ghi] Successfully plugged vif"],
                'Resource_Management': ["INFO nova.compute.claims [req-jkl] Claim successful"],
                'Scheduler_Operations': ["INFO nova.scheduler.client.report [req-mno] Deleted allocation"],
                'Error_Handling': ["ERROR nova.compute.manager [instance: xyz] File not found"]
            }
            selected[category] = [fallback[category][0]]
    
    return selected

# Select random examples
selected_examples = select_random_examples(examples_by_category, n_per_category=2)

print("Random examples selected")
for category, examples in selected_examples.items():
    print(f"  {category}: {len(examples)} selected")
    if examples:
        print(f"    Sample: {examples[0][:60]}...")


Random examples selected
  System_Operations: 2 selected
    Sample: INFO nova.virt.libvirt.driver [req-c081868d-4495-4a56-adaa-3...
  Instance_Management: 2 selected
    Sample: INFO nova.compute.manager [None req-77b899cc-1990-4518-8ca6-...
  Network_Operations: 2 selected
    Sample: INFO os_vif [req-2b352c90-31b9-440d-8dfd-83a797a95e41] Succe...
  Resource_Management: 1 selected
    Sample: INFO nova.compute.claims [req-jkl] Claim successful...
  Scheduler_Operations: 2 selected
    Sample: ERROR nova.compute.manager [instance: 80cd044d-b8f2-4dab-b6b...
  Error_Handling: 1 selected
    Sample: ERROR nova.compute.manager [instance: xyz] File not found...


In [43]:
def format_examples_for_prompt(selected_examples):
    """Format examples into prompt text"""
    
    examples_text = []
    
    for category, example_logs in selected_examples.items():
        for log in example_logs:
            # Truncate long logs
            truncated = log[:120] + "..." if len(log) > 120 else log
            examples_text.append(f"{category}: \"{truncated}\"")
    
    return "\n".join(examples_text)

# Format examples
formatted_examples = format_examples_for_prompt(selected_examples)

print("Examples formatted for prompt")
print(f"Formatted examples length: {len(formatted_examples)} characters")
print(f"\nSample formatted examples:")
print(formatted_examples[:300] + "...")


Examples formatted for prompt
Formatted examples length: 1237 characters

Sample formatted examples:
System_Operations: "INFO nova.virt.libvirt.driver [req-c081868d-4495-4a56-adaa-349c1e09b072] [instance: 56b8e26a-1ce7-460d-a6e5-d60f2ffbf7fc..."
System_Operations: "INFO nova.virt.libvirt.driver [req-3a04b6a9-5a46-409e-ab95-cfbea20059aa] [instance: ba40f9c5-a0d8-4922-9e3a-8d4904ce797c..."
Instance_M...


In [44]:
from langchain.prompts import PromptTemplate

def create_prompt_with_examples(formatted_examples):
    """Create prompt template with dynamic examples"""
    
    template = f"""You are an expert OpenStack log analyst. Classify the following log message into one of these categories:

CATEGORIES:
1. System_Operations - LibVirt driver operations, system-level tasks
2. Instance_Management - VM lifecycle, instance operations  
3. Network_Operations - VIF operations, network connectivity
4. Resource_Management - Compute claims, resource allocation
5. Scheduler_Operations - Nova scheduler activities, allocation reports
6. Error_Handling - Error conditions, failures, exceptions

REAL EXAMPLES FROM YOUR DATASET:
{formatted_examples}

INSTRUCTIONS:
- Analyze the log message components (service, action, context)
- Consider the technical operation being performed
- Focus on the primary purpose/function
- Provide confidence between 0.0 and 1.0

LOG MESSAGE: {{log_message}}

Respond in valid JSON format:
{{{{
  "category": "category_name",
  "confidence": 0.85,
  "reasoning": "brief explanation"
}}}}"""
    
    return template

# Create dynamic template
dynamic_template = create_prompt_with_examples(formatted_examples)

# Create LangChain PromptTemplate
prompt = PromptTemplate(
    input_variables=["log_message"],
    template=dynamic_template
)

print("Dynamic prompt template created")
print(f"Template length: {len(dynamic_template)} characters")


Dynamic prompt template created
Template length: 2137 characters


In [49]:
print(prompt)

input_variables=['log_message'] input_types={} partial_variables={} template='You are an expert OpenStack log analyst. Classify the following log message into one of these categories:\n\nCATEGORIES:\n1. System_Operations - LibVirt driver operations, system-level tasks\n2. Instance_Management - VM lifecycle, instance operations  \n3. Network_Operations - VIF operations, network connectivity\n4. Resource_Management - Compute claims, resource allocation\n5. Scheduler_Operations - Nova scheduler activities, allocation reports\n6. Error_Handling - Error conditions, failures, exceptions\n\nREAL EXAMPLES FROM YOUR DATASET:\nSystem_Operations: "INFO nova.virt.libvirt.driver [req-c081868d-4495-4a56-adaa-349c1e09b072] [instance: 56b8e26a-1ce7-460d-a6e5-d60f2ffbf7fc..."\nSystem_Operations: "INFO nova.virt.libvirt.driver [req-3a04b6a9-5a46-409e-ab95-cfbea20059aa] [instance: ba40f9c5-a0d8-4922-9e3a-8d4904ce797c..."\nInstance_Management: "INFO nova.compute.manager [None req-77b899cc-1990-4518-8ca6-4

In [50]:
# Test the dynamic template
test_log = "INFO nova.compute.manager [req-test] [instance: test-id] Testing message"

# Format the prompt
formatted_prompt = prompt.format(log_message=test_log)

print("Template test completed")
print(f"Formatted prompt preview:")
print(formatted_prompt[:400] + "...")
print(f"\nPrompt stats:")
print(f"  Total length: {len(formatted_prompt)} characters")
print(f"  Contains examples: {'REAL EXAMPLES' in formatted_prompt}")
print(f"  Contains categories: {'CATEGORIES:' in formatted_prompt}")


Template test completed
Formatted prompt preview:
You are an expert OpenStack log analyst. Classify the following log message into one of these categories:

CATEGORIES:
1. System_Operations - LibVirt driver operations, system-level tasks
2. Instance_Management - VM lifecycle, instance operations  
3. Network_Operations - VIF operations, network connectivity
4. Resource_Management - Compute claims, resource allocation
5. Scheduler_Operations - Nov...

Prompt stats:
  Total length: 2194 characters
  Contains examples: True
  Contains categories: True


# Create Pydantic Classification Function

In [52]:
import json
from langchain.schema import HumanMessage

def classify_log_with_pydantic(log_message: str, llm, prompt_template) -> LogClassification:
    """Classify a single log using LangChain with Pydantic output"""
    try:
        # Format the prompt
        formatted_prompt = prompt_template.format(log_message=log_message)
        
        # Create message
        messages = [HumanMessage(content=formatted_prompt)]
        
        # Get response from LLM
        response = llm.invoke(messages)
        response_text = response.content.strip()
        
        print(f"Raw LLM response: {response_text[:100]}...")
        
        return response_text
        
    except Exception as e:
        print(f"Error in LLM call: {e}")
        return None

print("Basic classification function created")


Basic classification function created


In [53]:
def parse_llm_response(response_text: str) -> dict:
    """Parse LLM response to extract JSON"""
    try:
        # Try direct JSON parsing
        result = json.loads(response_text)
        print("Direct JSON parsing successful")
        return result
        
    except json.JSONDecodeError:
        # Try to extract JSON from response
        import re
        json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
        
        if json_match:
            json_str = json_match.group()
            try:
                result = json.loads(json_str)
                print("Extracted JSON parsing successful")
                return result
            except json.JSONDecodeError:
                print("Extracted JSON parsing failed")
                return None
        else:
            print("No JSON found in response")
            return None

# Test JSON parsing
test_json = '{"category": "System_Operations", "confidence": 0.85, "reasoning": "Test"}'
parsed = parse_llm_response(test_json)
print(f"Test parsing result: {parsed}")


Direct JSON parsing successful
Test parsing result: {'category': 'System_Operations', 'confidence': 0.85, 'reasoning': 'Test'}


In [54]:
def create_pydantic_result(json_data: dict) -> LogClassification:
    """Create Pydantic object from JSON data"""
    try:
        # Validate required fields
        required_fields = ['category', 'confidence', 'reasoning']
        if not all(field in json_data for field in required_fields):
            missing = [f for f in required_fields if f not in json_data]
            raise ValueError(f"Missing fields: {missing}")
        
        # Create Pydantic object
        result = LogClassification(**json_data)
        print(f"Pydantic object created: {result.category}")
        return result
        
    except Exception as e:
        print(f"Pydantic creation failed: {e}")
        # Return default object
        return LogClassification(
            category="Unknown",
            confidence=0.0,
            reasoning=f"Pydantic error: {str(e)}"
        )

# Test Pydantic creation
test_data = {"category": "System_Operations", "confidence": 0.85, "reasoning": "Test reasoning"}
pydantic_obj = create_pydantic_result(test_data)
print(f"Test Pydantic object: {pydantic_obj}")


Pydantic object created: System_Operations
Test Pydantic object: category='System_Operations' confidence=0.85 reasoning='Test reasoning'


In [55]:
def classify_single_log_complete(log_message: str, llm, prompt_template) -> LogClassification:
    """Complete classification function with all steps"""
    
    # Step 1: Get LLM response
    response_text = classify_log_with_pydantic(log_message, llm, prompt_template)
    if response_text is None:
        return LogClassification(
            category="Error",
            confidence=0.0,
            reasoning="LLM call failed"
        )
    
    # Step 2: Parse JSON
    json_data = parse_llm_response(response_text)
    if json_data is None:
        return LogClassification(
            category="Unknown",
            confidence=0.0,
            reasoning="JSON parsing failed"
        )
    
    # Step 3: Create Pydantic object
    result = create_pydantic_result(json_data)
    return result

print("Complete classification function created")


Complete classification function created


In [56]:
# Test with a real log
test_log = unclassified_logs.iloc[0]['raw_log_text']
print(f"Testing complete function with log:")
print(f"   {test_log[:80]}...")

# Run complete classification
result = classify_single_log_complete(test_log, llm, prompt)

print(f"\nComplete test result:")
print(f"   Category: {result.category}")
print(f"   Confidence: {result.confidence}")
print(f"   Reasoning: {result.reasoning}")
print(f"   Type: {type(result)}")


Testing complete function with log:
Raw LLM response: ```json
{
  "category": "Error_Handling",
  "confidence": 0.85,
  "reasoning": "The log message indi...
Extracted JSON parsing successful
Pydantic object created: Error_Handling

Complete test result:
   Category: Error_Handling
   Confidence: 0.85
   Type: <class '__main__.LogClassification'>


# Create Batch Processing Function

In [58]:
import time

def process_batch_of_logs(logs_batch, llm, prompt_template, delay=0.1):
    """Process a small batch of logs"""
    results = []
    
    print(f"Processing batch of {len(logs_batch)} logs...")
    
    for idx, log_message in enumerate(logs_batch):
        print(f"  Log {idx+1}/{len(logs_batch)}", end='\r')
        
        # Classify single log
        result = classify_single_log_complete(log_message, llm, prompt_template)
        results.append(result)
        
        # Rate limiting
        time.sleep(delay)
    
    print(f"\nBatch processing completed!")
    return results

print("Batch processing function created")


Batch processing function created


In [59]:
# Test with 3 logs first
test_batch_size = 3
test_logs_batch = unclassified_logs.head(test_batch_size)['raw_log_text'].tolist()

print(f"Testing batch processing with {test_batch_size} logs...")
print("This will take about 30-60 seconds...")

# Process small test batch
batch_results = process_batch_of_logs(
    test_logs_batch, 
    llm, 
    prompt,
    delay=0.2  # Slower for testing
)

print(f"\nBatch results summary:")
for i, result in enumerate(batch_results):
    print(f"  {i+1}. {result.category} (conf: {result.confidence})")


Testing batch processing with 3 logs...
This will take about 30-60 seconds...
Processing batch of 3 logs...
Raw LLM response: ```json
{
  "category": "Error_Handling",
  "confidence": 0.85,
  "reasoning": "The log message indi...
Extracted JSON parsing successful
Pydantic object created: Error_Handling
Raw LLM response: ```json
{
  "category": "Error_Handling",
  "confidence": 0.9,
  "reasoning": "The log message conta...
Extracted JSON parsing successful
Pydantic object created: Error_Handling
Raw LLM response: ```
{
  "category": "Error_Handling",
  "confidence": 0.6,
  "reasoning": "The log message does not ...
Extracted JSON parsing successful
Pydantic object created: Error_Handling

Batch processing completed!

Batch results summary:
  1. Error_Handling (conf: 0.85)
  2. Error_Handling (conf: 0.9)
  3. Error_Handling (conf: 0.6)


In [60]:
def results_to_dataframe(results, original_indices):
    """Convert Pydantic results to DataFrame"""
    
    data = []
    for i, result in enumerate(results):
        data.append({
            'original_index': original_indices[i],
            'llm_category': result.category,
            'llm_confidence': result.confidence,
            'llm_reasoning': result.reasoning
        })
    
    return pd.DataFrame(data)

# Convert test results
test_indices = unclassified_logs.head(test_batch_size).index.tolist()
results_df = results_to_dataframe(batch_results, test_indices)

print("Results converted to DataFrame")
print(f"Results DataFrame shape: {results_df.shape}")
print(f"\nSample results:")
print(results_df[['llm_category', 'llm_confidence']].head())


Results converted to DataFrame
Results DataFrame shape: (3, 4)

Sample results:
     llm_category  llm_confidence
0  Error_Handling            0.85
1  Error_Handling            0.90
2  Error_Handling            0.60


In [61]:
def analyze_batch_results(results):
    """Analyze batch classification results"""
    
    categories = [r.category for r in results]
    confidences = [r.confidence for r in results]
    
    print(f"Batch Analysis:")
    print(f"  Total processed: {len(results)}")
    
    # Category distribution
    from collections import Counter
    category_counts = Counter(categories)
    print(f"\nCategories found:")
    for category, count in category_counts.items():
        print(f"  {category}: {count}")
    
    # Confidence stats
    if confidences:
        avg_conf = sum(confidences) / len(confidences)
        high_conf = len([c for c in confidences if c >= 0.7])
        print(f"\nConfidence stats:")
        print(f"  Average: {avg_conf:.3f}")
        print(f"  High confidence (≥0.7): {high_conf}/{len(confidences)}")
    
    # Show sample reasoning
    print(f"\nSample reasoning:")
    for i, result in enumerate(results[:2]):
        print(f"  {i+1}. {result.reasoning[:60]}...")

# Analyze test batch
analyze_batch_results(batch_results)


Batch Analysis:
  Total processed: 3

Categories found:
  Error_Handling: 3

Confidence stats:
  Average: 0.783
  High confidence (≥0.7): 2/3

Sample reasoning:
  2. The log message contains an ERROR level notification from no...


# Apply Confidence Threshold

In [63]:
def filter_by_confidence(results_df, confidence_threshold=0.7):
    """Filter results by confidence threshold"""
    
    high_confidence = results_df[results_df['llm_confidence'] >= confidence_threshold]
    low_confidence = results_df[results_df['llm_confidence'] < confidence_threshold]
    
    print(f"Confidence filtering (threshold: {confidence_threshold}):")
    print(f"  High confidence: {len(high_confidence)}")
    print(f"  Low confidence: {len(low_confidence)}")
    
    return high_confidence, low_confidence

# Apply confidence filtering to test results
high_conf_results, low_conf_results = filter_by_confidence(results_df, confidence_threshold=0.7)

print(f"\nHigh confidence results:")
print(high_conf_results[['llm_category', 'llm_confidence']])

if len(low_conf_results) > 0:
    print(f"\n⚠️  Low confidence results:")
    print(low_conf_results[['llm_category', 'llm_confidence']])


Confidence filtering (threshold: 0.7):
  High confidence: 2
  Low confidence: 1

High confidence results:
     llm_category  llm_confidence
0  Error_Handling            0.85
1  Error_Handling            0.90

⚠️  Low confidence results:
     llm_category  llm_confidence
2  Error_Handling             0.6


In [64]:
# Process a larger batch (20 logs)
larger_batch_size = 20
larger_test_logs = unclassified_logs.head(larger_batch_size)['raw_log_text'].tolist()

print(f"Processing larger batch of {larger_batch_size} logs...")
print("This will take about 3-5 minutes...")
print("⏳ Please wait...")

# Process larger batch
larger_batch_results = process_batch_of_logs(
    larger_test_logs, 
    llm, 
    prompt,
    delay=0.15  # Slightly faster
)

print(f"\nLarger batch completed!")
print(f"Processed {len(larger_batch_results)} logs")


Processing larger batch of 20 logs...
This will take about 3-5 minutes...
⏳ Please wait...
Processing batch of 20 logs...
Raw LLM response: ```json
{
  "category": "Error_Handling",
  "confidence": 0.85,
  "reasoning": "The log message indi...
Extracted JSON parsing successful
Pydantic object created: Error_Handling
Raw LLM response: ```json
{
  "category": "Error_Handling",
  "confidence": 0.95,
  "reasoning": "The log message cont...
Extracted JSON parsing successful
Pydantic object created: Error_Handling
Raw LLM response: ```
{
  "category": "Error_Handling",
  "confidence": 0.6,
  "reasoning": "The log message does not ...
Extracted JSON parsing successful
Pydantic object created: Error_Handling
Raw LLM response: ```json
{
  "category": "Error_Handling",
  "confidence": 0.9,
  "reasoning": "The log message conta...
Extracted JSON parsing successful
Pydantic object created: Error_Handling
Raw LLM response: ```json
{
  "category": "Error_Handling",
  "confidence": 0.95,
  "reasoning

In [65]:
# Analyze larger batch results
analyze_batch_results(larger_batch_results)

# Convert to DataFrame
larger_indices = unclassified_logs.head(larger_batch_size).index.tolist()
larger_results_df = results_to_dataframe(larger_batch_results, larger_indices)

# Apply confidence filtering
high_conf_larger, low_conf_larger = filter_by_confidence(larger_results_df, confidence_threshold=0.7)

print(f"\nLarger batch summary:")
print(f"  Total processed: {len(larger_results_df)}")
print(f"  High confidence: {len(high_conf_larger)} ({len(high_conf_larger)/len(larger_results_df)*100:.1f}%)")
print(f"  Low confidence: {len(low_conf_larger)} ({len(low_conf_larger)/len(larger_results_df)*100:.1f}%)")


Batch Analysis:
  Total processed: 20

Categories found:
  Error_Handling: 18
  Resource_Management: 2

Confidence stats:
  Average: 0.812
  High confidence (≥0.7): 16/20

Sample reasoning:
  2. The log message contains an 'ERROR' level notification from ...
Confidence filtering (threshold: 0.7):
  High confidence: 16
  Low confidence: 4

Larger batch summary:
  Total processed: 20
  High confidence: 16 (80.0%)
  Low confidence: 4 (20.0%)


In [67]:
# Estimate time and cost for full dataset
total_unclassified = len(unclassified_logs)
avg_time_per_log = 0.15 + 0.2  # delay + processing time
estimated_time_minutes = (total_unclassified * avg_time_per_log) / 60

print(f"Full dataset processing estimates:")
print(f"  Total unclassified logs: {total_unclassified:,}")
print(f"  Estimated time: {estimated_time_minutes:.1f} minutes ({estimated_time_minutes/60:.1f} hours)")
print(f"  Estimated API calls: {total_unclassified:,}")

# Based on larger batch performance
if len(larger_batch_results) > 0:
    high_conf_rate = len(high_conf_larger) / len(larger_results_df)
    estimated_high_conf = int(total_unclassified * high_conf_rate)
    estimated_low_conf = total_unclassified - estimated_high_conf
    
    print(f"\nEstimated final results:")
    print(f"  High confidence classifications: {estimated_high_conf:,}")
    print(f"  Low confidence (manual review): {estimated_low_conf:,}")
    
    # Final pipeline estimate
    regex_count = df['regex_label'].notnull().sum()
    bert_count = df['bert_label'].notnull().sum()
    
    print(f"\nComplete pipeline estimate:")
    print(f"  Regex classified: {regex_count:,} ({regex_count/len(df)*100:.1f}%)")
    print(f"  BERT classified: {bert_count:,} ({bert_count/len(df)*100:.1f}%)")
    print(f"  LLM classified: {estimated_high_conf:,} ({estimated_high_conf/len(df)*100:.1f}%)")
    print(f"  Manual review: {estimated_low_conf:,} ({estimated_low_conf/len(df)*100:.1f}%)")
    
    total_automated = regex_count + bert_count + estimated_high_conf
    print(f"  Total automated: {total_automated:,} ({total_automated/len(df)*100:.1f}%)")


Full dataset processing estimates:
  Total unclassified logs: 14,972
  Estimated time: 87.3 minutes (1.5 hours)
  Estimated API calls: 14,972

Estimated final results:
  High confidence classifications: 11,977
  Low confidence (manual review): 2,995

Complete pipeline estimate:
  Regex classified: 36,537 (59.0%)
  BERT classified: 14,166 (22.9%)
  LLM classified: 11,977 (19.3%)
  Manual review: 2,995 (4.8%)
  Total automated: 62,680 (101.2%)


# Create Error Subcategories

In [68]:
# Enhanced categories with error subcategories
ENHANCED_CATEGORIES = {
    # System Operations
    'System_Operations': 'LibVirt driver operations, system-level tasks',
    'Instance_Management': 'VM lifecycle, instance operations',
    'Network_Operations': 'VIF operations, network connectivity', 
    'Resource_Management': 'Compute claims, resource allocation',
    'Scheduler_Operations': 'Nova scheduler activities, allocation reports',
    
    # Error Subcategories (instead of generic Error_Handling)
    'Boot_Timeout_Errors': 'VM boot timeouts, startup failures',
    'Network_Connection_Errors': 'VIF connection failures, network issues',
    'Resource_Allocation_Errors': 'Memory/CPU allocation failures, resource exhaustion',
    'File_System_Errors': 'File not found, permission errors, I/O failures',
    'Configuration_Errors': 'Invalid config, missing parameters, setup issues',
    'Service_Communication_Errors': 'API timeouts, service unavailable, connection refused'
}

print("Enhanced categories with error subcategories defined")
print(f"Total categories: {len(ENHANCED_CATEGORIES)}")
for category, description in ENHANCED_CATEGORIES.items():
    print(f"  {category}: {description}")


Enhanced categories with error subcategories defined
Total categories: 11
  System_Operations: LibVirt driver operations, system-level tasks
  Instance_Management: VM lifecycle, instance operations
  Network_Operations: VIF operations, network connectivity
  Resource_Management: Compute claims, resource allocation
  Scheduler_Operations: Nova scheduler activities, allocation reports
  Boot_Timeout_Errors: VM boot timeouts, startup failures
  Network_Connection_Errors: VIF connection failures, network issues
  Resource_Allocation_Errors: Memory/CPU allocation failures, resource exhaustion
  File_System_Errors: File not found, permission errors, I/O failures
  Configuration_Errors: Invalid config, missing parameters, setup issues
  Service_Communication_Errors: API timeouts, service unavailable, connection refused


In [69]:
def create_error_subcategory_prompt(formatted_examples):
    """Create prompt with specific error subcategories"""
    
    subcategory_template = f"""You are an expert OpenStack log analyst. Classify the following log message into the MOST SPECIFIC category that matches the error type or operation.

CATEGORIES:

SYSTEM OPERATIONS:
1. System_Operations - LibVirt driver operations, system-level tasks
2. Instance_Management - VM lifecycle (start/stop/pause), instance state changes  
3. Network_Operations - VIF operations, network setup/teardown
4. Resource_Management - Compute claims, resource allocation
5. Scheduler_Operations - Nova scheduler activities, placement decisions

ERROR SUBCATEGORIES (Be Specific!):
6. Boot_Timeout_Errors - VM boot timeouts, startup failures, "_wait_for_boot" issues
7. Network_Connection_Errors - VIF connection failures, network setup issues
8. Resource_Allocation_Errors - Memory/CPU allocation failures, resource exhaustion
9. File_System_Errors - File not found, permission errors, I/O failures  
10. Configuration_Errors - Invalid config, missing parameters, setup issues
11. Service_Communication_Errors - API timeouts, service unavailable, connection refused

CLASSIFICATION RULES:
- For WARNING/ERROR logs: Choose the MOST SPECIFIC error subcategory
- For INFO logs: Choose the appropriate system operation category
- Look for keywords: "timeout" → Boot_Timeout_Errors, "file" → File_System_Errors
- Focus on the ROOT CAUSE, not just that it's an error

REAL EXAMPLES:
{formatted_examples}

INSTRUCTIONS:
- Read the ENTIRE log message
- Identify the specific type of error or operation
- Choose the MOST SPECIFIC category that fits
- Provide confidence 0.6-1.0 (be realistic)

LOG MESSAGE: {{log_message}}

Respond in JSON format:
{{{{
  "category": "specific_category_name",
  "confidence": 0.85,
  "reasoning": "explanation focusing on why this specific subcategory"
}}}}"""
    
    return subcategory_template

# Create enhanced prompt
enhanced_examples = format_examples_for_prompt(selected_examples)
subcategory_prompt_template = create_error_subcategory_prompt(enhanced_examples)
subcategory_prompt = PromptTemplate(
    input_variables=["log_message"],
    template=subcategory_prompt_template
)

print("Error subcategory prompt created")


Error subcategory prompt created


In [70]:
# Test with your existing logs to see subcategory distribution
test_subcategory_size = 10
test_subcategory_logs = unclassified_logs.head(test_subcategory_size)['raw_log_text'].tolist()

print(f"Testing error subcategory classification with {test_subcategory_size} logs...")

# Process with subcategory prompt
subcategory_results = process_batch_of_logs(
    test_subcategory_logs, 
    llm, 
    subcategory_prompt,
    delay=0.15
)

print(f"\nSubcategory results:")
categories = [r.category for r in subcategory_results]
from collections import Counter
subcategory_dist = Counter(categories)

for category, count in subcategory_dist.items():
    print(f"  {category}: {count} ({count/len(subcategory_results)*100:.1f}%)")

# Show sample classifications
print(f"\nSample subcategory classifications:")
for i, result in enumerate(subcategory_results[:3]):
    print(f"  {i+1}. {result.category} (conf: {result.confidence})")
    print(f"     Reasoning: {result.reasoning[:80]}...")


Testing error subcategory classification with 10 logs...
Processing batch of 10 logs...
Raw LLM response: ```json
{
  "category": "Boot_Timeout_Errors",
  "confidence": 0.9,
  "reasoning": "The log message ...
Extracted JSON parsing successful
Pydantic object created: Boot_Timeout_Errors
Raw LLM response: ```
{
  "category": "Service_Communication_Errors",
  "confidence": 0.8,
  "reasoning": "The log mes...
Extracted JSON parsing successful
Pydantic object created: Service_Communication_Errors
Raw LLM response: ```
{
  "category": "System_Operations",
  "confidence": 0.6,
  "reasoning": "The log message provid...
Extracted JSON parsing successful
Pydantic object created: System_Operations
Raw LLM response: ```json
{
  "category": "Service_Communication_Errors",
  "confidence": 0.8,
  "reasoning": "The log...
Extracted JSON parsing successful
Pydantic object created: Service_Communication_Errors
Raw LLM response: ```json
{
  "category": "Resource_Allocation_Errors",
  "confidence": 0.85

In [80]:
# Compare the distribution improvement
print(f"Classification Distribution Comparison:")
print(f"\n🔴 BEFORE (Generic Error_Handling):")
print(f"  Error_Handling: 18/20 (90%)")
print(f"  Resource_Management: 2/20 (10%)")
print(f"  Other categories: 0/20 (0%)")

print(f"\n🟢 AFTER (Error Subcategories):")
for category, count in subcategory_dist.items():
    print(f"  {category}: {count}/{len(subcategory_results)} ({count/len(subcategory_results)*100:.1f}%)")

# Calculate diversity improvement
before_categories = 2  # Only Error_Handling + Resource_Management
after_categories = len(subcategory_dist)
diversity_improvement = (after_categories - before_categories) / before_categories * 100

print(f"\nImprovement Metrics:")
print(f"  Category diversity: +{diversity_improvement:.0f}% ({before_categories} → {after_categories} categories)")
print(f"  Specificity: Much higher (specific error types vs generic 'error')")
print(f"  Actionability: Better (teams know exact error type)")


Classification Distribution Comparison:

🔴 BEFORE (Generic Error_Handling):
  Error_Handling: 18/20 (90%)
  Resource_Management: 2/20 (10%)
  Other categories: 0/20 (0%)

🟢 AFTER (Error Subcategories):
  Boot_Timeout_Errors: 1/10 (10.0%)
  Service_Communication_Errors: 2/10 (20.0%)
  System_Operations: 2/10 (20.0%)
  Resource_Allocation_Errors: 1/10 (10.0%)
  File_System_Errors: 2/10 (20.0%)
  Instance_Management: 1/10 (10.0%)
  Network_Connection_Errors: 1/10 (10.0%)

Improvement Metrics:
  Category diversity: +250% (2 → 7 categories)
  Specificity: Much higher (specific error types vs generic 'error')
  Actionability: Better (teams know exact error type)


# Scale to Full Dataset

In [None]:
# Process a substantial sample to validate consistency
large_sample_size = 500
large_sample_logs = unclassified_logs.head(large_sample_size)['raw_log_text'].tolist()

print(f"Processing large sample of {large_sample_size} logs with error subcategories...")
print("⏳ This will take about 15-20 minutes...")
print("This will give us reliable distribution statistics")

# Process large sample
large_subcategory_results = process_batch_of_logs(
    large_sample_logs, 
    llm, 
    subcategory_prompt,
    delay=0.12  # Optimized speed
)

print(f"\nLarge sample processing completed!")


Processing large sample of 500 logs with error subcategories...
⏳ This will take about 15-20 minutes...
This will give us reliable distribution statistics
Processing batch of 500 logs...
Raw LLM response: ```json
{
  "category": "Boot_Timeout_Errors",
  "confidence": 0.9,
  "reasoning": "The log message ...
Extracted JSON parsing successful
Pydantic object created: Boot_Timeout_Errors
Raw LLM response: {
  "category": "Service_Communication_Errors",
  "confidence": 0.8,
  "reasoning": "The log message...
Direct JSON parsing successful
Pydantic object created: Service_Communication_Errors
Raw LLM response: ```
{
  "category": "System_Operations",
  "confidence": 0.6,
  "reasoning": "The log message provid...
Extracted JSON parsing successful
Pydantic object created: System_Operations
Raw LLM response: ```json
{
  "category": "Service_Communication_Errors",
  "confidence": 0.8,
  "reasoning": "The log...
Extracted JSON parsing successful
Pydantic object created: Service_Communication_Err

In [75]:
# Analyze the results you already have
successful_results = [r for r in large_subcategory_results if hasattr(r, 'category') and r.category != 'Error']

print(f"Successfully processed: {len(successful_results)} logs")

# Analyze distribution
categories = [r.category for r in successful_results]
from collections import Counter
distribution = Counter(categories)

print("Subcategory Distribution from 130 logs:")
for category, count in distribution.items():
    print(f"  {category}: {count} ({count/len(successful_results)*100:.1f}%)")

# Calculate average confidence
confidences = [r.confidence for r in successful_results]
avg_confidence = sum(confidences) / len(confidences)
print(f"Average confidence: {avg_confidence:.3f}")


Successfully processed: 65 logs
Subcategory Distribution from 130 logs:
  Boot_Timeout_Errors: 3 (4.6%)
  Service_Communication_Errors: 7 (10.8%)
  System_Operations: 5 (7.7%)
  File_System_Errors: 13 (20.0%)
  Scheduler_Operations: 1 (1.5%)
  Network_Connection_Errors: 7 (10.8%)
  Resource_Management: 8 (12.3%)
  Configuration_Errors: 3 (4.6%)
  Resource_Allocation_Errors: 1 (1.5%)
  Network_Operations: 11 (16.9%)
  Instance_Management: 3 (4.6%)
  Unknown: 3 (4.6%)
Average confidence: 0.793


In [None]:
# Count successful results (filter out errors)
successful_results = []
for result in large_subcategory_results:
    if hasattr(result, 'category') and result.category not in ['Error', 'Unknown']:
        successful_results.append(result)

print(f"Successfully processed: {len(successful_results)} logs")

# Convert to DataFrame and save
import pandas as pd

results_data = []
for i, result in enumerate(successful_results):
    results_data.append({
        'log_index': i,
        'llm_category': result.category,
        'llm_confidence': result.confidence,
        'llm_reasoning': result.reasoning
    })

results_df = pd.DataFrame(results_data)

# Save the results
results_df.to_csv('../results/llm_classification_results_130_logs.csv', index=False)
print("LLM results saved!")

# Also save the actual log texts that were classified
processed_indices = unclassified_logs.head(len(successful_results)).index
processed_logs = unclassified_logs.loc[processed_indices].copy()
processed_logs['llm_category'] = [r.category for r in successful_results]
processed_logs['llm_confidence'] = [r.confidence for r in successful_results]

processed_logs.to_csv('../results/llm_classified_logs_with_text.csv', index=False)
print("Logs with LLM classifications saved!")


Successfully processed: 62 logs
LLM results saved!
Logs with LLM classifications saved!
