# under development

In [1]:
import pandas as pd
import numpy as np
import json
import time
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Load environment variables via python-dotenv's load_dotenv() so that later
# code can read settings such as GROQ_API_KEY with os.getenv().
load_dotenv()

# Banner identifying this notebook in the cell output.
print("LLM v2: Smart Sampling Approach")


LLM v2: Smart Sampling Approach


In [6]:

# Read the CSV written by the BERT step and report its (rows, columns) shape.
df = pd.read_csv('../data/nova_logs_with_bert.csv')
print(df.shape)

# A row counts as unclassified when both label columns are null.
no_regex_label = df['regex_label'].isnull()
no_bert_label = df['bert_label'].isnull()
unclassified_logs = df.loc[no_regex_label & no_bert_label].copy()

# Both lines report the same number: every unclassified row is a candidate.
unclassified_count = len(unclassified_logs)
print(f"Total unclassified logs: {unclassified_count}")
print(f"Available for LLM processing: {unclassified_count}")


(61950, 10)
Total unclassified logs: 14972
Available for LLM processing: 14972


In [8]:
def create_smart_sample(unclassified_logs, target_size=1500, error_quota=600,
                        small_cluster_quota=400,
                        small_cluster_ids=(13, 22, 16, 24, 23), random_state=42):
    """Create smart sample prioritizing high-value logs.

    Rows are drawn in three tiers, each tier excluding rows already picked:

    1. logs whose ``raw_log_text`` contains ERROR or WARNING (case-insensitive),
    2. logs whose ``cluster_id`` is listed in ``small_cluster_ids``,
    3. a random draw from everything that is left.

    Every tier is capped by the budget still available from ``target_size``,
    so the result never has more than ``target_size`` rows and the final draw
    is never asked for a negative number of rows (``DataFrame.sample`` raises
    ``ValueError`` for ``n < 0``).

    Args:
        unclassified_logs: DataFrame with ``raw_log_text`` and ``cluster_id``
            columns.
        target_size: Maximum number of rows in the returned sample.
        error_quota: Maximum number of tier-1 (ERROR/WARNING) rows.
        small_cluster_quota: Maximum number of tier-2 (small cluster) rows.
        small_cluster_ids: Cluster ids treated as small / edge-case clusters.
        random_state: Seed passed to every ``DataFrame.sample`` call.

    Returns:
        DataFrame with the selected rows, tiers concatenated in priority
        order, re-indexed from 0.
    """
    # Rows we are still allowed to pick; a negative target means "pick nothing".
    budget = max(target_size, 0)
    smart_sample = []

    # Priority 1: ERROR and WARNING logs (highest value)
    error_logs = unclassified_logs[
        unclassified_logs['raw_log_text'].str.contains('ERROR|WARNING', case=False, na=False)
    ]
    priority_1 = error_logs.sample(
        n=min(error_quota, budget, len(error_logs)), random_state=random_state
    )
    smart_sample.append(priority_1)
    budget -= len(priority_1)
    print(f"Priority 1 (ERROR/WARNING): {len(priority_1)} logs")

    # Priority 2: Small cluster logs (edge cases), minus rows taken by priority 1
    small_cluster_logs = unclassified_logs[
        unclassified_logs['cluster_id'].isin(list(small_cluster_ids))
    ]
    remaining_small = small_cluster_logs[~small_cluster_logs.index.isin(priority_1.index)]
    priority_2 = remaining_small.sample(
        n=min(small_cluster_quota, budget, len(remaining_small)), random_state=random_state
    )
    smart_sample.append(priority_2)
    budget -= len(priority_2)
    print(f"Priority 2 (Small clusters): {len(priority_2)} logs")

    # Priority 3: Random sample from remaining rows, filling what is left of the budget
    used_indices = pd.concat(smart_sample).index
    remaining_logs = unclassified_logs[~unclassified_logs.index.isin(used_indices)]
    priority_3 = remaining_logs.sample(
        n=min(budget, len(remaining_logs)), random_state=random_state
    )
    smart_sample.append(priority_3)
    print(f"Priority 3 (Random sample): {len(priority_3)} logs")

    # Combine all priorities
    final_sample = pd.concat(smart_sample, ignore_index=True)
    return final_sample

# Create smart sample
# Draw up to 1500 of the unclassified logs and report how many rows came back.
smart_sample_logs = create_smart_sample(unclassified_logs, target_size=1500)
print(f"Smart sample created: {len(smart_sample_logs)} logs")


Priority 2 (Small clusters): 400 logs
Priority 3 (Random sample): 500 logs
Smart sample created: 1500 logs


In [9]:
# Define Pydantic model for structured output
class LogClassification(BaseModel):
    """Structured result of classifying a single log line.

    All three fields are required (``Field(...)``); ``confidence`` is
    constrained to the closed interval [0.0, 1.0] through ``ge``/``le``.
    """
    category: str = Field(..., description="The classification category")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence score")
    reasoning: str = Field(..., description="Brief explanation")

# Category name -> short description. The first five entries describe normal
# operations, the remaining six are error subcategories.
ENHANCED_CATEGORIES = dict(
    System_Operations='LibVirt driver operations, system-level tasks',
    Instance_Management='VM lifecycle, instance operations',
    Network_Operations='VIF operations, network connectivity',
    Resource_Management='Compute claims, resource allocation',
    Scheduler_Operations='Nova scheduler activities, allocation reports',
    Boot_Timeout_Errors='VM boot timeouts, startup failures',
    Network_Connection_Errors='VIF connection failures, network issues',
    Resource_Allocation_Errors='Memory/CPU allocation failures',
    File_System_Errors='File not found, permission errors, I/O failures',
    Configuration_Errors='Invalid config, missing parameters',
    Service_Communication_Errors='API timeouts, service unavailable',
)

# Echo the table, one indented "name: description" line per category.
print("Enhanced categories defined:")
for category, description in ENHANCED_CATEGORIES.items():
    print(f"  {category}: {description}")


Enhanced categories defined:
  System_Operations: LibVirt driver operations, system-level tasks
  Instance_Management: VM lifecycle, instance operations
  Network_Operations: VIF operations, network connectivity
  Resource_Management: Compute claims, resource allocation
  Scheduler_Operations: Nova scheduler activities, allocation reports
  Boot_Timeout_Errors: VM boot timeouts, startup failures
  Network_Connection_Errors: VIF connection failures, network issues
  Resource_Allocation_Errors: Memory/CPU allocation failures
  File_System_Errors: File not found, permission errors, I/O failures
  Configuration_Errors: Invalid config, missing parameters
  Service_Communication_Errors: API timeouts, service unavailable


In [16]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq  # Adjust import as needed
from langchain.prompts import PromptTemplate

# Load .env from parent directory
# The path is built from the current working directory, so this assumes the
# kernel was started one level below the folder that holds the .env file.
# NOTE(review): the print below writes the absolute path into the saved output.
dotenv_path = os.path.abspath(os.path.join(os.getcwd(), '../.env'))
print(f"Loading .env from: {dotenv_path}")
load_dotenv(dotenv_path)

# Verify GROQ_API_KEY
# A missing/empty key and the literal placeholder value are both rejected
# with a ValueError so the notebook stops here instead of failing on requests.
api_key = os.getenv('GROQ_API_KEY')
if api_key and api_key != 'your_groq_api_key_here':
    print("Groq API key loaded successfully")
else:
    raise ValueError("GROQ_API_KEY not set or invalid in .env file")




python-dotenv could not parse statement starting at line 1


Loading .env from: /Users/kxshrx/dev/log-classification/.env
Groq API key loaded successfully


In [17]:

# Initialize LangChain Groq client
# A failed initialisation is reported and then re-raised: swallowing it would
# leave `llm` undefined and only surface later as an unrelated NameError in
# the code that tries to use the client.
try:
    llm = ChatGroq(
        groq_api_key=os.getenv('GROQ_API_KEY'),
        model_name='llama-3.3-70b-versatile',
        temperature=0.3,
        max_tokens=150
    )
    print("LangChain Groq client initialized")
except Exception as e:
    print(f"Error initializing client: {e}")
    raise

# Create optimized prompt template (reduced tokens)
# The only placeholder is {log_message}. The doubled braces around the example
# JSON are escapes, so the formatted prompt shows a literal {...} object.
# NOTE(review): the RULES section asks the model for a confidence of 0.6-1.0,
# while the response model accepts any value in [0.0, 1.0].
classification_template = """Classify this OpenStack log into the most specific category:

CATEGORIES:
1. System_Operations - LibVirt driver, system tasks
2. Instance_Management - VM lifecycle operations  
3. Network_Operations - VIF operations, connectivity
4. Resource_Management - Compute claims, allocation
5. Scheduler_Operations - Nova scheduler activities
6. Boot_Timeout_Errors - VM boot timeouts, startup failures
7. Network_Connection_Errors - VIF connection failures
8. Resource_Allocation_Errors - Memory/CPU allocation failures
9. File_System_Errors - File not found, I/O failures
10. Configuration_Errors - Invalid config, setup issues
11. Service_Communication_Errors - API timeouts, service unavailable

RULES:
- Focus on the primary operation or error type
- Be specific with error subcategories
- Provide confidence 0.6-1.0

LOG: {log_message}

JSON response:
{{"category": "category_name", "confidence": 0.8, "reasoning": "brief reason"}}"""

# Wrap the template so callers can fill it with .format(log_message=...).
prompt = PromptTemplate(
    input_variables=["log_message"],
    template=classification_template
)

print("Optimized prompt template created")

LangChain Groq client initialized
Optimized prompt template created


In [18]:
def classify_log_with_uncertainty(log_message, llm, prompt_template, confidence_threshold=0.7):
    """Classify one log line and flag low-confidence answers.

    The prompt is filled with ``log_message`` and sent to ``llm`` as a single
    human message. The first ``{...}`` span of the reply (or the whole reply
    when there is none) is parsed as JSON and validated as a
    ``LogClassification``.

    Returns:
        * the parsed classification when its confidence reaches
          ``confidence_threshold``;
        * a copy with category ``"Uncertain (likely: <category>)"`` otherwise;
        * category ``"Unknown"`` (confidence 0.0) when the reply cannot be
          parsed or validated;
        * category ``"Error"`` (confidence 0.0) when building the prompt or
          calling the model fails.

    This function never raises; every failure is encoded in the result.
    """
    import re

    try:
        # Ask the model.
        reply = llm.invoke(
            [HumanMessage(content=prompt_template.format(log_message=log_message))]
        )
        raw_reply = reply.content.strip()

        try:
            # Pull the JSON object out of the reply, tolerating surrounding chatter.
            found = re.search(r'\{.*\}', raw_reply, re.DOTALL)
            payload = json.loads(found.group() if found else raw_reply)
            verdict = LogClassification(**payload)

            # Confident enough: hand the model's answer back unchanged.
            if verdict.confidence >= confidence_threshold:
                return verdict

            # Otherwise keep the guess but mark it as uncertain.
            return LogClassification(
                category=f"Uncertain (likely: {verdict.category})",
                confidence=verdict.confidence,
                reasoning=f"Low confidence: {verdict.reasoning}"
            )
        except Exception:
            # Any parse or validation problem collapses to the same default.
            return LogClassification(
                category="Unknown",
                confidence=0.0,
                reasoning="JSON parsing failed"
            )

    except Exception as exc:
        return LogClassification(
            category="Error",
            confidence=0.0,
            reasoning=f"LLM error: {exc!s}"
        )

print("Classification function with uncertainty handling created")



Classification function with uncertainty handling created


In [19]:
def process_batch_with_rate_limits(logs_batch, llm, prompt_template, delay=0.2):
    """Process batch with rate limit awareness.

    Classifies each entry of ``logs_batch`` in order with
    ``classify_log_with_uncertainty`` and stops as soon as a rate limit is
    detected, so the remaining requests are not sent against an exhausted
    quota.

    A rate limit is recognised in two ways:

    * an exception escaping the classifier whose message mentions it, and
    * a returned result with category ``"Error"`` whose ``reasoning`` mentions
      it. This matters because the classifier reports API failures as
      ``"Error"`` results instead of raising.

    Args:
        logs_batch: Sequence of log messages to classify.
        llm: Chat model passed through to the classifier.
        prompt_template: Prompt template passed through to the classifier.
        delay: Seconds to sleep after each classified log.

    Returns:
        List of classification results for the logs handled before the loop
        finished or a rate limit was hit. The rate-limited log itself is not
        included, so ``results[i]`` always belongs to ``logs_batch[i]``.
    """
    rate_limit_markers = ("rate limit", "429", "too many requests")

    def _mentions_rate_limit(text):
        # Case-insensitive substring check against the known markers.
        lowered = str(text).lower()
        return any(marker in lowered for marker in rate_limit_markers)

    def _report_rate_limit(position):
        print(f"Rate limit reached at log {position}")
        print(f"Successfully processed {len(results)} logs before limit")

    results = []

    print(f"Processing {len(logs_batch)} logs...")

    for idx, log_message in enumerate(logs_batch):
        if (idx + 1) % 50 == 0:
            print(f"Processed {idx + 1}/{len(logs_batch)} logs")

        try:
            result = classify_log_with_uncertainty(log_message, llm, prompt_template)
        except Exception as e:
            if _mentions_rate_limit(e):
                _report_rate_limit(idx + 1)
                break
            # Handle other errors: record the failure and move on.
            results.append(LogClassification(
                category="Error",
                confidence=0.0,
                reasoning=f"Processing error: {str(e)}"
            ))
            continue

        # The classifier converts API failures into "Error" results, so the
        # rate limit usually shows up here rather than as an exception.
        if result.category == "Error" and _mentions_rate_limit(result.reasoning):
            _report_rate_limit(idx + 1)
            break

        results.append(result)

        # Rate limiting delay
        time.sleep(delay)

    return results

print("Batch processing function with rate limit handling created")


Batch processing function with rate limit handling created


In [20]:
# Process smart sample with rate limit awareness
print("Starting smart sample processing...")
print("This will process until rate limits are reached")

# Convert to list for processing
# Results come back in the same order, so position i in the result list
# refers to row i of smart_sample_logs.
smart_sample_texts = smart_sample_logs['raw_log_text'].tolist()

# Process the smart sample
# One request is made per log, with a 0.15 s pause after each.
classification_results = process_batch_with_rate_limits(
    smart_sample_texts,
    llm,
    prompt,
    delay=0.15  # Optimized delay
)

print(f"Processing completed: {len(classification_results)} logs classified")

# Quick analysis of results
# The count above includes failure results, so check the distribution below
# before trusting it.
categories = [r.category for r in classification_results]
category_dist = Counter(categories)

print("Classification distribution:")
for category, count in category_dist.most_common():
    print(f"  {category}: {count}")


Starting smart sample processing...
This will process until rate limits are reached
Processing 1500 logs...
Processed 50/1500 logs
Processed 100/1500 logs
Processed 150/1500 logs
Processed 200/1500 logs
Processed 250/1500 logs
Processed 300/1500 logs
Processed 350/1500 logs
Processed 400/1500 logs
Processed 450/1500 logs
Processed 500/1500 logs
Processed 550/1500 logs
Processed 600/1500 logs
Processed 650/1500 logs
Processed 700/1500 logs
Processed 750/1500 logs
Processed 800/1500 logs
Processed 850/1500 logs
Processed 900/1500 logs
Processed 950/1500 logs
Processed 1000/1500 logs
Processed 1050/1500 logs
Processed 1100/1500 logs
Processed 1150/1500 logs
Processed 1200/1500 logs
Processed 1250/1500 logs
Processed 1300/1500 logs
Processed 1350/1500 logs
Processed 1400/1500 logs
Processed 1450/1500 logs
Processed 1500/1500 logs
Processing completed: 1500 logs classified
Classification distribution:
  Error: 1494
  Instance_Management: 2
  Unknown: 2
  Resource_Allocation_Errors: 1
  Netw

In [21]:
import pandas as pd
import numpy as np
import json
import time
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Banner for the consolidated version of the pipeline.
print("LLM v2: Smart Sampling Approach - Clean Version")

# Load environment variables
load_dotenv()

# Load BERT-processed dataset
df = pd.read_csv('../data/nova_logs_with_bert.csv')

# Identify unclassified logs
# Keep only rows where both label columns are null; .copy() detaches the
# selection from df.
unclassified_logs = df[(df['regex_label'].isnull()) & (df['bert_label'].isnull())].copy()
print(f"Total unclassified logs: {len(unclassified_logs)}")

# Smart sampling function
def create_smart_sample(unclassified_logs, target_size=1500, error_quota=600,
                        small_cluster_quota=400,
                        small_cluster_ids=(13, 22, 16, 24, 23), random_state=42):
    """Build a prioritised sample of at most ``target_size`` log rows.

    Rows are drawn in three tiers, each tier excluding rows already picked:

    1. logs whose ``raw_log_text`` contains ERROR or WARNING (case-insensitive),
    2. logs whose ``cluster_id`` is listed in ``small_cluster_ids``,
    3. a random draw from everything that is left.

    Every tier is capped by the budget still available from ``target_size``,
    so the result never has more than ``target_size`` rows and the final draw
    is never asked for a negative number of rows (``DataFrame.sample`` raises
    ``ValueError`` for ``n < 0``).

    Args:
        unclassified_logs: DataFrame with ``raw_log_text`` and ``cluster_id``
            columns.
        target_size: Maximum number of rows in the returned sample.
        error_quota: Maximum number of tier-1 (ERROR/WARNING) rows.
        small_cluster_quota: Maximum number of tier-2 (small cluster) rows.
        small_cluster_ids: Cluster ids treated as small / edge-case clusters.
        random_state: Seed passed to every ``DataFrame.sample`` call.

    Returns:
        DataFrame with the selected rows, tiers concatenated in priority
        order, re-indexed from 0.
    """
    # Rows we are still allowed to pick; a negative target means "pick nothing".
    budget = max(target_size, 0)
    smart_sample = []

    # Priority 1: ERROR and WARNING logs
    error_logs = unclassified_logs[unclassified_logs['raw_log_text'].str.contains('ERROR|WARNING', case=False, na=False)]
    priority_1 = error_logs.sample(n=min(error_quota, budget, len(error_logs)), random_state=random_state)
    smart_sample.append(priority_1)
    budget -= len(priority_1)
    print(f"Priority 1 (ERROR/WARNING): {len(priority_1)} logs")

    # Priority 2: Small cluster logs, minus rows taken by priority 1
    small_cluster_logs = unclassified_logs[unclassified_logs['cluster_id'].isin(list(small_cluster_ids))]
    remaining_small = small_cluster_logs[~small_cluster_logs.index.isin(priority_1.index)]
    priority_2 = remaining_small.sample(n=min(small_cluster_quota, budget, len(remaining_small)), random_state=random_state)
    smart_sample.append(priority_2)
    budget -= len(priority_2)
    print(f"Priority 2 (Small clusters): {len(priority_2)} logs")

    # Priority 3: Random sample from remaining rows, filling what is left of the budget
    used_indices = pd.concat(smart_sample).index
    remaining_logs = unclassified_logs[~unclassified_logs.index.isin(used_indices)]
    priority_3 = remaining_logs.sample(n=min(budget, len(remaining_logs)), random_state=random_state)
    smart_sample.append(priority_3)
    print(f"Priority 3 (Random sample): {len(priority_3)} logs")

    final_sample = pd.concat(smart_sample, ignore_index=True)
    return final_sample

# Draw up to 1500 of the unclassified logs and report how many rows came back.
smart_sample_logs = create_smart_sample(unclassified_logs, target_size=1500)
print(f"Smart sample created: {len(smart_sample_logs)} logs")

# Pydantic model
class LogClassification(BaseModel):
    """Structured result of classifying a single log line.

    All three fields are required (``Field(...)``); ``confidence`` is
    constrained to the closed interval [0.0, 1.0] through ``ge``/``le``.
    """
    category: str = Field(..., description="The classification category")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence score")
    reasoning: str = Field(..., description="Brief explanation")

# Category name -> short description. The first five entries describe normal
# operations, the remaining six are error subcategories.
ENHANCED_CATEGORIES = dict(
    System_Operations='LibVirt driver operations, system-level tasks',
    Instance_Management='VM lifecycle, instance operations',
    Network_Operations='VIF operations, network connectivity',
    Resource_Management='Compute claims, resource allocation',
    Scheduler_Operations='Nova scheduler activities, allocation reports',
    Boot_Timeout_Errors='VM boot timeouts, startup failures',
    Network_Connection_Errors='VIF connection failures, network issues',
    Resource_Allocation_Errors='Memory/CPU allocation failures',
    File_System_Errors='File not found, permission errors, I/O failures',
    Configuration_Errors='Invalid config, missing parameters',
    Service_Communication_Errors='API timeouts, service unavailable',
)

print("Enhanced categories defined")

# Initialize LangChain Groq client
# The API key is read from the environment at construction time. There is no
# try/except here, so a construction failure stops the cell immediately.
# NOTE(review): max_tokens=150 caps the reply length; presumably enough for
# the one-line JSON answer the prompt requests - confirm against real replies.
llm = ChatGroq(
    groq_api_key=os.getenv('GROQ_API_KEY'),
    model_name='llama-3.3-70b-versatile',
    temperature=0.3,
    max_tokens=150
)
print("LangChain client initialized")

# Optimized prompt template (fixed escape sequences)
# The only placeholder is {log_message}. The doubled braces around the example
# JSON are escapes, so the formatted prompt shows a literal {...} object.
# NOTE(review): the RULES section asks the model for a confidence of 0.6-1.0,
# while the response model accepts any value in [0.0, 1.0].
classification_template = """Classify this OpenStack log into the most specific category:

CATEGORIES:
1. System_Operations - LibVirt driver, system tasks
2. Instance_Management - VM lifecycle operations  
3. Network_Operations - VIF operations, connectivity
4. Resource_Management - Compute claims, allocation
5. Scheduler_Operations - Nova scheduler activities
6. Boot_Timeout_Errors - VM boot timeouts, startup failures
7. Network_Connection_Errors - VIF connection failures
8. Resource_Allocation_Errors - Memory/CPU allocation failures
9. File_System_Errors - File not found, I/O failures
10. Configuration_Errors - Invalid config, setup issues
11. Service_Communication_Errors - API timeouts, service unavailable

RULES:
- Focus on the primary operation or error type
- Be specific with error subcategories
- Provide confidence 0.6-1.0

LOG: {log_message}

JSON response:
{{"category": "category_name", "confidence": 0.8, "reasoning": "brief reason"}}"""

# Wrap the template so callers can fill it with .format(log_message=...).
prompt = PromptTemplate(
    input_variables=["log_message"],
    template=classification_template
)
print("Prompt template created")

# Classification function with uncertainty handling
def classify_log_with_uncertainty(log_message, llm, prompt_template, confidence_threshold=0.7):
    """Classify one log line and flag low-confidence answers.

    The work is split into two stages so that each kind of failure gets its
    own label (a single try with a catch-all first handler would report API
    failures as parsing failures and drop the error text):

    1. build the prompt and call the model - any failure yields category
       ``"Error"`` with the exception text in ``reasoning``;
    2. extract, parse and validate the JSON reply - any failure yields
       category ``"Unknown"`` with reasoning ``"JSON parsing failed"``.

    Args:
        log_message: Raw log text inserted into the prompt.
        llm: Chat model exposing ``invoke(messages)``.
        prompt_template: Template exposing ``format(log_message=...)``.
        confidence_threshold: Minimum confidence for an answer to be returned
            unchanged.

    Returns:
        A ``LogClassification``. Answers below ``confidence_threshold`` are
        returned with category ``"Uncertain (likely: <category>)"``. This
        function never raises; failures are encoded in the result.
    """
    import re

    # Stage 1: talk to the model.
    try:
        formatted_prompt = prompt_template.format(log_message=log_message)
        messages = [HumanMessage(content=formatted_prompt)]
        response = llm.invoke(messages)
        response_text = response.content.strip()
    except Exception as e:
        return LogClassification(
            category="Error",
            confidence=0.0,
            reasoning=f"LLM error: {str(e)}"
        )

    # Stage 2: parse JSON response. json.JSONDecodeError and pydantic's
    # validation error are both ValueError subclasses; TypeError covers a
    # reply that parses to something other than a JSON object.
    try:
        json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
        json_str = json_match.group() if json_match else response_text
        result = LogClassification(**json.loads(json_str))
    except (ValueError, TypeError):
        return LogClassification(
            category="Unknown",
            confidence=0.0,
            reasoning="JSON parsing failed"
        )

    # Apply uncertainty handling
    if result.confidence >= confidence_threshold:
        return result
    return LogClassification(
        category=f"Uncertain (likely: {result.category})",
        confidence=result.confidence,
        reasoning=f"Low confidence: {result.reasoning}"
    )

# Batch processing with rate limit awareness
def process_batch_with_rate_limits(logs_batch, llm, prompt_template, delay=0.15):
    """Classify ``logs_batch`` in order, stopping when a rate limit is hit.

    A rate limit is recognised in two ways:

    * an exception escaping ``classify_log_with_uncertainty`` whose message
      mentions it, and
    * a returned result with category ``"Error"`` whose ``reasoning`` mentions
      it - needed when the classifier reports API failures as results instead
      of raising.

    Args:
        logs_batch: Sequence of log messages to classify.
        llm: Chat model passed through to the classifier.
        prompt_template: Prompt template passed through to the classifier.
        delay: Seconds to sleep after each classified log.

    Returns:
        List of classification results for the logs handled before the loop
        finished or a rate limit was hit. The rate-limited log itself is not
        included, so ``results[i]`` always belongs to ``logs_batch[i]``.
    """
    rate_limit_markers = ("rate limit", "429", "too many requests")

    def _mentions_rate_limit(text):
        # Case-insensitive substring check against the known markers.
        lowered = str(text).lower()
        return any(marker in lowered for marker in rate_limit_markers)

    def _report_rate_limit(position):
        print(f"Rate limit reached at log {position}")
        print(f"Successfully processed {len(results)} logs before limit")

    results = []
    print(f"Processing {len(logs_batch)} logs...")

    for idx, log_message in enumerate(logs_batch):
        if (idx + 1) % 50 == 0:
            print(f"Processed {idx + 1}/{len(logs_batch)} logs")

        try:
            result = classify_log_with_uncertainty(log_message, llm, prompt_template)
        except Exception as e:
            if _mentions_rate_limit(e):
                _report_rate_limit(idx + 1)
                break
            # Any other failure is recorded and the batch carries on.
            results.append(LogClassification(
                category="Processing_Error",
                confidence=0.0,
                reasoning=f"Processing error: {str(e)}"
            ))
            continue

        # API failures may arrive as "Error" results rather than exceptions.
        if result.category == "Error" and _mentions_rate_limit(result.reasoning):
            _report_rate_limit(idx + 1)
            break

        results.append(result)
        time.sleep(delay)

    return results

# Process smart sample
# Results come back in input order, so position i refers to row i of
# smart_sample_logs.
smart_sample_texts = smart_sample_logs['raw_log_text'].tolist()
classification_results = process_batch_with_rate_limits(smart_sample_texts, llm, prompt, delay=0.15)

print(f"Processing completed: {len(classification_results)} logs classified")

# Analyze results
categories = [r.category for r in classification_results]
category_dist = Counter(categories)

print("Classification distribution:")
for category, count in category_dist.most_common():
    print(f"  {category}: {count}")

# Calculate success rate
# Everything except the three failure labels counts as a success, which
# includes the "Uncertain (likely: ...)" results.
successful_classifications = [r for r in classification_results if r.category not in ['Error', 'Processing_Error', 'Unknown']]
# The batch can return nothing (e.g. an empty sample); report 0% instead of
# dividing by zero.
if classification_results:
    success_rate = len(successful_classifications) / len(classification_results) * 100
else:
    success_rate = 0.0
print(f"Success rate: {success_rate:.1f}%")

# Save results to CSV
# log_index is the position of the log within the processed sample.
results_data = [
    {
        'log_index': i,
        'llm_category': result.category,
        'llm_confidence': result.confidence,
        'llm_reasoning': result.reasoning
    }
    for i, result in enumerate(classification_results)
]

# Explicit columns keep the CSV header in place even when there are no rows.
results_df = pd.DataFrame(
    results_data,
    columns=['log_index', 'llm_category', 'llm_confidence', 'llm_reasoning']
)
results_df.to_csv('../data/nova_logs_with_llm_v2.csv', index=False)
print("Results saved to nova_logs_with_llm_v2.csv")

print("LLM v2 processing complete")


LLM v2: Smart Sampling Approach - Clean Version
Total unclassified logs: 14972
Priority 2 (Small clusters): 400 logs
Priority 3 (Random sample): 500 logs
Smart sample created: 1500 logs
Enhanced categories defined
LangChain client initialized
Prompt template created
Processing 1500 logs...


KeyboardInterrupt: 