In [7]:
!pip install litellm
!pip install mermaid-magic


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [21]:
%load_ext mermaid_magic

The mermaid_magic extension is already loaded. To reload it, use:
  %reload_ext mermaid_magic


In [41]:
%%mermaid
graph TD
    A[Start Jupyter Notebook] --> B{Load Configuration};
    B --> C{Load Benchmark Data};
    C --> D{Select LLM Model via Ollama Endpoint};
    D --> E{Iterate Through Log Snippets Scenarios};
    E --> F{For Each Log Snippet};
    F --> G{Format Prompt for LLM};
    G --> H{Send Request to Ollama Endpoint};
    H --> I{Receive LLM Response};
    I --> J{Evaluate LLM Response};
    J --> K{Store Evaluation Metrics};
    K --> E;
    E -- All Snippets Processed --> L{Aggregate and Analyze Results};
    L --> M{Visualize Results};
    M --> N[End Report Findings];

    subgraph "Model Abstraction Layer Pluggable"
        O[Ollama Model 1 Interface]
        P[Ollama Model 2 Interface]
        Q[More Models...]
    end

    D -.-> O;
    D -.-> P;
    D -.-> Q;

    subgraph "Benchmark Data Store"
        R[Log Snippet 1 example Error Log]
        S[Expected Analysis 1 Ground Truth]
        T[Log Snippet 2 example Security Event]
        U[Expected Analysis 2 Ground Truth]
        V[More Snippets...]
    end

    C --> R;
    C --> S;
    C --> T;
    C --> U;
    C --> V;

    subgraph "Evaluation Logic"
        W[Define Evaluation Criteria example Accuracy Relevance Completeness]
        X[Scoring Mechanism example Keyword Match Semantic Similarity LLM as a judge]
    end

    J --> W;
    J --> X;

In [44]:
import os

class LogProcessor:
    """
    A class to load, process, and chunk log data for LLM analysis.

    Usage: construct with a config dict, call ``load_logs`` with a file path
    or a list of strings, then iterate over ``get_chunks()``.
    """

    # Keys that must be present in the config; each must be a positive int.
    _REQUIRED_KEYS = ("window_size_lines", "max_line_length", "slide_step")

    def __init__(self, config):
        """
        Initializes the LogProcessor.

        Args:
            config (dict): Configuration dictionary. Expected keys:
                - "window_size_lines" (int): Number of log lines per chunk.
                - "max_line_length" (int): Maximum characters allowed per line.
                                           Lines exceeding this will be truncated.
                - "slide_step" (int): Number of lines the window slides forward
                                      for the next chunk.

        Raises:
            TypeError: If config is not a dict.
            ValueError: If a required key is missing or is not a positive integer.
        """
        if not isinstance(config, dict):
            raise TypeError("Config must be a dictionary.")

        self.config = config
        self._validate_config()

        self.raw_log_lines = []
        self.processed_log_lines = []

    def _validate_config(self):
        """
        Validates the provided configuration.

        Raises:
            ValueError: If a required key is missing, or its value is not a
                positive integer. Booleans are rejected even though ``bool``
                is a subclass of ``int``.
        """
        # Report every missing key before looking at any value, so a missing
        # key always wins over a bad value.
        for key in self._REQUIRED_KEYS:
            if key not in self.config:
                raise ValueError(f"Missing required config key: {key}")

        for key in self._REQUIRED_KEYS:
            value = self.config[key]
            # isinstance(True, int) is True, so bool must be excluded explicitly;
            # otherwise True would silently act as a window/step of 1.
            if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
                raise ValueError(f"Config '{key}' must be a positive integer.")

    def _truncate_line(self, line: str) -> str:
        """
        Truncates a single line to the configured max_line_length.

        Lines at or below the limit are returned unchanged.
        """
        return line[:self.config["max_line_length"]]

    def load_logs(self, log_source):
        """
        Loads log lines from a specified source (file path or list of strings).
        Each loaded line has trailing CR/LF characters removed and is then
        truncated to ``max_line_length``. Any previously loaded lines are
        discarded first, even if loading subsequently fails.

        Args:
            log_source (str or list): The source of the log data.
                                      If str, it's treated as a file path.
                                      If list, it's treated as a list of log line strings.

        Raises:
            FileNotFoundError: If log_source is a path and the file doesn't exist.
            IOError: If there's an error reading the file (the original
                exception is attached as ``__cause__``).
            TypeError: If log_source is not a str or list, or if list elements are not strings.
        """
        self.raw_log_lines = []
        self.processed_log_lines = []  # Clear previously processed lines

        if isinstance(log_source, str):
            if not os.path.exists(log_source):
                raise FileNotFoundError(f"Log file not found: {log_source}")
            try:
                with open(log_source, 'r', encoding='utf-8') as f:
                    # rstrip removes any run of trailing CR/LF characters
                    current_raw_lines = [line.rstrip('\r\n') for line in f]
            except Exception as e:
                # Chain the cause so the underlying error is not lost.
                raise IOError(f"Error reading log file {log_source}: {e}") from e
        elif isinstance(log_source, list):
            if not all(isinstance(line, str) for line in log_source):
                raise TypeError("If log_source is a list, all its elements must be strings.")
            # Strip trailing newlines from list input too, for consistency
            current_raw_lines = [line.rstrip('\r\n') for line in log_source]
        else:
            raise TypeError("log_source must be a file path (str) or a list of strings.")

        self.raw_log_lines = current_raw_lines
        # Process (truncate) lines immediately after loading
        self.processed_log_lines = [self._truncate_line(line) for line in self.raw_log_lines]

    def get_chunks(self):
        """
        Generates log chunks based on the configured window size and slide step.
        Each chunk is a list of processed (truncated) log lines.
        This method is a generator.

        A window starts at every multiple of ``slide_step`` that is still
        inside the loaded lines, so windows near the end may be shorter than
        ``window_size_lines`` (e.g. 4 lines, window 2, step 1 yields a final
        one-line chunk). If ``slide_step`` exceeds the window size, the lines
        between windows are skipped.

        Yields:
            list: A non-empty chunk of log lines (list of strings).
                  Nothing is yielded if no logs are loaded.
        """
        lines = self.processed_log_lines
        window_size = self.config["window_size_lines"]
        step = self.config["slide_step"]

        # Every start index is < len(lines), so each slice has at least one line.
        for start in range(0, len(lines), step):
            yield lines[start:start + window_size]

In [46]:
import unittest
import os
import tempfile

# Assuming the LogProcessor class definition is above or imported

class TestLogProcessor(unittest.TestCase):
    """
    Unit tests for LogProcessor: configuration validation, log loading with
    line truncation, and sliding-window chunk generation.
    """
    # Test methods are described with `#` comments rather than docstrings:
    # at verbosity=2 unittest prints the first docstring line in its report,
    # which would change the test output.

    def setUp(self):
        # delete=False: the file is closed and removed explicitly in tearDown.
        scratch = tempfile.NamedTemporaryFile(mode="w+", delete=False, encoding="utf-8")
        self.temp_file = scratch
        self.temp_file_path = scratch.name

    def tearDown(self):
        self.temp_file.close()
        os.remove(self.temp_file_path)

    def _write_to_temp_file(self, lines):
        # Rewind and empty the scratch file, then write one entry per line.
        self.temp_file.seek(0)
        self.temp_file.truncate()
        self.temp_file.writelines(entry + "\n" for entry in lines)
        self.temp_file.flush()

    @staticmethod
    def _make_config(window, max_len, step):
        # Build a complete LogProcessor config dict.
        return {"window_size_lines": window, "max_line_length": max_len, "slide_step": step}

    def _loaded(self, source, window, max_len, step):
        # Construct a processor and load `source` (a list or a file path) into it.
        processor = LogProcessor(self._make_config(window, max_len, step))
        processor.load_logs(source)
        return processor

    # 1. Initialization Tests
    def test_valid_initialization(self):
        config = self._make_config(3, 10, 1)
        self.assertEqual(LogProcessor(config).config, config)

    def test_init_missing_config_key(self):
        incomplete = {"window_size_lines": 3, "slide_step": 1}
        with self.assertRaisesRegex(ValueError, "Missing required config key: max_line_length"):
            LogProcessor(incomplete)

    def test_init_invalid_config_value_type(self):
        wrong_type = self._make_config("3", 10, 1)
        with self.assertRaisesRegex(ValueError, "must be a positive integer"):
            LogProcessor(wrong_type)

    def test_init_invalid_config_value_non_positive(self):
        zero_window = self._make_config(0, 10, 1)
        with self.assertRaisesRegex(ValueError, "must be a positive integer"):
            LogProcessor(zero_window)

    def test_init_non_dict_config(self):
        with self.assertRaisesRegex(TypeError, "Config must be a dictionary."):
            LogProcessor("not_a_dict")

    # 2. Log Loading and Line Processing Tests
    def test_load_from_list_and_truncate(self):
        processor = self._loaded(["AAAAA", "BBBBBBBBB", "CCC\n", "DD"], 1, 5, 1)
        self.assertEqual(processor.processed_log_lines, ["AAAAA", "BBBBB", "CCC", "DD"])

    def test_load_from_file_and_truncate(self):
        self._write_to_temp_file(["apple", "banana", "pi"])
        processor = self._loaded(self.temp_file_path, 1, 3, 1)
        self.assertEqual(processor.processed_log_lines, ["app", "ban", "pi"])

    def test_load_empty_list(self):
        processor = self._loaded([], 1, 10, 1)
        self.assertEqual(processor.processed_log_lines, [])

    def test_load_empty_file(self):
        self._write_to_temp_file([])
        processor = self._loaded(self.temp_file_path, 1, 10, 1)
        self.assertEqual(processor.processed_log_lines, [])

    def test_load_non_existent_file(self):
        processor = LogProcessor(self._make_config(1, 10, 1))
        with self.assertRaises(FileNotFoundError):
            processor.load_logs("non_existent_file.log")

    def test_load_invalid_source_type(self):
        processor = LogProcessor(self._make_config(1, 10, 1))
        with self.assertRaisesRegex(TypeError, "log_source must be a file path .* or a list"):
            processor.load_logs(123)

    def test_load_list_with_non_string_elements(self):
        processor = LogProcessor(self._make_config(1, 10, 1))
        with self.assertRaisesRegex(TypeError, "all its elements must be strings"):
            processor.load_logs(["line1", 123, "line3"])

    def test_load_strips_trailing_newlines_from_list_input(self):
        # Covers LF, CRLF, bare CR, and no terminator at all.
        processor = self._loaded(["line1\n", "line2\r\n", "line3\r", "line4"], 1, 10, 1)
        self.assertEqual(processor.processed_log_lines, ["line1", "line2", "line3", "line4"])

    # 3. Chunk Generation Tests
    def test_get_chunks_basic_sliding_window(self):
        processor = self._loaded(["L1", "L2", "L3", "L4"], 2, 10, 1)
        self.assertEqual(
            list(processor.get_chunks()),
            [["L1", "L2"], ["L2", "L3"], ["L3", "L4"], ["L4"]],
        )

    def test_get_chunks_non_overlapping(self):
        processor = self._loaded(["L1", "L2", "L3", "L4", "L5"], 2, 10, 2)
        self.assertEqual(
            list(processor.get_chunks()),
            [["L1", "L2"], ["L3", "L4"], ["L5"]],
        )

    def test_get_chunks_step_larger_than_window(self):
        # With step 3 and window 2, L3 and L6 fall between windows.
        processor = self._loaded(["L1", "L2", "L3", "L4", "L5", "L6"], 2, 10, 3)
        self.assertEqual(list(processor.get_chunks()), [["L1", "L2"], ["L4", "L5"]])

    def test_get_chunks_total_lines_less_than_window(self):
        # With slide_step 1 the window keeps sliding until only the last line is left.
        processor = self._loaded(["L1", "L2", "L3"], 5, 10, 1)
        self.assertEqual(
            list(processor.get_chunks()),
            [["L1", "L2", "L3"], ["L2", "L3"], ["L3"]],
        )

    def test_get_chunks_total_lines_equals_window(self):
        processor = self._loaded(["L1", "L2", "L3"], 3, 10, 1)
        self.assertEqual(
            list(processor.get_chunks()),
            [["L1", "L2", "L3"], ["L2", "L3"], ["L3"]],
        )

    def test_get_chunks_empty_logs(self):
        processor = self._loaded([], 3, 10, 1)
        self.assertEqual(list(processor.get_chunks()), [])

    def test_get_chunks_with_line_truncation(self):
        processor = self._loaded(["AAAAA", "BB", "CCCCC"], 1, 3, 1)
        self.assertEqual(list(processor.get_chunks()), [["AAA"], ["BB"], ["CCC"]])

    def test_reloading_logs_clears_previous_and_chunks_new(self):
        processor = LogProcessor(self._make_config(1, 10, 1))

        processor.load_logs(["A", "B"])
        first_pass = list(processor.get_chunks())
        self.assertEqual(first_pass, [["A"], ["B"]])
        self.assertEqual(processor.processed_log_lines, ["A", "B"])

        processor.load_logs(["X", "Y", "Z"])
        second_pass = list(processor.get_chunks())
        self.assertEqual(second_pass, [["X"], ["Y"], ["Z"]])
        self.assertEqual(processor.processed_log_lines, ["X", "Y", "Z"])


if __name__ == '__main__':
    # If running in a script:
    # unittest.main()
    # If running in Jupyter or similar:
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in 3.13;
    # TestLoader.loadTestsFromTestCase() is the supported way to collect the tests.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestLogProcessor)
    runner = unittest.TextTestRunner(verbosity=2)  # verbosity=2 lists each test by name
    runner.run(suite)

test_get_chunks_basic_sliding_window (__main__.TestLogProcessor) ... ok
test_get_chunks_empty_logs (__main__.TestLogProcessor) ... ok
test_get_chunks_non_overlapping (__main__.TestLogProcessor) ... ok
test_get_chunks_step_larger_than_window (__main__.TestLogProcessor) ... ok
test_get_chunks_total_lines_equals_window (__main__.TestLogProcessor) ... ok
test_get_chunks_total_lines_less_than_window (__main__.TestLogProcessor) ... ok
test_get_chunks_with_line_truncation (__main__.TestLogProcessor) ... ok
test_init_invalid_config_value_non_positive (__main__.TestLogProcessor) ... ok
test_init_invalid_config_value_type (__main__.TestLogProcessor) ... ok
test_init_missing_config_key (__main__.TestLogProcessor) ... ok
test_init_non_dict_config (__main__.TestLogProcessor) ... ok
test_load_empty_file (__main__.TestLogProcessor) ... ok
test_load_empty_list (__main__.TestLogProcessor) ... ok
test_load_from_file_and_truncate (__main__.TestLogProcessor) ... ok
test_load_from_list_and_truncate (__main

In [48]:
import os
import re
import litellm
from typing import List, Dict, Any, Optional
from dataclasses import dataclass

class LogAnalysisLLM:
    """Wrapper class for LiteLLM to analyze log files using Gemma 3:4b via Ollama"""

    def __init__(
        self,
        model_name: str = "gemma3:4b",
        base_url: str = "http://localhost:11434",
        temperature: float = 0.1,
        max_tokens: int = 1000,
    ):
        """
        Initialize the LogAnalysisLLM with specified model

        Args:
            model_name: The name of the model to use (default: gemma3:4b)
            base_url: The base URL for Ollama API (default: http://localhost:11434)
            temperature: Sampling temperature passed to the completion call
                (default: 0.1, low for more deterministic responses)
            max_tokens: Value passed as max_tokens to the completion call
                (default: 1000)
        """
        self.model_name = model_name
        self.base_url = base_url
        self.temperature = temperature
        self.max_tokens = max_tokens

    def analyze_logs(self, log_content: str, system_prompt: str, user_prompt: Optional[str] = None) -> str:
        """
        Send log content to the LLM for analysis

        The user message is the user prompt (or a default instruction when
        none is given), a blank line, and then the log content.

        Args:
            log_content: The content of the log file to analyze
            system_prompt: The system prompt to guide the LLM's analysis
            user_prompt: Optional user prompt to be sent along with log content

        Returns:
            The LLM's response as a string. If the request raises, the
            exception is not propagated; instead a string starting with
            "Error in LLM request: " is returned, so callers that need to
            tell failures apart from model output must check for that prefix.
            An empty string is returned if the response carries no content.
        """
        # Fall back to a default instruction when the caller supplies none.
        instruction = "Analyze the following log file:" if user_prompt is None else user_prompt
        user_message = f"{instruction}\n\n{log_content}"

        try:
            response = litellm.completion(
                model="ollama/" + self.model_name,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_message}
                ],
                api_base=self.base_url,
                temperature=self.temperature,
                max_tokens=self.max_tokens
            )
            content = response.choices[0].message.content
        except Exception as e:
            return f"Error in LLM request: {str(e)}"

        # NOTE(review): content is presumably Optional, as in OpenAI-style
        # responses - confirm for the Ollama provider. Normalising None to ""
        # keeps the declared `str` return type, which callers rely on.
        return content if content is not None else ""

def read_log_file(file_path: str) -> str:
    """
    Read a log file and return its contents as a string.

    The file is decoded as UTF-8 regardless of the platform default, and
    undecodable bytes are replaced with U+FFFD so a single bad byte does not
    discard the whole log.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents, or "" if the file could not be read (the error is
        printed rather than raised). An empty file also yields "", so callers
        cannot tell the two cases apart from the return value alone.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()
    except Exception as e:
        # Best-effort by design: report the problem and let the caller decide.
        print(f"Error reading file {file_path}: {str(e)}")
        return ""

def evaluate_task1_response(response: str, expected_count: int = 6) -> bool:
    """
    Evaluate the response for Task 1.1 (count 403 status codes)
    Expected answer: 6 (5 originally + 1 we added for 203.0.113.22)

    The response passes if the expected count appears as a standalone number.
    Digits that are part of a longer number, an identifier, a decimal, or a
    dotted quad (for example the final octet of "192.0.2.6") do not count.

    Args:
        response: The LLM's answer text.
        expected_count: The count that must appear (default: 6).

    Returns:
        True if the expected count is present as a standalone number.
    """
    number = re.escape(str(expected_count))
    # (?<![\w.])  not preceded by a word character or a dot (rejects "16", "2.6")
    # (?!\w)      not followed by a word character (rejects "60", "6th")
    # (?!\.\d)    not followed by ".<digit>" (rejects "6.5"), while still
    #             accepting a sentence-ending "6."
    standalone = rf'(?<![\w.]){number}(?!\w)(?!\.\d)'
    return re.search(standalone, response) is not None

def evaluate_task2_response(response: str) -> bool:
    """
    Evaluate the response for Task 1.2 (list IPs with 403 status)
    Expected IPs: 192.0.2.101, 198.51.100.55, 203.0.113.22, 192.0.2.105

    Addresses are matched as whole dotted quads, so "192.0.2.1011" does not
    count as "192.0.2.101". Additional addresses in the response are not
    penalised.

    Args:
        response: The LLM's answer text.

    Returns:
        True if every expected IP address appears in the response.
    """
    required_ips = ['192.0.2.101', '198.51.100.55', '203.0.113.22', '192.0.2.105']
    # A dotted quad that is not embedded in a longer run of digits and dots;
    # a trailing sentence period ("... 192.0.2.105.") is still accepted.
    ip_token = r'(?<![\d.])\d{1,3}(?:\.\d{1,3}){3}(?!\d)(?!\.\d)'
    mentioned_ips = set(re.findall(ip_token, response))
    return all(ip in mentioned_ips for ip in required_ips)

def evaluate_task3_response(response: str) -> bool:
    """
    Evaluate the response for Task 1.3 (IPs with 403 but no subsequent 200)
    Expected IPs: 203.0.113.22, 192.0.2.105

    Addresses are matched as whole dotted quads, so a longer address such as
    "192.0.2.1010" is not mistaken for the excluded "192.0.2.101".

    Args:
        response: The LLM's answer text.

    Returns:
        True if both expected IPs appear and neither excluded IP appears
        anywhere in the response.
    """
    # Note: 192.0.2.101 and 198.51.100.55 later have 200s, so should not be in response
    required_ips = ['203.0.113.22', '192.0.2.105']
    excluded_ips = ['192.0.2.101', '198.51.100.55']

    # A dotted quad that is not embedded in a longer run of digits and dots;
    # a trailing sentence period is still accepted.
    ip_token = r'(?<![\d.])\d{1,3}(?:\.\d{1,3}){3}(?!\d)(?!\.\d)'
    mentioned_ips = set(re.findall(ip_token, response))

    return mentioned_ips.issuperset(required_ips) and mentioned_ips.isdisjoint(excluded_ips)

def main():
    """
    Run the three log-analysis tasks against ./evals/security_simple.log,
    first one prompt per task and then all three in a single prompt, printing
    each response, its evaluation, and a final pass/fail summary.

    Returns early (after printing a message) if the log file cannot be read
    or is empty.
    """
    llm = LogAnalysisLLM()

    log_content = read_log_file("./evals/security_simple.log")
    if not log_content:
        print("Failed to read log file. Exiting.")
        return

    # System prompts, keyed by task
    system_prompts = {
        "task1": """You are a cybersecurity log analyzer. Your task is to count how many HTTP requests in the log file resulted in a 403 Forbidden status code.
Provide only the count as a number without any explanation.""",

        "task2": """You are a cybersecurity log analyzer. Your task is to list all unique IP addresses that made HTTP requests that resulted in a 403 Forbidden status code.
List each IP address on a separate line without any additional text.""",

        "task3": """You are a cybersecurity log analyzer. Your task is to identify all IP addresses that made requests resulting in a 403 Forbidden status code
but did not subsequently make any successful requests (with 200 status code). List only these IP addresses.""",

        "all_tasks": """You are a cybersecurity log analyzer. Analyze the provided log file to answer these three questions:

1. How many HTTP requests in the log resulted in a 403 Forbidden status code?
2. List all unique IP addresses that made HTTP requests resulting in a 403 Forbidden status code.
3. List all unique IP addresses that made requests resulting in a 403 Forbidden status code but did not subsequently make any successful requests (with 200 status code).

For each question, provide a clear, concise answer with the exact information requested. Be very precise in your analysis."""
    }

    # (prompt/result key, banner line, grading function) for the single-task runs
    single_task_runs = (
        ("task1", "Running Task 1.1: Count 403 status codes", evaluate_task1_response),
        ("task2", "Running Task 1.2: List IPs with 403 status", evaluate_task2_response),
        ("task3", "Running Task 1.3: List IPs with 403 but no subsequent 200", evaluate_task3_response),
    )

    results = {}
    single_verdicts = []
    for key, banner, grade in single_task_runs:
        print(banner)
        results[key] = llm.analyze_logs(log_content, system_prompts[key])
        passed = grade(results[key])
        print(f"Response: {results[key]}")
        print(f"Correct: {passed}\n")
        single_verdicts.append(passed)

    # One request covering all three questions; the single reply is graded by
    # each task's evaluator in turn.
    # NOTE(review): evaluate_task3_response rejects any reply that mentions
    # the IPs task 2 requires, so a combined reply that answers question 2
    # correctly cannot also pass the task 3 check - confirm this is intended.
    print("Running All Tasks Combined")
    results["all_tasks"] = llm.analyze_logs(log_content, system_prompts["all_tasks"])
    combined_verdicts = [grade(results["all_tasks"]) for _, _, grade in single_task_runs]
    print(f"Response:\n{results['all_tasks']}")
    for task_number, passed in enumerate(combined_verdicts, start=1):
        print(f"Task {task_number} Correct: {passed}")

    # Overall results summary
    print("\n===== SUMMARY =====")
    for task_number, passed in enumerate(single_verdicts, start=1):
        print(f"Task 1.{task_number}: {'✅ PASS' if passed else '❌ FAIL'}")
    for task_number, passed in enumerate(combined_verdicts, start=1):
        print(f"All Tasks (Task 1.{task_number}): {'✅ PASS' if passed else '❌ FAIL'}")

# Entry point: calls main() when this code runs as the top-level module
# (__name__ == "__main__") and does nothing when it is imported.
if __name__ == "__main__":
    main()

Running Task 1.1: Count 403 status codes
Response: Okay, I've analyzed the provided log data. Here's a breakdown of the key observations and potential insights:

**Overall Trends & Observations:**

* **Variety of HTTP Methods:** The log data includes a mix of GET, POST, and likely other methods (though not explicitly shown). This indicates a diverse range of operations being performed.
* **403 Forbidden Errors:** A significant number of requests result in 403 Forbidden errors. This suggests access control issues, potentially due to incorrect permissions, authentication problems, or misconfigured security rules.  The repeated 403s are a critical area to investigate.
* **404 Not Found Errors:**  Several requests result in 404 Not Found errors. This indicates resources are missing or URLs are incorrect.
* **200 OK Responses:** A large portion of requests return a 200 OK status, indicating successful operations.
* **Session Management:** The "logout" request (with a 200 OK) suggests a sess

In [51]:
import sys

def run_with_custom_prompts():
    """
    Run the log analysis with custom system prompts.

    For each task a "direct" and a "detailed" prompt variant is sent to the
    model and graded with that task's evaluator; two combined prompts that ask
    all three questions at once are then graded with all three evaluators.
    Every response and verdict is printed. Returns early (after printing a
    message) if the log file cannot be read or is empty.
    """

    def verdict(ok):
        # Label used on every "Correct:" line.
        return '✅ Yes' if ok else '❌ No'

    llm = LogAnalysisLLM(model_name="gemma3:4b")

    log_path = "./evals/security_simple.log"
    log_content = read_log_file(log_path)
    if not log_content:
        print(f"Failed to read log file at {log_path}. Exiting.")
        return

    # Custom system prompts - these can be modified as needed
    custom_prompts = {
        "task1_direct": """You are analyzing security logs. Count exactly how many lines in the log file contain a 403 status code. 
Output only the number, with no explanations, rationale, or additional text.""",

        "task1_detailed": """You are a security analyst examining web server logs. Your task is to count the occurrences of 403 Forbidden status codes.
Follow these steps:
1. Scan each line for the status code field
2. Increment a counter each time you find "403"
3. Report the final count

Format your response as: "Number of 403 status codes: [count]".""",

        "task2_direct": """Identify all unique IP addresses that made requests resulting in a 403 Forbidden status code.
List only the IP addresses, one per line, with no additional text.""",

        "task2_detailed": """You are investigating potential security breach attempts. Find all IP addresses that received HTTP 403 Forbidden responses.
For each line in the log:
1. Check if the status code is 403
2. If yes, extract the IP address
3. Add it to a running list of unique IPs

Format your response as a list with header "IPs with 403 responses:" followed by the unique IP addresses, one per line.""",

        "task3_direct": """Find all IP addresses that received a 403 status code but never subsequently made a successful request (200 status code).
List only those IP addresses, one per line, with no other text.""",

        "task3_detailed": """You are detecting persistent unauthorized access attempts.
Your task is to find IP addresses that:
1. Received at least one 403 Forbidden response
2. Did not later succeed with a 200 OK response

For each IP that had a 403:
- Check if that same IP ever has a 200 status code in a later log entry
- If it never succeeds, include it in your results

Format your response with a brief explanation and then list the IP addresses.""",

        "all_tasks_oneshot": """You are analyzing web server logs to identify security patterns. Answer these three questions:

1. How many total log entries have a 403 Forbidden status code?
2. What are all the unique IP addresses that received at least one 403 Forbidden response?
3. Which IP addresses received 403 Forbidden responses but never subsequently made a successful request (200 OK)?

For question 1: Provide just the number.
For questions 2 and 3: List each IP on a separate line.
Label each answer clearly.""",

        "all_tasks_cot": """You are analyzing web server security logs. Perform three analysis tasks:

Task 1: Count how many log entries have a 403 Forbidden status code.
Task 2: Extract all unique IP addresses that received 403 Forbidden responses.
Task 3: Identify IP addresses that received 403 Forbidden responses but never successfully connected later (never got a 200 status code in a subsequent request).

Use step-by-step reasoning for each task:
- For Task 1: Go through each line, check for 403 status, and keep a running count.
- For Task 2: Collect all IPs with 403 status, ensuring no duplicates.
- For Task 3: For each IP with a 403, check if it has any 200 responses later in the log.

Format each answer separately and clearly."""
    }

    print("====== Testing Different System Prompts ======\n")

    # (section heading, prompt keys, grading function, text between "Response:"
    # and the reply). Task 1 prints the reply on the same line; the others
    # start it on a new line.
    single_task_sections = (
        ("--- Task 1: Count 403 Status Codes ---",
         ("task1_direct", "task1_detailed"), evaluate_task1_response, " "),
        ("\n--- Task 2: List IPs with 403 Status ---",
         ("task2_direct", "task2_detailed"), evaluate_task2_response, "\n"),
        ("\n--- Task 3: List IPs with 403 But No Subsequent 200 ---",
         ("task3_direct", "task3_detailed"), evaluate_task3_response, "\n"),
    )

    for heading, prompt_keys, grade, separator in single_task_sections:
        print(heading)
        for prompt_key in prompt_keys:
            print(f"\nUsing prompt: {prompt_key}")
            response = llm.analyze_logs(log_content, custom_prompts[prompt_key])
            is_correct = grade(response)
            print(f"Response:{separator}{response.strip()}")
            print(f"Correct: {verdict(is_correct)}")

    # Combined prompts: one reply, graded by all three evaluators in task order.
    print("\n--- All Tasks Combined ---")
    graders = (evaluate_task1_response, evaluate_task2_response, evaluate_task3_response)

    for prompt_key in ("all_tasks_oneshot", "all_tasks_cot"):
        print(f"\nUsing prompt: {prompt_key}")
        response = llm.analyze_logs(log_content, custom_prompts[prompt_key])
        outcomes = [grade(response) for grade in graders]
        print(f"Response:\n{response.strip()}")
        for task_number, ok in enumerate(outcomes, start=1):
            print(f"Task {task_number} Correct: {verdict(ok)}")

# Entry point: calls run_with_custom_prompts() when this code runs as the
# top-level module (__name__ == "__main__") and does nothing when imported.
if __name__ == "__main__":
    run_with_custom_prompts()


--- Task 1: Count 403 Status Codes ---

Using prompt: task1_direct
Response: Okay, I've analyzed the provided log data. Here's a breakdown of the key observations and potential insights:

**Overall Trends & Observations:**

* **Mixed HTTP Status Codes:** The data contains a significant number of different HTTP status codes, indicating a complex system with various endpoints and potential issues.
* **403 Forbidden:** The "403 Forbidden" status code appears frequently, suggesting access control problems. This could be due to incorrect permissions, authentication issues, or misconfigured access rules.
* **404 Not Found:**  The "404 Not Found" status code is also present, likely indicating that some requested resources don't exist or that URLs are incorrect.
* **502 Bad Gateway:** The presence of a 502 status code suggests a problem with a backend server. This could be due to issues with the server itself, network connectivity, or problems with dependent services.
* **Diverse Client Agent

In [53]:
"""
Benchmark runner for log analysis tasks
This script runs all the log analysis tasks and reports the results
"""

import os
import time
import json

def prepare_environment(
    log_path="./evals/security_simple.log",
    source_path="paste.txt",
    insert_position=36,
):
    """Create the benchmark log file if it is not already on disk.

    The benchmark log is the content of ``source_path`` with one extra
    403 entry (client 203.0.113.22) spliced in. A file that already exists
    at ``log_path`` is left untouched.

    Args:
        log_path: Destination of the benchmark log. Its parent directory is
            created when missing.
        source_path: File holding the original log content; it is read
            through ``read_log_file``.
        insert_position: Zero-based index into the list of source lines at
            which the extra entry is inserted. The default of 36 is the
            position the original code used ("after the 10:02:40
            timestamp"). An index past the end appends the entry.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    parent_dir = os.path.dirname(log_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if os.path.exists(log_path):
        print(f"Log file already exists at {log_path}")
        return

    print(f"Creating log file at {log_path}")
    original_content = read_log_file(source_path)
    # read_log_file is defined elsewhere in this notebook; run_benchmarks
    # treats a falsy return value as a failed read, so apply the same check
    # here instead of crashing on `.strip()`.
    if not original_content:
        print(f"Could not read source log {source_path}; {log_path} was not created.")
        return

    # Extra 403 entry for 203.0.113.22 that the source content lacks.
    additional_line = '"2025-05-09T10:02:42.789012Z" "GET" "/api/v1/protected/reports" "403" "203.0.113.22" "MyClient/1.1" "SESS_pqr678" "/reports/dashboard" "Reports_Access_Forbidden"'

    lines = original_content.strip().split('\n')
    lines.insert(insert_position, additional_line)

    with open(log_path, 'w') as f:
        f.write('\n'.join(lines))
    print(f"Created log file with {len(lines)} lines")

def run_benchmarks():
    """Run every log-analysis prompt against the LLM, score it, and report.

    Each prompt in the task table is sent to a fresh ``LogAnalysisLLM``
    together with the content of ``./evals/security_simple.log``. Prompts
    with a dedicated evaluator get a single ``correct`` flag; the combined
    "All Tasks" prompt is scored with all three evaluators and stored under
    ``subtasks``. Responses, timings and verdicts are written to
    ``benchmark_results.json`` in the current working directory, and a
    summary is printed.

    Returns:
        None. If the log file cannot be read, a message is printed and the
        function returns before any request is made.
    """
    def verdict(flag, yes, no):
        # Render a truthy/falsy evaluation result as a fixed label.
        return yes if flag else no

    # Initialize the LLM wrapper
    llm = LogAnalysisLLM()

    # Read the log file
    log_path = "./evals/security_simple.log"
    log_content = read_log_file(log_path)
    if not log_content:
        print(f"Failed to read log file at {log_path}. Exiting.")
        return

    # System prompts, in the order they are benchmarked
    task_prompts = {
        "Task 1 (Count 403)": """You are examining web server logs. Count exactly how many HTTP requests resulted in a 403 Forbidden status code.
Output only the count as a number and nothing else.""",

        "Task 2 (List 403 IPs)": """You are examining web server logs. List all unique IP addresses that made HTTP requests that resulted in a 403 Forbidden status code.
List only the IP addresses, one per line, with no additional text.""",

        "Task 3 (IPs with 403 but no 200)": """You are examining web server logs. Find all IP addresses that received a 403 Forbidden status code but never subsequently made a successful request (with 200 status code).
List only those IP addresses, one per line, with no other text.""",

        "All Tasks": """Analyze these web server logs to answer three questions:

1. How many log entries have a 403 Forbidden status code?
2. List all unique IP addresses that received 403 Forbidden responses.
3. List all IP addresses that received 403 Forbidden responses but never subsequently made successful 200 requests.

Label your answers clearly."""
    }

    # Evaluator for each single-question prompt; prompts absent from this
    # table are scored with all three evaluators.
    eval_functions = {
        "Task 1 (Count 403)": evaluate_task1_response,
        "Task 2 (List 403 IPs)": evaluate_task2_response,
        "Task 3 (IPs with 403 but no 200)": evaluate_task3_response
    }

    benchmark_results = {}

    for task_name, prompt in task_prompts.items():
        print(f"\nRunning {task_name}...")
        # perf_counter is monotonic and intended for measuring intervals;
        # time.time() follows the wall clock, which can be adjusted mid-run.
        start_time = time.perf_counter()
        response = llm.analyze_logs(log_content, prompt)
        elapsed = time.perf_counter() - start_time

        record = {
            "prompt": prompt,
            "response": response,
            "execution_time": elapsed
        }
        benchmark_results[task_name] = record

        if task_name in eval_functions:
            # Single-question prompt: one verdict
            record["correct"] = eval_functions[task_name](response)
            print(f"Response: {response.strip()}")
            print(f"Correct: {verdict(record['correct'], '✅ Yes', '❌ No')}")
        else:
            # Combined prompt: score each sub-question on the same response
            print(f"Response:\n{response.strip()}")
            task1_correct = evaluate_task1_response(response)
            task2_correct = evaluate_task2_response(response)
            task3_correct = evaluate_task3_response(response)

            record["subtasks"] = {
                "Task 1": task1_correct,
                "Task 2": task2_correct,
                "Task 3": task3_correct
            }
            for subtask, correct in record["subtasks"].items():
                print(f"{subtask} Correct: {verdict(correct, '✅ Yes', '❌ No')}")
        print(f"Time: {elapsed:.2f} seconds")

    # Save benchmark results to file
    with open("benchmark_results.json", "w") as f:
        json.dump(benchmark_results, f, indent=2)

    # Print summary. Branch on what was recorded rather than on a hard-coded
    # task name, so this stays in step with the run loop above.
    print("\n===== BENCHMARK SUMMARY =====")
    for task_name, result in benchmark_results.items():
        timing = f"{result['execution_time']:.2f}s"
        if "subtasks" in result:
            subtasks = result["subtasks"]
            overall = all(subtasks.values())
            print(f"{task_name}: {verdict(overall, '✅ PASS', '❌ FAIL')} - {timing}")
            for subtask, correct in subtasks.items():
                print(f"  - {subtask}: {verdict(correct, '✅ PASS', '❌ FAIL')}")
        else:
            print(f"{task_name}: {verdict(result['correct'], '✅ PASS', '❌ FAIL')} - {timing}")

# Entry point for this cell: make sure the benchmark log file exists, then
# run every benchmark task against it.
if __name__ == "__main__":
    prepare_environment()
    run_benchmarks()

Log file already exists at ./evals/security_simple.log

Running Task 1 (Count 403)...
Response: Okay, I've analyzed the provided log data. Here's a breakdown of the key observations and potential insights:

**Overall Trends & Observations:**

* **Diverse Request Types:** The log data contains a wide range of HTTP request methods (GET, POST, Logout). This indicates a complex application with various functionalities.
* **403 Forbidden Errors:** There are several 403 errors, suggesting access control issues.  These are frequently associated with authentication or authorization problems. The requests with 403 errors often involve protected resources like `/api/v1/protected/reports`.
* **404 Not Found Errors:**  A few 404 errors are present, likely due to resources that don't exist or routes that are misconfigured. The "Resource Actually 404 Not 403" entry is particularly interesting – it suggests a misconfiguration where a route intended to return a 403 is actually returning a 404.
* **Log