**Assumptions & Simplifications (Hackathon Context):**
-   **MindsDB is Mocked:** Interactions for maintenance logs and operational context are simulated.
-   **Data Source:** We use `attack_z.csv` to get samples, assuming it represents incoming pre-processed data.
-   **Focus:** Demonstrate model integration with CrewAI for an end-to-end (simulated) flow.

## Imports and Environment Setup

In [13]:
from crewai import Agent, Task, Crew, Process
import joblib

In [14]:


# --- LLM Configuration for Anthropic Claude ---
from langchain_anthropic import ChatAnthropic
import os # Ensure os is imported if not already in this cell or a prior one that's run

# Look the Anthropic key up in the process environment.
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
if not anthropic_api_key:
    # No usable key: leave `llm` as None so the failure is detectable afterwards.
    llm = None
    print("ANTHROPIC_API_KEY NOT FOUND in environment. Please add it to your .env file (e.g., ANTHROPIC_API_KEY='your_key').")
    print("The agent will likely fail or attempt to use a default LLM if not configured.")
else:
    print(f"ANTHROPIC_API_KEY found in environment. Starts with: {anthropic_api_key[:10]}...")
    # Haiku is used here as the fast, cost-effective option; other Claude models
    # (e.g. claude-3-sonnet-20240229, claude-3-opus-20240229) can be substituted.
    llm = ChatAnthropic(
        model_name="claude-3-haiku-20240307",
        anthropic_api_key=anthropic_api_key,
        temperature=0.7,
    )
    print("Anthropic Claude LLM (claude-3-haiku-20240307) initialized.")

ANTHROPIC_API_KEY found in environment. Starts with: sk-ant-api...
Anthropic Claude LLM (claude-3-haiku-20240307) initialized.


In [15]:
import os
from dotenv import load_dotenv
import pandas as pd # Keep other necessary imports too

# --- Environment Setup ---
# The .env file is looked up in the parent of the current working directory,
# i.e. the notebook is assumed to run from a subdirectory of the project root.

try:
    current_notebook_dir = os.getcwd()
    project_root = os.path.abspath(os.path.join(current_notebook_dir, os.pardir))
    dotenv_path = os.path.join(project_root, '.env')

    print(f"Attempting to load .env file from: {dotenv_path}")
    if not os.path.exists(dotenv_path):
        print(f".env file NOT FOUND at {dotenv_path}. Please ensure it exists there.")
        print("ANTHROPIC_API_KEY will not be loaded from this file.")
    else:
        # override=True makes a re-run of this cell pick up edits to the .env file.
        load_dotenv(dotenv_path=dotenv_path, override=True)
        print(".env file loaded.")

        # Sanity check that the key actually reached the environment.
        # NOTE: despite its name, `openai_api_key` holds the ANTHROPIC_API_KEY value.
        openai_api_key = os.environ.get("ANTHROPIC_API_KEY")
        if not openai_api_key:
            print("ANTHROPIC_API_KEY NOT FOUND in environment after loading .env!")
        else:
            print(f"ANTHROPIC_API_KEY found in environment. Starts with: {openai_api_key[:10]}...")

except Exception as e:
    # Best-effort setup: report the problem instead of aborting the notebook.
    print(f"An error occurred during .env loading: {e}")

# --- REST OF YOUR IMPORTS AND SETUP CODE (like loading anomaly_model, attack_z_df, etc.) ---
# Make sure all CrewAI agent, task, and crew definitions happen AFTER this .env loading block.

Attempting to load .env file from: /Users/tim/myhub.com/us-infra-avengers/bkd/.env
.env file loaded.
ANTHROPIC_API_KEY found in environment. Starts with: sk-ant-api...


In [16]:
# --- LLM Configuration for Anthropic Claude ---
from langchain_anthropic import ChatAnthropic
import os

# Fetch the key from the environment at the moment the LLM is configured.
retrieved_anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")

if not retrieved_anthropic_api_key:
    print("ERROR: ANTHROPIC_API_KEY was NOT found in os.environ at the point of LLM setup.")
    print("Please ensure the .env loading cell was run successfully before this cell.")
    llm = None
else:
    print(f"ANTHROPIC_API_KEY confirmed available for LLM setup. Starts with: {retrieved_anthropic_api_key[:10]}...")

    # Model identifier; swap in another Claude model name here if desired.
    anthropic_model_name = "claude-3-haiku-20240307"

    # Explicit token limit, timeout and retry count are passed alongside the model.
    llm = ChatAnthropic(
        model=anthropic_model_name,
        anthropic_api_key=retrieved_anthropic_api_key,
        temperature=0.7,
        max_tokens=4000,
        timeout=60,
        max_retries=3,
    )

    # Smoke test: a single invoke call; on any failure `llm` is reset to None
    # so that downstream cells can skip agent creation.
    try:
        test_response = llm.invoke("Test message")
        print(f"LLM test successful. Response type: {type(test_response)}")
        print(f"Anthropic Claude LLM ({anthropic_model_name}) initialized and tested successfully.")
    except Exception as e:
        print(f"LLM test failed: {e}")
        print("There may be an issue with the LLM configuration.")
        llm = None

ANTHROPIC_API_KEY confirmed available for LLM setup. Starts with: sk-ant-api...
LLM test successful. Response type: <class 'langchain_core.messages.ai.AIMessage'>
Anthropic Claude LLM (claude-3-haiku-20240307) initialized and tested successfully.


In [17]:
from crewai.tools import tool # Ensure this is imported if not done in Cell 3

# Mocked maintenance-log lookup: maps a sensor ID to a canned maintenance summary.
# Read by the "Maintenance Log Checker" tool and by the anomalous-sensor heuristic
# in process_data_and_trigger_crew (membership test on the keys).
mock_maintenance_knowledge = {
    "LIT101": "Recent maintenance on LIT101 involved a sensor calibration completed 2 hours before the anomaly. Sensor readings might still be settling. Monitor for stability.",
    "P101": "No recent maintenance scheduled or performed on P101 pump.",
    "FIT201": "FIT201 underwent a firmware update yesterday. Unlikely related to immediate operational anomalies unless the update failed.",
    "AIT501": "Scheduled cleaning for AIT501 is due next week. No active maintenance.",
    "MV501": "MV501 valve actuator was inspected this morning; no issues reported. Normal operation expected.",
    "PIT501": "PIT501 pressure sensor was replaced 3 days ago due to erratic readings. New sensor should be stable."
}

# Mocked operational-context lookup: maps a sensor ID to a canned description of
# the sensor's role and what anomalies on it may imply.
# Read by the "Operational Context Retriever" tool. Uses the same keys as
# mock_maintenance_knowledge.
mock_operational_knowledge = {
    "LIT101": "LIT101 (Tank Level - Process 1) is critical. Sustained anomalies can lead to overflow or dry run. Brief spikes post-calibration are sometimes observed but should stabilize. Cross-reference with P101 and MV101.",
    "P101": "P101 is a primary raw water pump (Process 1). Unexpected stoppage or erratic behavior is high-priority. Check power and downstream flow (FIT101/FIT201).",
    "FIT201": "FIT201 (Flow Rate - Process 2) anomalies can indicate pump issues, blockages, or leaks. Correlate with LIT101/LIT301.",
    "AIT501": "AIT501 measures water quality (e.g., conductivity - Process 5). Gradual drifts might indicate sensor fouling. Sudden changes can signify contamination.",
    "MV501": "MV501 is a motorized valve in Process 5. Failure can disrupt flow. Check actuator and feedback.",
    "PIT501": "PIT501 measures pressure in Process 5. Abnormal pressure can indicate blockages, leaks, or pump failures. Correlated with P501."
}

@tool("Maintenance Log Checker")
def maintenance_tool(sensor_id: str) -> str:
    """Checks and returns maintenance log summary for a given SCADA sensor ID. Input should be the sensor_id string."""
    # NOTE(review): the docstring above is presumably used as the tool description
    # shown to the LLM - confirm before rewording it.
    #
    # Returns the canned summary from mock_maintenance_knowledge, or a
    # "no information found" message when the sensor is unknown.
    # "\n" (not "\\n") so the trace line starts on a new line instead of printing
    # a literal backslash-n.
    print(f"\n<Tool Call: Maintenance Log Checker(sensor_id='{sensor_id}')>")
    # Agents frequently pass IDs with stray whitespace or different casing; the
    # mock keys are all upper-case, so normalise for the lookup only.
    lookup_key = str(sensor_id).strip().upper()
    response = mock_maintenance_knowledge.get(lookup_key, f"No specific recent maintenance information found for sensor {sensor_id}.")
    print(f"<Tool Response: {response}>\n")
    return response

@tool("Operational Context Retriever")
def operational_context_tool(sensor_id: str) -> str:
    """Retrieves and returns operational context, known behaviors, and potential implications for a given SCADA sensor ID. Input should be the sensor_id string."""
    # NOTE(review): the docstring above is presumably used as the tool description
    # shown to the LLM - confirm before rewording it.
    #
    # Returns the canned context from mock_operational_knowledge, or a
    # "no context available" message when the sensor is unknown.
    # "\n" (not "\\n") so the trace line starts on a new line instead of printing
    # a literal backslash-n.
    print(f"\n<Tool Call: Operational Context Retriever(sensor_id='{sensor_id}')>")
    # Agents frequently pass IDs with stray whitespace or different casing; the
    # mock keys are all upper-case, so normalise for the lookup only.
    lookup_key = str(sensor_id).strip().upper()
    response = mock_operational_knowledge.get(lookup_key, f"No specific operational context available for sensor {sensor_id}.")
    print(f"<Tool Response: {response}>\n")
    return response

## Load Trained Anomaly Detection Model

In [18]:
# Location of the persisted detector, relative to the project root.
model_path = os.path.join(project_root, 'models', 'iforest_swat_model.pkl')
print(f"Loading model from: {model_path}")

# `anomaly_model` stays None unless the load below succeeds; later cells check it.
# NOTE(review): joblib.load unpickles the file, so only load model files you trust.
anomaly_model = None
try:
    anomaly_model = joblib.load(model_path)
    print("Anomaly detection model loaded successfully.")
except FileNotFoundError:
    print(f"ERROR: Model file not found at {model_path}. Please ensure '02_model_training.ipynb' was run and the model was saved.")
except Exception as e:
    print(f"ERROR: Could not load the model. {e}")

Loading model from: /Users/tim/myhub.com/us-infra-avengers/bkd/models/iforest_swat_model.pkl
Anomaly detection model loaded successfully.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## Define SCADA Triage Agent

In [19]:
# The agent can only be built when an LLM was initialised; otherwise it is set to
# None so the crew-assembly step can detect the failure.
if llm is None:
    print("ERROR: Cannot create agent - LLM is not properly initialized.")
    scada_triage_specialist = None
else:
    scada_triage_specialist = Agent(
        role='SCADA Anomaly Triage Specialist',
        goal="Analyze SCADA sensor anomalies and provide Assessment and RecommendedAction.",
        backstory="You are an expert SCADA analyst who determines if anomalies need urgent attention.",
        llm=llm,
        tools=[maintenance_tool, operational_context_tool],
        allow_delegation=False,
        verbose=True,            # verbose output kept on for debugging
        max_iter=3,              # iteration limit
        max_execution_time=180,  # execution timeout (3 minutes)
    )
    print("SCADA Triage Specialist agent created successfully.")

SCADA Triage Specialist agent created successfully.


## Define Triage Task

In [20]:
# Single triage task for the specialist agent. The {anomaly_details} placeholder
# in the description is filled from the `inputs` dict passed to crew.kickoff().
triage_task_definition = Task(
    agent=scada_triage_specialist,
    expected_output="Assessment: [analysis]\nRecommendedAction: [recommendation]",
    description="""
    Analyze this anomaly: {anomaly_details}

    1. Check maintenance history using the Maintenance Log Checker tool
    2. If needed, get operational context using the Operational Context Retriever tool
    3. Provide your analysis in this exact format:

    Assessment: [Your assessment]
    RecommendedAction: [Your recommendation]
    """,
)

## Assemble Crew

In [21]:
# Assemble the one-agent, one-task crew; fall back to None when either piece is missing.
if scada_triage_specialist is None or triage_task_definition is None:
    print("ERROR: Cannot create crew - agent or task not properly initialized.")
    scada_crew = None
else:
    scada_crew = Crew(
        agents=[scada_triage_specialist],
        tasks=[triage_task_definition],
        process=Process.sequential,
        verbose=True,  # verbose output kept on for debugging
    )
    print("SCADA Crew assembled successfully.")

SCADA Crew assembled successfully.


## Process Data Sample with Model and Trigger Crew

In [22]:
# Cell 8: Define the main simulation loop and supporting functions

def _select_anomalous_sensor(sample_df, preferred_sensors):
    """Return the column name to report for a one-row sample, or None if it has no columns.

    Sensors listed in `preferred_sensors` are considered first; among them the one
    with the largest absolute value wins. If none of them is present (or all are
    NaN), the largest absolute value across all columns is used. As a last resort
    the first column is returned.
    """
    if len(sample_df.columns) == 0:
        return None

    # Coerce to numeric so idxmax also works when the row arrived with object dtype.
    magnitudes = pd.to_numeric(sample_df.iloc[0], errors="coerce").abs()
    preferred = [col for col in sample_df.columns if col in preferred_sensors]

    for candidates in (preferred, list(sample_df.columns)):
        ranked = magnitudes[candidates].dropna()
        if not ranked.empty:
            return ranked.idxmax()

    # Every value was NaN / non-numeric: fall back to the first column.
    return sample_df.columns[0]


def process_data_and_trigger_crew(data_sample_series, data_timestamp, model, crew, all_sensor_columns):
    """
    Processes a single data sample using the anomaly detection model
    and triggers the CrewAI crew if an anomaly is detected.

    Args:
        data_sample_series (pd.Series): A single row of sensor readings (features only).
        data_timestamp (pd.Timestamp): The timestamp of the data sample.
        model: The trained anomaly detection model; must expose `decision_function`
            and `predict`. If None, the sample is skipped.
        crew: The CrewAI crew instance. If None, the anomaly is reported on stdout
            but no crew is started.
        all_sensor_columns (list): List of all sensor column names model was trained on.

    Returns:
        None. All results are printed.
    """
    if model is None:
        print("Model not loaded. Skipping prediction.")
        return

    # Ensure the sample is a DataFrame with the correct columns for the model
    sample_df = pd.DataFrame([data_sample_series], columns=all_sensor_columns)

    # Assumes a PyOD-style detector (TODO confirm against the training notebook):
    # predict() returns 1 for an anomaly and 0 for normal, and HIGHER
    # decision_function scores are more anomalous.
    decision_scores = model.decision_function(sample_df)
    prediction = model.predict(sample_df)

    print(f"\nProcessing sample from Timestamp: {data_timestamp}")
    print(f"Raw Anomaly Score: {decision_scores[0]:.4f}")

    if prediction[0] != 1:
        print(f"Status: Data sample at {data_timestamp} classified as NORMAL by the model.")
        return

    print(f"ALERT: Anomaly DETECTED by model at {data_timestamp}!")

    # Simplified attribution heuristic: report the sensor that deviates most,
    # preferring sensors that have mock maintenance data. (Taking merely the first
    # known column would blame the same sensor for every anomaly.)
    anomalous_sensor_id = _select_anomalous_sensor(sample_df, mock_maintenance_knowledge)
    if anomalous_sensor_id is None:  # only possible when sample_df has no columns
        print("Could not determine a specific sensor_id for the anomaly. Skipping crew kickoff.")
        return

    reported_value = sample_df.iloc[0][anomalous_sensor_id]

    # Prepare the string that will fill the {anomaly_details} placeholder in the Task description
    anomaly_details_str_for_task = (
        f"SensorID: {anomalous_sensor_id}, "
        f"Timestamp: {str(data_timestamp)}, "
        f"ReportedValue: {reported_value:.4f}, "
        f"RawScore: {decision_scores[0]:.4f}"
    )
    print(f"Anomaly Details for Crew: {anomaly_details_str_for_task}")

    if crew is None:
        # Avoid an AttributeError traceback when crew assembly failed earlier.
        print("Crew is not available. Skipping crew kickoff.")
        return

    try:
        # The key in the inputs dictionary MUST match the placeholder in the Task's
        # description ("{anomaly_details}").
        print(f"Kicking off crew with inputs: {{'anomaly_details': '{anomaly_details_str_for_task}'}}")
        result = crew.kickoff(inputs={'anomaly_details': anomaly_details_str_for_task})

        print("\n--- CrewAI Triage Result ---")
        print(result)
        print("-----------------------------")
    except Exception as e:
        # Best-effort: one failed triage must not stop the simulation loop.
        print(f"ERROR during crew kickoff: {e}")
        import traceback
        traceback.print_exc()

# --- Your main simulation loop (Cell 8 continued) ---
# This should come AFTER the function definition above in the same cell, or in a subsequent cell.
# Example:
# print("\n--- Starting Simulation with Integrated Model ---")
# ... (load anomaly_model, full_data_df, model_feature_columns) ...
# sample_indices_to_test = [1, 1754, 3150, 3151] # Your example indices

# for i in sample_indices_to_test:
#    if i < len(full_data_df):
#        sample_series_features_only = full_data_df.iloc[i][model_feature_columns]
#        sample_timestamp = full_data_df.index[i]
#        process_data_and_trigger_crew(sample_series_features_only, sample_timestamp, anomaly_model, scada_crew, model_feature_columns)
# print("\n--- Simulation Ended ---")

## Simulation: Load Data, Feed to Model, and Run Triage

In [23]:
# Explicit None check: some estimator classes define __len__, so relying on
# truthiness is fragile.
if anomaly_model is not None:
    data_csv_path = os.path.join(project_root, 'data', 'processed', 'attack_z.csv')
    # "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
    print("\n--- Starting Simulation with Integrated Model ---")
    print(f"Loading data for simulation from: {data_csv_path}")

    try:
        full_data_df = pd.read_csv(data_csv_path, index_col=0, parse_dates=True)

        # The feature names are not read from the model; every CSV column except
        # the 'Normal/Attack' label is assumed to be a model feature.
        model_feature_columns = [col for col in full_data_df.columns if col != 'Normal/Attack']

        if not model_feature_columns:
            print("ERROR: Could not determine model feature columns from the loaded CSV.")
        else:
            print(f"Model expects {len(model_feature_columns)} features. First few: {model_feature_columns[:5]}")

            # Row positions to replay through the model. To exercise the 'normal'
            # path as well, samples from normal_z.csv could be appended here.
            sample_indices_to_test = [1, 1754, 3150, 3151]

            for i in sample_indices_to_test:
                if i >= len(full_data_df):
                    # Report the skip instead of silently ignoring the index.
                    print(f"Skipping sample index {i}: out of range for {len(full_data_df)} rows.")
                    continue

                # iloc[i] on the mixed-dtype frame yields an object Series (because
                # of the label column); cast the feature slice back to float.
                sample_series_features_only = full_data_df.iloc[i][model_feature_columns].astype(float)
                sample_timestamp = full_data_df.index[i]
                process_data_and_trigger_crew(sample_series_features_only, sample_timestamp, anomaly_model, scada_crew, model_feature_columns)

    except FileNotFoundError:
        print(f"ERROR: Data CSV file not found at {data_csv_path}.")
    except Exception as e:
        print(f"An error occurred during simulation: {e}")
else:
    print("Simulation cannot run because the anomaly model was not loaded.")

\n--- Starting Simulation with Integrated Model ---
Loading data for simulation from: /Users/tim/myhub.com/us-infra-avengers/bkd/data/processed/attack_z.csv


Model expects 34 features. First few: ['FIT101', 'LIT101', 'MV101', 'P101', 'P102']

Processing sample from Timestamp: 2015-12-28 10:00:01
Raw Anomaly Score: -0.2519
Status: Data sample at 2015-12-28 10:00:01 classified as NORMAL by the model.

Processing sample from Timestamp: 2015-12-28 10:29:14
Raw Anomaly Score: -0.0906
Status: Data sample at 2015-12-28 10:29:14 classified as NORMAL by the model.

Processing sample from Timestamp: 2015-12-28 10:52:30
Raw Anomaly Score: 0.0110
ALERT: Anomaly DETECTED by model at 2015-12-28 10:52:30!
Anomaly Details for Crew: SensorID: LIT101, Timestamp: 2015-12-28 10:52:30, ReportedValue: 0.4591, RawScore: 0.0110
Kicking off crew with inputs: {'anomaly_details': 'SensorID: LIT101, Timestamp: 2015-12-28 10:52:30, ReportedValue: 0.4591, RawScore: 0.0110'}





--- CrewAI Triage Result ---
Assessment: The anomaly on sensor LIT101 appears to be a brief spike following a recent calibration. The sensor reading of 0.4591 is slightly outside the normal operating range, but this is not uncommon immediately after calibration as readings settle. The operational context indicates this sensor is critical, so any sustained anomaly could lead to serious process issues. However, the maintenance log suggests this is a known behavior.

RecommendedAction: Continue to monitor the LIT101 sensor readings over the next few hours. If the readings stabilize within the normal operating range, no further action is required. If the readings remain erratic or the anomaly persists, cross-reference with related sensors P101 and MV101 to identify any broader process issues. Be prepared to adjust control parameters or take the tank offline if the situation does not resolve.
-----------------------------

Processing sample from Timestamp: 2015-12-28 10:52:31
Raw Anomaly S




--- CrewAI Triage Result ---
Assessment: The anomaly for sensor LIT101 appears to be a temporary fluctuation following a recent calibration. The sensor reading of 0.4565 is within the expected range, but the low RawScore of 0.0138 indicates the reading may not be fully stabilized yet. This type of brief post-calibration anomaly is common and not an immediate concern.
RecommendedAction: Continue monitoring LIT101 for the next few hours to ensure the readings stabilize. No urgent action is required at this time. Log the incident and follow up if the anomaly persists beyond the typical post-calibration settling period.
-----------------------------


In [24]:
import pandas as pd
import os # Often useful for path manipulation

# --- Load Data ---
# The data file is resolved relative to the project root, taken to be the parent
# of the directory this code runs from (e.g. 'notebooks/' -> project root).

# `__file__` exists when run as a .py script; in a notebook it is undefined, so
# fall back to the current working directory.
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()

project_root = os.path.abspath(os.path.join(current_dir, os.pardir))
attack_data_path = os.path.join(project_root, 'data', 'processed', 'attack_z.csv')

print(f"Attempting to load attack data from: {attack_data_path}")

# `attack_z_df` stays None unless the read below succeeds.
attack_z_df = None
try:
    attack_z_df = pd.read_csv(attack_data_path, index_col=0, parse_dates=True)
    print(f"Attack data loaded successfully. Shape: {attack_z_df.shape}")
except FileNotFoundError:
    print(f"ERROR: File not found at {attack_data_path}")
    print("Please ensure the path is correct and the file exists.")
except Exception as e:
    print(f"An error occurred while loading attack_z.csv: {e}")

# Row positions whose ground-truth labels are looked up below.
sample_indices_to_test = [1, 1754, 3150, 3151]

# --- Ground Truth Check (only possible if attack_z_df loaded) ---
print("\n--- Ground Truth for Sample Indices ---")
if attack_z_df is None:
    print("attack_z_df was not loaded, cannot check ground truths.")
else:
    try:
        # Positional row selection first, then the label column.
        ground_truths = attack_z_df.iloc[sample_indices_to_test]['Normal/Attack']
        print(ground_truths)
    except IndexError:
        print(f"Error: One or more indices in {sample_indices_to_test} are out of bounds for attack_z_df (rows: {len(attack_z_df)}).")
    except KeyError:
        print("Error: 'Normal/Attack' column not found in attack_z_df.")


Attempting to load attack data from: /Users/tim/myhub.com/us-infra-avengers/bkd/data/processed/attack_z.csv
Attack data loaded successfully. Shape: (449919, 35)

--- Ground Truth for Sample Indices ---
Timestamp
2015-12-28 10:00:01    Normal
2015-12-28 10:29:14    Attack
2015-12-28 10:52:30    Attack
2015-12-28 10:52:31    Attack
Name: Normal/Attack, dtype: object
