# AutoLog Question-Answering Agent
This notebook implements a LangGraph agent that answers questions about vehicle ownership, costs, and maintenance based on the vehicles.json data.

## 1. Setup and Imports

In [None]:
from dotenv import load_dotenv
from typing import List, Any, Annotated, TypedDict
from datetime import datetime
import json
import gradio

from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, ToolMessage, BaseMessage
from langchain_core.tools import BaseTool, StructuredTool
from langsmith import uuid7
from langchain_community.tools import Tool
from langgraph.checkpoint.sqlite import SqliteSaver
from langgraph.prebuilt import ToolNode
from langgraph.graph.message import add_messages
from langgraph.graph import StateGraph, START, END
from langchain_openai import ChatOpenAI
from IPython.display import Image
import sqlite3

load_dotenv(override=True)

## 2. Load Vehicle Data

In [None]:
# Load vehicles.json
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which is not UTF-8 everywhere).
with open('vehicles.json', 'r', encoding='utf-8') as f:
    vehicles_data = json.load(f)

## 3. Define Data Analysis Tools

In [None]:
def get_analysis_tools() -> List[BaseTool]:
    """Create the tools the agent uses to read and summarise vehicle data.

    Every tool reads the module-level ``vehicles_data`` mapping
    (vehicle key -> vehicle record) when it is called and returns a
    plain-text report for the LLM. Records are read defensively: missing or
    null fields are shown as "N/A" (or counted as 0 for costs) instead of
    raising, so one incomplete record cannot break a tool call.

    Returns:
        Four ``StructuredTool`` objects, in this order: ``search_vehicles``,
        ``get_vehicle_details``, ``query_vehicle_data``,
        ``analyze_all_vehicles``.
    """
    # Single source of truth for the cost categories every report iterates over.
    cost_categories = ('insurance', 'registration', 'repairs', 'fuel', 'miscellaneous')
    # data_type -> category for the simple "N entries, $X total" summaries.
    # Fuel is excluded because it has its own, richer report.
    per_category_types = {f"{cat}_data": cat for cat in cost_categories if cat != 'fuel'}
    valid_data_types = (
        "basic_info, ownership, costs, all_costs_by_category, fuel_data, "
        "insurance_data, repairs_data, registration_data, miscellaneous_data"
    )
    date_format = '%Y-%m-%d'

    def _is_number(value) -> bool:
        """Return True for int/float values (bool is deliberately excluded)."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def _field(record: dict, key: str):
        """Return record[key], or 'N/A' when the key is missing, null or empty."""
        value = record.get(key)
        return "N/A" if value is None or value == "" else value

    def _money(value) -> str:
        """Format a dollar amount, or 'N/A' when the value is not numeric."""
        return f"${value:.2f}" if _is_number(value) else "N/A"

    def _vehicle_name(record: dict) -> str:
        """Return the 'year make model' display name of a vehicle."""
        return f"{_field(record, 'year')} {_field(record, 'make')} {_field(record, 'model')}"

    def _entries(record: dict, category: str) -> list:
        """Return the entries of one cost category; missing/null means none."""
        return record.get(category) or []

    def _sum_field(entries: list, field: str):
        """Sum one numeric field over the entries, counting missing/null as 0."""
        return sum((entry.get(field) or 0) for entry in entries)

    def _all_costs(record: dict):
        """Return the vehicle's total cost across every cost category."""
        return sum(_sum_field(_entries(record, cat), 'cost') for cat in cost_categories)

    def _miles_driven(record: dict):
        """Return miles driven during ownership, or None when it cannot be computed.

        A purchase odometer of 0 is a valid reading (vehicle bought new). A
        missing, null or zero sale odometer is treated as "not sold yet".
        """
        start = record.get('purchasedOdometer')
        end = record.get('soldOdometer')
        if not _is_number(start) or not _is_number(end) or not end:
            return None
        return end - start

    def _parse_date(text):
        """Parse a YYYY-MM-DD string; return None when it is missing or malformed."""
        try:
            return datetime.strptime(text, date_format)
        except (TypeError, ValueError):
            return None

    def search_vehicles(year: str = "", make: str = "", model: str = "", color: str = "") -> str:
        """Search for vehicles matching criteria. Returns vehicle keys and basic info.

        Each non-empty criterion must match the record's field exactly,
        ignoring case and surrounding whitespace. With no criteria, every
        vehicle is listed.
        """
        criteria = {'year': year, 'make': make, 'model': model, 'color': color}
        wanted = {
            field: str(text).strip().lower()
            for field, text in criteria.items()
            if text and str(text).strip()
        }

        lines = []
        for vkey, vehicle in vehicles_data.items():
            is_match = all(
                str(vehicle.get(field) or '').strip().lower() == text
                for field, text in wanted.items()
            )
            if is_match:
                lines.append(f"  - {_vehicle_name(vehicle)} ({_field(vehicle, 'color')}) - Key: {vkey}\n")

        if not lines:
            return f"No vehicles found matching: year={year}, make={make}, model={model}, color={color}"

        return f"Found {len(lines)} vehicle(s):\n" + "".join(lines)

    def get_vehicle_details(vehicle_key: str) -> str:
        """Get comprehensive details about a vehicle including all available data.

        Returns an error string when the key is unknown; otherwise a report
        with identity, ownership (purchase and, if sold, sale) and a cost
        summary per category plus the grand total.
        """
        if vehicle_key not in vehicles_data:
            return f"Error: Vehicle '{vehicle_key}' not found."

        v = vehicles_data[vehicle_key]
        sold_date = v.get('soldDate')

        lines = [
            "",
            "Vehicle Information:",
            f"  Year/Make/Model: {_vehicle_name(v)}",
            f"  Color: {_field(v, 'color')}",
            f"  VIN: {_field(v, 'vinNumber')}",
            f"  License Plate: {_field(v, 'licensePlateNumber')}",
            "",
            "Ownership:",
            f"  Purchased: {_field(v, 'purchasedDate')} for {_money(v.get('purchasedPrice'))}",
            f"  Odometer at purchase: {_field(v, 'purchasedOdometer')} miles",
            f"  Purchased from: {_field(v, 'dealerName')}",
        ]
        if sold_date:
            lines.append(f"  Sold: {sold_date} for {_money(v.get('soldPrice'))}")
            lines.append(f"  Odometer at sale: {_field(v, 'soldOdometer')} miles")
        else:
            # A missing or empty soldDate means the vehicle is still owned.
            lines.append("  Status: Currently owned (not sold)")

        lines += ["", "Cost Summary:"]
        total_all_costs = 0
        for cat in cost_categories:
            entries = _entries(v, cat)
            total = _sum_field(entries, 'cost')
            total_all_costs += total
            # Categories without entries are omitted from the summary.
            if entries:
                lines.append(f"  {cat.capitalize()}: {len(entries)} entries, ${total:.2f}")
        lines += ["", f"  TOTAL ALL COSTS: ${total_all_costs:.2f}"]

        return "\n".join(lines) + "\n"

    def query_vehicle_data(vehicle_key: str, data_type: str) -> str:
        """Query specific data about a vehicle.
        data_type can be: 'basic_info', 'ownership', 'costs', 'all_costs_by_category',
        'fuel_data', 'insurance_data', 'repairs_data', 'registration_data',
        'miscellaneous_data'
        """
        if vehicle_key not in vehicles_data:
            return f"Error: Vehicle '{vehicle_key}' not found."

        v = vehicles_data[vehicle_key]

        if data_type == 'basic_info':
            return f"{_vehicle_name(v)} ({_field(v, 'color')}), VIN: {_field(v, 'vinNumber')}"

        if data_type == 'ownership':
            purchased_text = v.get('purchasedDate')
            purchased = _parse_date(purchased_text)
            if purchased is None:
                return f"Ownership: purchase date is missing or not in YYYY-MM-DD format ({purchased_text!r})"

            sold_text = v.get('soldDate')
            if not sold_text:
                days_owned = (datetime.now() - purchased).days
                return f"Purchased: {purchased_text} | Currently owned for: {days_owned} days ({days_owned // 365} years)"

            sold = _parse_date(sold_text)
            if sold is None:
                return f"Purchased: {purchased_text} | Sold: {sold_text} | Owned for: unknown (sold date is not in YYYY-MM-DD format)"
            days_owned = (sold - purchased).days
            # Report days as well as whole years so short ownerships are not shown as "0 years".
            return f"Purchased: {purchased_text} | Sold: {sold_text} | Owned for: {days_owned} days ({days_owned // 365} years)"

        if data_type == 'costs':
            return f"Total costs: ${_all_costs(v):.2f}"

        if data_type == 'all_costs_by_category':
            result = f"Cost breakdown for {_vehicle_name(v)}:\n"
            total_all = 0
            for cat in cost_categories:
                entries = _entries(v, cat)
                total = _sum_field(entries, 'cost')
                total_all += total
                result += f"  {cat.capitalize()}: ${total:.2f} ({len(entries)} entries)\n"
            result += f"  TOTAL: ${total_all:.2f}"
            return result

        if data_type == 'fuel_data':
            fuel_entries = _entries(v, 'fuel')
            if not fuel_entries:
                return "No fuel data available"
            total_cost = _sum_field(fuel_entries, 'cost')
            total_gallons = _sum_field(fuel_entries, 'gallons')

            result = f"Fuel data: {len(fuel_entries)} fill-ups, ${total_cost:.2f} total, {total_gallons:.1f} gallons\n"
            # MPG needs both odometer readings, so it is only reported for sold vehicles.
            miles = _miles_driven(v)
            if miles is not None:
                mpg = miles / total_gallons if total_gallons > 0 else 0
                result += f"  Average MPG: {mpg:.1f} ({miles:,} miles / {total_gallons:.1f} gallons)"
            return result

        if data_type in per_category_types:
            cat = per_category_types[data_type]
            entries = _entries(v, cat)
            total = _sum_field(entries, 'cost')
            return f"{cat.capitalize()}: {len(entries)} entries, ${total:.2f} total"

        return f"Unknown data_type: {data_type}. Valid options: {valid_data_types}"

    def analyze_all_vehicles() -> str:
        """Get analysis across all vehicles - total costs, average costs, counts, etc.

        The mileage ranking only includes vehicles for which miles driven can
        be computed (both purchase and sale odometer readings present).
        """
        result = "\n=== All Vehicles Analysis ===\n\n"

        total_vehicles = len(vehicles_data)
        if total_vehicles == 0:
            # Avoid dividing by zero when computing the per-vehicle average.
            return result + "No vehicles on record.\n"

        total_all_costs = 0
        miles_by_vehicle = []

        for vehicle in vehicles_data.values():
            total_cost = _all_costs(vehicle)
            total_all_costs += total_cost

            miles = _miles_driven(vehicle)
            if miles is not None:
                miles_by_vehicle.append({
                    'name': _vehicle_name(vehicle),
                    'miles': miles,
                    'cost': total_cost,
                    'cost_per_mile': total_cost / miles if miles > 0 else 0,
                })

        # Rank by miles, most driven first
        miles_by_vehicle.sort(key=lambda entry: entry['miles'], reverse=True)

        result += f"Total vehicles: {total_vehicles}\n"
        result += f"Total costs across all vehicles: ${total_all_costs:.2f}\n"
        result += f"Average cost per vehicle: ${total_all_costs / total_vehicles:.2f}\n\n"

        result += "Top 5 Most Driven Vehicles:\n"
        for ranked in miles_by_vehicle[:5]:
            result += f"  {ranked['name']}: {ranked['miles']:,} miles (${ranked['cost_per_mile']:.2f}/mile)\n"

        return result

    # Create tools
    tools = [
        StructuredTool.from_function(
            func=search_vehicles,
            name="search_vehicles",
            description="Search for vehicles by year, make, model, or color. Returns matching vehicle keys needed for detailed queries."
        ),
        StructuredTool.from_function(
            func=get_vehicle_details,
            name="get_vehicle_details",
            description="Get comprehensive details about a specific vehicle including ownership, costs, and status."
        ),
        StructuredTool.from_function(
            func=query_vehicle_data,
            name="query_vehicle_data",
            description=f"Query specific data about a vehicle. data_type options: {valid_data_types}"
        ),
        StructuredTool.from_function(
            func=analyze_all_vehicles,
            name="analyze_all_vehicles",
            description="Get analysis across all vehicles including totals, averages, and rankings. No parameters needed."
        ),
    ]

    return tools

# Build the tool list once; the same list is later bound to the LLM and
# handed to the ToolNode, so both see identical tool definitions.
tools = get_analysis_tools()
print(f"Created {len(tools)} analysis tools")

## 4. Setup Memory and State

In [None]:
# Setup SQLite for memory
# Checkpoints are persisted to this file (path is relative to the working directory).
db_path = "memory.db"
# check_same_thread=False lets the connection be used from threads other than
# the one that created it.
# NOTE(review): presumably needed because graph/Gradio callbacks may run on
# worker threads — confirm.
conn = sqlite3.connect(db_path, check_same_thread=False)
# Checkpointer passed to graph_builder.compile() so graph state is saved via this connection.
sql_memory = SqliteSaver(conn)

# Define state
class State(TypedDict):
    # Conversation history shared by all graph nodes. The add_messages reducer
    # in the annotation tells LangGraph how to combine the "messages" a node
    # returns with the list already in state.
    # NOTE(review): presumably it appends/merges rather than replaces — confirm
    # against the LangGraph docs.
    messages: Annotated[List[Any], add_messages]

# Builder on which the nodes and edges are registered before the graph is compiled.
graph_builder = StateGraph(State)

## 5. Define System Prompt and Agent

In [None]:
# System prompt for the QA agent. qa_agent prepends it as a SystemMessage
# whenever the conversation does not already contain one. The tool guidance
# below must stay in sync with what the tools actually return.
qa_system_message = """
You are an AutoLog Question-Answering Agent. Your role is to help users understand their vehicle ownership history and associated costs by answering questions accurately based on the available data.

TOOLS AVAILABLE:
1. search_vehicles: Find vehicles by year, make, model, or color - returns vehicle keys (call it with no filters to list every vehicle)
2. get_vehicle_details: Get comprehensive details for a specific vehicle
3. query_vehicle_data: Get one data type for a vehicle. data_type must be one of: basic_info, ownership, costs, all_costs_by_category, fuel_data, insurance_data, repairs_data, registration_data
4. analyze_all_vehicles: Get cross-vehicle totals, the average cost per vehicle, and a mileage / cost-per-mile ranking

IMPORTANT PRINCIPLES:
1. Use tools primarily for DATA RETRIEVAL, not for reasoning
2. Once you have the data, USE YOUR KNOWLEDGE AND REASONING to answer the question
3. Combine data from multiple tool calls to synthesize answers when needed
4. Avoid calling tools multiple times to reach an answer you could reason to directly
5. Be concise - get the data, then answer clearly without over-explaining

WORKFLOW FOR ANSWERING QUESTIONS:
- Specific vehicle question? → search_vehicles (if needed) → get_vehicle_details or query_vehicle_data
- Question about one aspect? → query_vehicle_data with specific data_type
- Comparative question? → analyze_all_vehicles or query multiple vehicles then reason about them
- Any question? → Think about what data you need, get it, then REASON to the answer

EXAMPLES:
- "Which car had the most repairs?" → search_vehicles to list every vehicle, query_vehicle_data with repairs_data (or all_costs_by_category) for each, then compare
- "Should I keep my Mazda?" → get_vehicle_details for Mazda, reason about costs/age/mileage, provide thoughtful analysis
- "How much have I spent on insurance?" → query_vehicle_data with insurance_data for each relevant vehicle, then add up and summarize
- "What's my most economical vehicle?" → analyze_all_vehicles for cost-per-mile data, reason about efficiency

KEY FACTS ABOUT THE DATA:
- Some vehicles are still owned (no soldDate/soldPrice/soldOdometer)
- Cost data includes: insurance, registration, repairs, fuel, miscellaneous
- Each cost entry has a date and amount
- Odometer readings available for all vehicles at purchase, and at sale (if sold)
- Mileage, MPG and cost-per-mile figures need both odometer readings, so the tools report them only for sold vehicles

Answer questions directly and naturally. Don't describe what tools you're calling - just answer.
Provide specific numbers and analysis, not just tool outputs.
"""

## 6. Build LangGraph

In [None]:
# Create the QA agent
# Chat model that the qa_agent node invokes.
llm = ChatOpenAI(model="gpt-4o-mini")
# Attach the analysis tools so the model's replies can carry tool calls,
# which route_agent inspects to decide whether the tools node runs.
llm_with_tools = llm.bind_tools(tools)

def qa_agent(state: State):
    """Run one LLM turn over the conversation held in ``state``.

    Dict-style messages (``{"role": ..., "content": ...}``) are converted to
    LangChain message objects; anything that is not a dict is passed through
    unchanged. Dicts with a role other than system/user/assistant/tool are
    left out. If the conversation has no SystemMessage, the QA system prompt
    is placed first.

    Returns:
        ``{"messages": [response]}`` where ``response`` is the model's reply.
    """

    def _from_dict(raw):
        """Build the message object for one dict entry, or None for an unknown role."""
        role = raw["role"]
        if role == "system":
            return SystemMessage(content=raw["content"])
        if role == "user":
            return HumanMessage(content=raw["content"])
        if role == "assistant":
            text = raw.get("content", "")
            calls = raw.get("tool_calls")
            # Pass tool_calls through only when present and non-empty.
            if calls:
                return AIMessage(content=text, tool_calls=calls)
            return AIMessage(content=text)
        if role == "tool":
            return ToolMessage(content=raw["content"], tool_call_id=raw.get("tool_call_id"))
        return None

    conversation = []
    for item in state["messages"]:
        if not isinstance(item, dict):
            # Already a LangChain message object.
            conversation.append(item)
            continue
        converted = _from_dict(item)
        if converted is not None:
            conversation.append(converted)

    # The system prompt goes first unless some SystemMessage is already present.
    if not any(isinstance(entry, SystemMessage) for entry in conversation):
        conversation.insert(0, SystemMessage(content=qa_system_message))

    reply = llm_with_tools.invoke(conversation)
    return {"messages": [reply]}

# Add nodes
# "qa_agent" runs the LLM turn; "tools" is a prebuilt ToolNode built from the
# same tool list that was bound to the LLM.
graph_builder.add_node("qa_agent", qa_agent)
graph_builder.add_node("tools", ToolNode(tools=tools))

# Define routing
def route_agent(state: State):
    """Choose the next step after the agent node has run.

    Returns "tools" when the newest message has a non-empty ``tool_calls``
    attribute, and "END" otherwise (including when the attribute is absent).
    """
    newest = state["messages"][-1]
    pending_calls = getattr(newest, "tool_calls", None)
    return "tools" if pending_calls else "END"

# Add edges
# Every run starts at the agent node.
graph_builder.add_edge(START, "qa_agent")
# After each agent turn route_agent returns the label "tools" or "END";
# the mapping below translates that label into the next node.
graph_builder.add_conditional_edges(
    "qa_agent",
    route_agent,
    {"tools": "tools", "END": END}
)
# Tool results always go back to the agent, forming the agent <-> tools loop.
graph_builder.add_edge("tools", "qa_agent")

# Compile with the SQLite checkpointer. No recursion limit is set here; the
# caller supplies one per invocation through the config passed to graph.invoke.
graph = graph_builder.compile(checkpointer=sql_memory)
# Render the graph as a PNG in the notebook (display is provided by IPython).
# NOTE(review): draw_mermaid_png may need network access to render — confirm.
display(Image(graph.get_graph().draw_mermaid_png()))

## 7. Test with Gradio Interface

In [None]:
# One conversation thread per notebook session: the checkpointer stores state
# under this thread_id. It is converted to str so the id is a plain,
# serialisable value rather than a UUID object.
config = {"configurable": {"thread_id": str(uuid7())}}

with gradio.Blocks() as demo:
    chatbot = gradio.Chatbot(type="messages")
    user_input_box = gradio.Textbox(label="Ask about your vehicles...")
    submit_btn = gradio.Button("Send")

    def chat_handler(message, history):
        """Send the user's message through the graph and update the chat.

        Args:
            message: Text currently in the input box.
            history: Messages already shown in the chatbot (list of
                role/content dicts); may be None or empty.

        Returns:
            ``(updated_history, "")`` - the new chatbot content and an empty
            string that clears the input box.
        """
        history = history or []

        # Don't spend a model call (or pollute the saved thread) on blank input.
        if not message or not message.strip():
            return history, ""

        # Only the new user message is sent; earlier turns come from the
        # checkpointer. The recursion limit bounds the agent <-> tools loop
        # while still allowing multi-step reasoning.
        result = graph.invoke(
            {"messages": [{"role": "user", "content": message}]},
            config={**config, "recursion_limit": 25},
        )
        messages = result["messages"]

        # Tool messages are internal details: show the newest non-tool message,
        # falling back to the very last message if there is none.
        final = next(
            (m for m in reversed(messages) if not isinstance(m, ToolMessage)),
            messages[-1],
        )

        # Build the assistant message dict in the format the Chatbot expects.
        if isinstance(final, dict):
            assistant_message = final
        elif isinstance(final, str):
            assistant_message = {"role": "assistant", "content": final}
        elif hasattr(final, "content"):
            # Message objects: show the text only; tool_calls are not displayed.
            assistant_message = {"role": "assistant", "content": final.content or ""}
        else:
            assistant_message = {"role": "assistant", "content": str(final)}

        updated_history = history + [{"role": "user", "content": message}, assistant_message]
        return updated_history, ""

    # The button and the Enter key trigger the same handler; both refresh the
    # chat and clear the input box.
    handler_inputs = [user_input_box, chatbot]
    handler_outputs = [chatbot, user_input_box]
    submit_btn.click(chat_handler, handler_inputs, handler_outputs)
    user_input_box.submit(chat_handler, handler_inputs, handler_outputs)

demo.launch()