In [1]:
# Load libraries

import os
import json
import sys
from datetime import datetime
import logging

# Configure logging to show in console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),  # This outputs to console
    ]
)

# Reload modules automatically when they change
%load_ext autoreload
%autoreload 2

In [2]:
cd ..

/Users/linafaik/Documents/projects/agentic-investor-brief


In [3]:
from agent_investor_brief.config import setup_environment
from agent_investor_brief.agents.investor_agent import InvestorAgent
from agent_investor_brief.agents.qa_agent import CompanyQAAgent
from agent_investor_brief.tools.industry_research import IndustryResearchTool
from agent_investor_brief.tools.financial_data import FinancialDataTool

2025-10-03 21:11:15,037 - root - INFO - Environment setup completed
2025-10-03 21:11:15,037 - root - INFO - MLflow tracking: sqlite:///mlflow.db
2025-10-03 21:11:15,038 - root - INFO - Default model: gpt-5-nano
2025-10-03 21:11:15,038 - root - INFO - Output directory: outputs


### QA Agent

In [4]:
agent = CompanyQAAgent(model_name="gpt-5")

test_cases = [
    {
        "company": "AAPL",
        "question": "What's Apple's current stock price?",
        "expected_tool": "financial"
    },
    {
        "company": "TSLA", 
        "question": "How is Tesla positioned in the electric vehicle market?",
        "expected_tool": "industry"
    },
    {
        "company": "MSFT",
        "question": "What are Microsoft's revenue and profit margins?",
        "expected_tool": "financial"
    },
    {
        "company": "GOOGL",
        "question": "What are Google's main business segments?",
        "expected_tool": "industry"
    }
]

for i, test_case in enumerate(test_cases, 1):
    print(f"\nTest {i}: {test_case['company']}")
    print(f"Question: {test_case['question']}")
    
    result = agent.ask_question(test_case['question'], test_case['company'])
    
    print(f"Success: {result['success']}")
    print(f"Tools used: {result.get('tools_used', [])}")
    print(f"Answer preview: {result['answer']}")
    
    # Basic validation
    assert result['success'], f"Test {i} failed"
    assert len(result['answer']) > 10, f"Test {i} has very short answer"
    
    print("✓ Passed")

2025/10/03 21:11:17 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/10/03 21:11:17 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.



Test 1: AAPL
Question: What's Apple's current stock price?
Success: True
Tools used: ['financial']
Answer preview: Apple (AAPL) is trading at $258.16, up 0.40% on the day. Last updated: 2025-10-03 21:11 UTC.
✓ Passed

Test 2: TSLA
Question: How is Tesla positioned in the electric vehicle market?
Success: True
Tools used: ['financial', 'industry']
Answer preview: Short answer: Tesla remains the EV market’s scale and brand leader, but its dominance is narrowing as competition intensifies and margins compress.

Position today
- Market leadership (U.S.): Estimated 43.1% EV market share through September 2025, down from ~49% at the end of last year, as more rival models launch. U.S. EVs reached ~10.5% of auto sales in Q3, indicating a growing but more competitive market.
- Global scale: Deliveries were about 1.79 million in 2024 (Morningstar), putting Tesla among the top global EV makers; competition is fiercest in China from players like BYD, XPeng, and NIO.
- Financial strength and press

### Evaluation

In [5]:
import pandas as pd
import mlflow
from mlflow.genai import scorer
from mlflow.genai.scorers import Correctness, Guidelines

# Questions list
evaluation_questions = [
    {
        "ticker": "AAPL",
        "question": "What is Apple's current P/E ratio and how does it compare to the tech sector average?",
        "expected_tool": "financial"
    },
    {
        "ticker": "TSLA", 
        "question": "What is Tesla's market share in the global electric vehicle market and who are its main competitors?",
        "expected_tool": "industry"
    },
    {
        "ticker": "MSFT",
        "question": "What are Microsoft's quarterly revenue growth rates for the past 4 quarters?",
        "expected_tool": "financial"
    },
    {
        "ticker": "NVDA",
        "question": "What are NVIDIA's gross margins and how have they changed due to AI demand?",
        "expected_tool": "financial"
    },
    {
        "ticker": "AMZN",
        "question": "What are Amazon's main revenue streams and which segment generates the highest profit margins?",
        "expected_tool": "industry"
    }
]

# Step 1: Generate ground truth answers 
def generate_ground_truth(model_name="gpt-5"):
    agent = CompanyQAAgent(model_name=model_name)
    ground_truth = []
    
    for q in evaluation_questions:
        result = agent.ask_question(q["question"], q["ticker"])
        ground_truth.append({
            "ticker": q["ticker"],
            "question": q["question"],
            "ground_truth": result["answer"]
        })
    
    return pd.DataFrame(ground_truth)

@scorer
def is_concise(outputs: str) -> bool:
    """Evaluate if the answer is concise """
    return len(outputs.split()) <= 500 and len(outputs.split()) >= 100

# Step 3: Define scorers
scorers = [
    Correctness(),
    is_concise,
    Guidelines(name="is_professional", guidelines="The answer tone must be in professional."),
]

# Step 4: Prediction function - simplified to match MLflow's expectations
def qa_predict_fn(inputs, current_model):
    """
    MLflow passes 'inputs' and any other keys from the 'inputs' dict as separate arguments
    
    Args:
        inputs: The question and ticker combined string
        current_model: The model name to use
    """
    # Parse the input string to extract question and ticker
    # Assuming format: "question Ticker : ticker_symbol"
    parts = inputs.split(" Ticker : ")
    question = parts[0]
    ticker = parts[1] if len(parts) > 1 else ""
    
    agent = CompanyQAAgent(model_name=current_model)
    result = agent.ask_question(question, ticker)
    
    return result["answer"] 

In [6]:
print("Generating ground truth...")
ground_truth_df = generate_ground_truth()

Generating ground truth...


In [7]:
# Models to test
models = ["gpt-5-nano", "gpt-5-mini", "gpt-4.1"]

for model in models:
    print(f"\nEvaluating {model}...")
    
    # Prepare evaluation data

    # Prepare evaluation data in MLflow's expected format
    eval_data = []
    for _, row in ground_truth_df.iterrows():
        eval_data.append({
            "inputs": {
                "inputs": row["question"]+" Ticker : "+row["ticker"],
                "current_model": model
            },
            "expectations": {
                "expected_response": row["ground_truth"]
            },
        })
    
    # Run evaluation
    with mlflow.start_run(run_name=f"evaluation_{model}"):
        results = mlflow.genai.evaluate(
            data=eval_data,
            predict_fn=qa_predict_fn,
            scorers=scorers,
        )
        
        print(f"Evaluation completed for {model}")
        print(f"Results: {results.metrics}")
    
    print(f"Evaluation completed for {model}")

2025/10/03 21:18:57 INFO mlflow.models.evaluation.utils.trace: Auto tracing is temporarily enabled during the model evaluation for computing some metrics and debugging. To disable tracing, call `mlflow.autolog(disable=True)`.
2025/10/03 21:18:57 INFO mlflow.genai.utils.data_validation: Testing model prediction with the first sample in the dataset.
ERROR [root] Error in ask_question: Run with UUID c2bdc64f95254e389b9064ef0a56d357 is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True



Evaluating gpt-5-nano...


Evaluating:   0%|          | 0/5 [Elapsed: 00:00, Remaining: ?] 

2025/10/03 21:19:25 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:19:25 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:19:25 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:19:25 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:19:25 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:19:25 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:19:25 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:19:25 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:19:25 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:19:25 E


✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mevaluation_gpt-5-nano[0m
  Run ID: [94mc2bdc64f95254e389b9064ef0a56d357[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

Evaluation completed for gpt-5-nano
Results: {'is_concise/mean': np.float64(0.8)}
Evaluation completed for gpt-5-nano

Evaluating gpt-5-mini...


Evaluating:   0%|          | 0/5 [Elapsed: 00:00, Remaining: ?] 

2025/10/03 21:20:01 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:20:01 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:20:01 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:20:01 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:20:01 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:20:01 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:20:01 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:20:01 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:20:01 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:20:01 E


✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mevaluation_gpt-5-mini[0m
  Run ID: [94m273ddd76f6a344ea9e7ddc4188b1aa27[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

Evaluation completed for gpt-5-mini
Results: {'is_concise/mean': np.float64(0.4)}
Evaluation completed for gpt-5-mini

Evaluating gpt-4.1...


Evaluating:   0%|          | 0/5 [Elapsed: 00:00, Remaining: ?] 

2025/10/03 21:20:30 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:20:30 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:20:30 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:20:30 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:20:30 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:20:30 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:20:30 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:20:30 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for is_professional: None
2025/10/03 21:20:30 ERROR mlflow.genai.scorers.aggregation: Invalid assessment value for correctness: None
2025/10/03 21:20:30 E


✨ Evaluation completed.

Metrics and evaluation results are logged to the MLflow run:
  Run name: [94mevaluation_gpt-4.1[0m
  Run ID: [94m57a1068edc7c44fb8f86edf475170255[0m

To view the detailed evaluation results with sample-wise scores,
open the [93m[1mTraces[0m tab in the Run page in the MLflow UI.

Evaluation completed for gpt-4.1
Results: {'is_concise/mean': np.float64(0.8)}
Evaluation completed for gpt-4.1
