In [1]:
%pip install pydantic_evals llm cachier pandas




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [56]:
from io import StringIO
from typing import Any, Dict
from pydantic_evals import Dataset
import llm
from pydantic_evals.reporting import EvaluationReport
from pydantic_evals.dataset import set_eval_attribute
from rich.console import Console
from cachier import cachier
import pandas as pd
import nest_asyncio
nest_asyncio.apply()

In [3]:
@cachier(cache_dir="./cached-llm-responses", separate_files=True)
def run_llm(model_name: str, prompt: str) -> str:
    """Send `prompt` to the `llm` model called `model_name` and return its reply text.

    The function is wrapped with `cachier`, configured to use the
    ``./cached-llm-responses`` directory with ``separate_files=True`` —
    presumably so that repeating a (model_name, prompt) pair reuses the stored
    response instead of calling the model again; confirm against cachier's docs.

    Parameters:
    - model_name: identifier passed to ``llm.get_model``
    - prompt: full prompt text passed to the model

    Returns:
    - the response text as a string
    """
    return llm.get_model(model_name).prompt(prompt).text()

In [66]:
# Model under evaluation and the test cases it is evaluated on.
model_name = "llama-3.2-3b-4bit"
dataset = Dataset[Any, Any, Any].from_file("tests.json")

# Tag every case with the model name so it ends up in the case metadata;
# cases without metadata get a fresh dict first.
for case in dataset.cases:
    if not case.metadata:
        case.metadata = {}
    case.metadata['model'] = model_name

# Progress state read and updated by _run_llm (module-level on purpose:
# _run_llm declares `i` as global).
i = 0
n_cases = len(dataset.cases)
async def _run_llm(input: Dict) -> str:
    """Evaluation task: ask the model for the committee name in one email.

    Builds the prompt from the ``name``, ``email``, ``subject`` and ``body``
    entries of `input` and returns whatever ``run_llm`` returns for it.

    Side effects: records the model name as an eval attribute, prints a
    progress line, and increments the module-level counter ``i``.
    """
    global i
    set_eval_attribute('model', model_name)
    # NOTE(review): the counter is printed before it is incremented, so the
    # progress lines run from 0 to n_cases - 1.
    print(f'Evaluating {i} of {n_cases}')
    i += 1
    prompt = f"""
    Given this political fundraising email, respond with the name of the committee in the disclaimer that begins with Paid for by but does not include Paid for by, the committee address or the treasurer name. If no committee is present, return "None". Do not include any other text, no yapping.
    name: {input['name']}
    email: {input['email']}
    subject: {input['subject']}
    body: {input['body']}
    """
    # run_llm is a plain (non-async) function, so this call is made directly.
    return run_llm(model_name, prompt)

# Evaluate the dataset's cases with _run_llm as the task; the run is named after the model.
report = dataset.evaluate_sync(_run_llm, name=model_name)

Evaluating 0 of 2
Evaluating 1 of 2


In [68]:
# Spot-check the first case: the 'model' attribute set via set_eval_attribute in _run_llm should show up here.
report.cases[0].attributes

{'model': 'llama-3.2-3b-4bit'}

In [58]:
def expand_dict_columns(df, columns_to_expand):
    """
    Expand dictionary columns in a DataFrame into individual columns with dotted notation.

    Each listed column that holds at least one dict is replaced, at its original
    position, by one column per key named "<column>.<key>". Rows whose value is
    not a dict, or whose dict lacks the key, get a missing value for that key.
    The input DataFrame itself is not modified.

    Parameters:
    - df: pandas DataFrame
    - columns_to_expand: list of column names containing dictionaries to expand.
      Names that are absent from the frame or hold no dicts are skipped, and a
      name listed more than once is processed only once.

    Returns:
    - DataFrame with expanded columns and original dictionary columns removed

    Note: if a generated "<column>.<key>" name equals an existing column name,
    that column is overwritten with the expanded values and appears once in the
    result.
    """
    # Work on a copy to avoid modifying the caller's DataFrame
    result_df = df.copy()

    # Running column order; expanded columns are spliced in where the source column was
    column_order = list(result_df.columns)
    columns_to_drop = []

    # dict.fromkeys de-duplicates while keeping order. Without this, a repeated
    # name would still be found in result_df (the source column is only dropped
    # at the end) but no longer in column_order, and .index() would raise.
    for col in dict.fromkeys(columns_to_expand):
        if col not in result_df.columns:
            continue

        values = result_df[col]

        # Only expand columns that actually contain dictionaries
        if not values.apply(lambda x: isinstance(x, dict)).any():
            continue
        columns_to_drop.append(col)

        # Collect the union of keys across all dict values
        all_keys = set()
        for value in values:
            if isinstance(value, dict):
                all_keys.update(value)

        try:
            ordered_keys = sorted(all_keys)
        except TypeError:
            # Keys of mixed, non-comparable types (e.g. int and str):
            # fall back to a deterministic order by type name, then text.
            ordered_keys = sorted(all_keys, key=lambda k: (type(k).__name__, str(k)))

        # One new column per key
        new_columns = []
        for key in ordered_keys:
            new_col_name = f"{col}.{key}"
            result_df[new_col_name] = values.apply(
                lambda x, key=key: x.get(key) if isinstance(x, dict) else None
            )
            new_columns.append(new_col_name)

        # Replace the source column with its expanded columns in the order list
        position = column_order.index(col)
        column_order[position:position + 1] = new_columns

    # Drop the original dictionary columns
    result_df = result_df.drop(columns=columns_to_drop)

    # Reorder to the tracked positions; de-duplicate so a generated name that
    # collided with an existing column is not selected twice.
    final_columns = [c for c in dict.fromkeys(column_order) if c in result_df.columns]
    return result_df[final_columns]

In [59]:
def report_to_df(report: EvaluationReport) -> pd.DataFrame:
    """Flatten an evaluation report into one DataFrame row per case.

    Each row carries the case's inputs, metadata, expected output and actual
    output, the share of assertions whose value is truthy
    (``assertions_passed_rate``; 1.0 when a case has no assertions), and one
    ``assertion.<name>`` column per assertion. The ``input`` and ``metadata``
    columns are then expanded into dotted columns via ``expand_dict_columns``.
    """
    rows = []
    for case in report.cases:
        assertions = case.assertions
        n_total = len(assertions)
        n_passed = sum(1 for outcome in assertions.values() if outcome.value)

        row = {
            "input": case.inputs,
            "metadata": case.metadata,
            "expected_output": case.expected_output,
            "output": case.output,
            # Not exported yet:
            #"scores": {k: v.value for k, v in case.scores.items()},
            #"labels": {k: v.value for k, v in case.labels.items()},
            #"metrics": case.metrics,
            "assertions_passed_rate": n_passed / n_total if n_total > 0 else 1.0,
        }
        # One column per individual assertion
        row.update(
            {f"assertion.{name}": outcome.value for name, outcome in assertions.items()}
        )
        rows.append(row)

    return expand_dict_columns(pd.DataFrame(rows), ['input', 'metadata'])

# Flatten the evaluation report into a DataFrame and display it.
df = report_to_df(report)
df

Unnamed: 0,input.body,input.date,input.day,input.disclaimer,input.domain,input.email,input.hour,input.minute,input.month,input.name,input.party,input.subject,input.year,metadata.model,expected_output,output,assertions_passed_rate,assertion.EqualsExpected
0,Here’s why it doesn’t surprise me that Biden s...,11/1/24 0:00,1,True,newsletter.conservativedirect.com,newsletter@newsletter.conservativedirect.com,0,0,11,Ryan Walker,R,“Garbage” That’s what Joe Biden just called MI...,2024,llama-3.2-3b-4bit,Heritage Action for America,Heritage Action for America,1.0,True
1,Click here to stop receiving emails about Tris...,11/1/24 0:02,1,True,adamforcolorado.com,info@adamforcolorado.com,0,2,11,Trisha Calvarese,D,$5 to defeat Boebert,2024,llama-3.2-3b-4bit,Adam for Colorado,Adam for Colorado,1.0,True


In [54]:
# Load the accumulated results from disk and display them.
# NOTE(review): no visible cell writes ./results/results.csv — confirm where
# this file is produced so the notebook can be re-run from a fresh kernel.
overall_df = pd.read_csv("./results/results.csv")
overall_df

Unnamed: 0,input.body,input.date,input.day,input.disclaimer,input.domain,input.email,input.hour,input.minute,input.month,input.name,input.party,input.subject,input.year,metadata.model,expected_output,output,assertions_passed_rate,assertion.EqualsExpected
0,Here’s why it doesn’t surprise me that Biden s...,11/1/24 0:00,1,True,newsletter.conservativedirect.com,newsletter@newsletter.conservativedirect.com,0,0,11,Ryan Walker,R,“Garbage” That’s what Joe Biden just called MI...,2024,llama-3.2-3b-4bit,Heritage Action for America,HeritageAction for America\n,0.0,False
1,Click here to stop receiving emails about Tris...,11/1/24 0:02,1,True,adamforcolorado.com,info@adamforcolorado.com,0,2,11,Trisha Calvarese,D,$5 to defeat Boebert,2024,llama-3.2-3b-4bit,Adam for Colorado,Adam for Colorado\n,0.0,False
2,Here’s why it doesn’t surprise me that Biden s...,11/1/24 0:00,1,True,newsletter.conservativedirect.com,newsletter@newsletter.conservativedirect.com,0,0,11,Ryan Walker,R,“Garbage” That’s what Joe Biden just called MI...,2024,llama-3.2-3b-4bit,Heritage Action for America,Heritage Action for America,1.0,True
3,Click here to stop receiving emails about Tris...,11/1/24 0:02,1,True,adamforcolorado.com,info@adamforcolorado.com,0,2,11,Trisha Calvarese,D,$5 to defeat Boebert,2024,llama-3.2-3b-4bit,Adam for Colorado,Adam for Colorado,1.0,True


In [55]:
def aggregate(df):
    """Summarise the EqualsExpected assertion per model.

    Groups `df` by ``metadata.model`` and returns a DataFrame indexed by model
    with three columns:
    - total_count: number of non-null ``assertion.EqualsExpected`` values
    - equals_expected_true: sum of ``assertion.EqualsExpected`` (count of True)
    - equals_expected_rate: equals_expected_true / total_count
    """
    target = 'assertion.EqualsExpected'
    summary = df.groupby('metadata.model').agg(
        total_count=pd.NamedAgg(column=target, aggfunc='count'),
        equals_expected_true=pd.NamedAgg(column=target, aggfunc=lambda values: values.sum()),
    )

    # Share of cases whose output matched the expected output
    rate = summary['equals_expected_true'] / summary['total_count']
    return summary.assign(equals_expected_rate=rate)

# Per-model summary of the results loaded from the CSV.
aggregate(overall_df)

Unnamed: 0_level_0,total_count,equals_expected_true,equals_expected_rate
metadata.model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
llama-3.2-3b-4bit,4,2,0.5
