## Experiment Structure

In [None]:
# Load the .env file, then read the three provider API keys from the environment.
import os
from dotenv import load_dotenv

load_dotenv()

# NOTE(review): the Gemini/Anthropic variable names are spelled "GIMINI" and
# "ANTROPIC" here; they have to match the keys in .env exactly — confirm
# before renaming anything. A variable that is not set comes back as None.
OPENAI_API_KEY, GIMINI_API_KEY, ANTROPIC_API_KEY = map(
    os.getenv,
    ("OPENAI_API_KEY", "GIMINI_API_KEY", "ANTROPIC_API_KEY"),
)

In [None]:
from openai import OpenAI
from anthropic import Anthropic
from google import genai
from google.genai import types
import outlines
import instructor
from ollama import Client
import openai


# --- Provider clients -------------------------------------------------------
# One raw SDK client per provider, each fed the key read from the environment.
openai_client = OpenAI(api_key=OPENAI_API_KEY)
gimini_client = genai.Client(api_key=GIMINI_API_KEY)
antropic_client = Anthropic(api_key=ANTROPIC_API_KEY)
ollama_client = Client()  # Ollama client built with its default arguments

# --- Structured-output wrappers ---------------------------------------------
chatgpt = outlines.from_openai(openai_client, "gpt-4o")
gemini = outlines.from_gemini(gimini_client, "gemini-2.5-flash-lite")
# Claude is wrapped with instructor instead: per the original author it does
# not work with outlines yet.
claude = instructor.from_anthropic(antropic_client)

# Ollama-served model wrapped through outlines.
model = outlines.from_ollama(ollama_client, "smollm2:1.7b")

# start experiment code

In [None]:
from src import (
    JobEnrichment,
    ExtractionQuality,
    ExperimentConfig,
    extract_with_model,
    extract_with_cloud_model,
    EXTRACTION_PROMPT_DIRECT,
    EXTRACTION_PROMPT_EXAMPLES,
    create_judge_prompt,
    judge_extraction,
    ExperimentRunner,
    analyze_results
)
import pandas as pd
from datetime import datetime

## Extraction System

In [None]:
# Show the first 200 characters of each extraction prompt, separated by a rule.
print(
    "Direct Prompt Preview:",
    EXTRACTION_PROMPT_DIRECT[:200] + "...",
    "\n" + "=" * 50 + "\n",
    "Examples Prompt Preview:",
    EXTRACTION_PROMPT_EXAMPLES[:200] + "...",
    sep="\n",
)

In [None]:
# Confirm the src/ imports and list the field names declared on JobEnrichment.
print("Schemas and extraction functions imported from src/")
print("JobEnrichment fields:", [*JobEnrichment.model_fields.keys()])

## Judge System

In [None]:
# The judge helpers, create_judge_prompt() and judge_extraction(), are
# imported from src rather than defined in this notebook.
print(
    "Judge evaluation functions imported from src/",
    "ExtractionQuality schema with 4 scores + detailed feedback",
    sep="\n",
)

## Experiment Runner

In [None]:
# Reminder of where the runner and the analysis helper come from.
print(
    "ExperimentRunner class imported from src/runner.py",
    "analyze_results() function imported from src/runner.py",
    sep="\n",
)

## Run the Experiment

In [None]:
import pandas as pd

# Read the job sample, then rebuild jobs_df as up to the first 15 English
# postings followed by up to the first 15 French ones, re-indexed from 0.
jobs_df = pd.read_csv("jobs_sample.csv")

eng = jobs_df.loc[jobs_df["description_language"].eq("English")].head(15)
fr = jobs_df.loc[jobs_df["description_language"].eq("French")].head(15)

jobs_df = pd.concat([eng, fr], ignore_index=True)


In [47]:
# Turn the frame into a list with one {column: value} dict per row,
# then display the first record as a sanity check.
jobs_list = jobs_df.to_dict("records")
jobs_list[0]


{'title': 'Senior Machine Learning Engineer, Ads R&D',
 'company': 'Spotify',
 'location': 'New York, NY',
 'description': "Engineering\nMachine Learning\nPermanent\nNew York\nOur mission on the Advertising Product & Technology team is to build a next generation advertising platform that aligns with our unique value proposition for audio and video. We work to scale the user experience for hundreds of millions of fans and hundreds of thousands of advertisers. This scale brings unique challenges as well as tremendous opportunities for our artists and creators.\n\nWe are seeking a Senior Machine Learning Engineer to join the Ad Engagement squad. Ad Engagement focuses on using machine learning to accurately predict how Spotify listeners will react to ads, helping advertisers minimize their costs while delivering a more relevant and enjoyable ad experience for listeners. Our core innovations include Multi-Task Learning models (MTL), and we are expanding into scalable sequence modeling with 

In [49]:
# Structured-output clients keyed by model name. The cloud/judge names listed
# in ExperimentConfig below are spelled identically to these keys.
JUDGES = {
    "gpt-4o": chatgpt,
    "claude-sonnet-4-5": claude,
}

CLOUD_EXTRACTORS = {
    "gemini-2.5-flash-lite": gemini,
}

# Upper bound on how many job postings one run processes.
MAX_JOBS = 30

# Configure
config = ExperimentConfig(
    local_models=[
        "qwen3:1.7b-q4_K_M",
        "qwen3:4b-q4_K_M",
        "qwen3:4b-instruct-2507-q4_K_M",
        "qwen3:8b-q4_K_M",
        "qwen3:14b-q4_K_M",
        "qwen3:30b",
    ],
    cloud_models=["gemini-2.5-flash-lite"],
    judge_models=["claude-sonnet-4-5", "gpt-4o"],
    temperature=0.0,  # Deterministic outputs
    # Cap by what was actually loaded: if the CSV yields fewer than MAX_JOBS
    # rows, a hard-coded 30 would make every total printed below wrong.
    num_jobs=min(MAX_JOBS, len(jobs_list)),
)

# Fail fast on a misspelled model name instead of discovering it part-way
# through a long run.
# NOTE(review): assumes ExperimentRunner resolves the configured cloud/judge
# names through the two dicts above — confirm against src/runner.py.
unknown_models = sorted(
    (set(config.cloud_models) - set(CLOUD_EXTRACTORS))
    | (set(config.judge_models) - set(JUDGES))
)
if unknown_models:
    raise ValueError(f"No client registered for configured model(s): {unknown_models}")

# Derive the workload figures once so the printed lines cannot drift apart.
num_extractors = len(config.local_models) + len(config.cloud_models)
extractions_per_prompt = num_extractors * config.num_jobs
total_extractions = extractions_per_prompt * 2  # two prompt versions
total_evaluations = total_extractions * len(config.judge_models)

print("=== EXPERIMENT CONFIGURATION ===")
print(f"Testing {len(config.local_models)} local models: {config.local_models}")
print(f"Testing {len(config.cloud_models)} cloud models: {config.cloud_models}")
print(f"Using {len(config.judge_models)} judges: {config.judge_models}")
print("Testing 2 prompt versions: direct, examples")
print(f"Processing {config.num_jobs} jobs")
print(f"Total extractions per prompt: {extractions_per_prompt}")
print(f"Total extractions (both prompts): {total_extractions}")
print(f"Total evaluations: {total_evaluations}")
print("\n" + "="*50 + "\n")

# Run experiment with BOTH prompts
runner = ExperimentRunner(
    config,
    jobs_list,
    JUDGES,
    CLOUD_EXTRACTORS,
    EXTRACTION_PROMPT_DIRECT,
    EXTRACTION_PROMPT_EXAMPLES,
)
results_df = runner.run_experiment()

print("\n" + "="*50)
print("EXPERIMENT COMPLETED!")
print("="*50 + "\n")

# Save results with ALL new columns: once as CSV and once as a pickle of the
# same frame. Both files are overwritten on every run.
results_df.to_csv("extraction_experiment_results.csv", index=False, escapechar='\\')
results_df.to_pickle("extraction_experiment_results.pkl")
print("✓ Results saved to: extraction_experiment_results.csv and .pkl")
print("✓ New columns included: prompt_version, input_tokens, output_tokens, description_length, description_language")



=== EXPERIMENT CONFIGURATION ===
Testing 6 local models: ['qwen3:1.7b-q4_K_M', 'qwen3:4b-q4_K_M', 'qwen3:4b-instruct-2507-q4_K_M', 'qwen3:8b-q4_K_M', 'qwen3:14b-q4_K_M', 'qwen3:30b']
Testing 1 cloud models: ['gemini-2.5-flash-lite']
Using 2 judges: ['claude-sonnet-4-5', 'gpt-4o']
Testing 2 prompt versions: direct, examples
Processing 30 jobs
Total extractions per prompt: 210
Total extractions (both prompts): 420
Total evaluations: 840



COMBINATION 1/14: qwen3:1.7b-q4_K_M + direct

  [1/30] Senior Machine Learning Engineer, Ads R&D...
    ✓ Extracted in 3.06s (1261→295 tokens)
    ✓ Extracted in 3.06s (1261→295 tokens)

  [2/30] Data Engineer...

  [2/30] Data Engineer...
    ✓ Extracted in 3.77s (1380→181 tokens)
    ✓ Extracted in 3.77s (1380→181 tokens)

  [3/30] Data Scientist...

  [3/30] Data Scientist...
    ✓ Extracted in 3.70s (1139→365 tokens)
    ✓ Extracted in 3.70s (1139→365 tokens)

  [4/30] Data Scientist, Music Expression...

  [4/30] Data Scientist, Music Expression..

In [50]:
# Analyze
# Hand the collected results to analyze_results (imported from src); the
# summary it prints appears as this cell's output.
analyze_results(results_df)


=== MODEL PERFORMANCE BY JUDGE & PROMPT ===

--- Judge: claude-sonnet-4-5 ---
                                             completeness_score        \
                                                           mean   std   
extraction_model              prompt_version                            
gemini-2.5-flash-lite         direct                       8.37  0.61   
                              examples                     8.53  0.57   
qwen3:1.7b-q4_K_M             direct                       7.36  0.95   
                              examples                     6.17  1.47   
qwen3:14b-q4_K_M              direct                       8.43  0.63   
                              examples                     8.07  1.27   
qwen3:30b                     direct                        NaN   NaN   
                              examples                      NaN   NaN   
qwen3:4b-instruct-2507-q4_K_M direct                       8.57  0.50   
                              examples       