In [1]:
import os,sys
sys.path.insert(0,'../../libs')
import openai
from llm_utils import BSAgent
from data_utils import train_val_test_split,load_split_climate_data
from utils import download_hf_model
import pandas as pd
import re,json,copy
from tqdm import tqdm
from prompts import short_cot_pt,short_cot_pt_2label,long_cot_pt,long_cot_pt_2label,long_fewshotcot_pt_2label
import pprint
from pydantic import BaseModel
from typing import Literal

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# set up async process
import nest_asyncio
import asyncio
nest_asyncio.apply()
from llm_utils_async import AsyncBSAgent

In [2]:
from dotenv import load_dotenv
env_path = '../../../.env'
load_dotenv(dotenv_path=env_path)
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables. Please check your .env file.")


#### Download all models for evaluation

#### Define output data model 

In [3]:
class ClimateClassification(BaseModel):
    justification: str
    classification: Literal["favorable", "unfavorable", "neutral"]

class ClimateClassification_2label(BaseModel):
    justification: str
    classification: Literal["favorable", "unfavorable"]

#### Define classification function ; sync and async

In [4]:
def get_climate_classifications(agent, dataset, prompt_template):
    results = []
    for i in tqdm(range(len(dataset))):
        structured_prompt = copy.deepcopy(prompt_template)
        structured_prompt['user'] = structured_prompt['user'].format(PARAGRAPH=dataset.iloc[i].paragraph)
        try:
            response = agent.get_response_content(prompt_template=structured_prompt)
            response = agent.parse_load_json_str(response)
            results.append({
                'paragraph': dataset.iloc[i].paragraph,
                'true_label': dataset.iloc[i].label,
                'predicted_label': response.get('classification'),
                'justification': response.get('justification')
            })
        except Exception as e:
            print(f"Error processing row {i}: {str(e)}")
            results.append({
                'paragraph': dataset.iloc[i].paragraph,
                'true_label': dataset.iloc[i].label,
                'predicted_label': None,
                'justification': f"Error: {str(e)}"
            })
    return pd.DataFrame(results)

In [5]:
data_folder = '/ephemeral/home/xiong/data/Fund/Climate'
# data_path = os.path.join(data_folder,'Climate training paragraphs.csv')
# ds = load_split_climate_data(data_path,merge_neutral=True,verbose=True)
# ds['test'].to_csv(data_folder+'/test.csv')
# ds['validation'].to_csv(data_folder+'/validation.csv')
# ds['train'].to_csv(data_folder+'/train.csv')
test_data = pd.read_csv(data_folder+'/test.csv')
val_data = pd.read_csv(data_folder+'/validation.csv')
train_data = pd.read_csv(data_folder+'/train.csv')

#### Setup experiment scenarios

In [7]:
# Define model names, prompt templates, and API configuration
model_name_list = [
    'anthropic/claude-3-7-sonnet-latest', # Claude 3 Sonnet
    'google/gemini-2.0-flash'   # Google Gemini Flash
]

prompt_template_list = [long_fewshotcot_pt_2label, long_cot_pt_2label, short_cot_pt_2label]
# Create dictionary with model names as keys and prompt templates as values
experiment_dict = {}
for model in model_name_list:
    # Extract model name after the '/' character
    model_short_name = model.split('/')[-1]
    # Create nested dictionary for each prompt template with API configuration
    experiment_dict[model_short_name] = {
        'long_fewshot_cot': long_fewshotcot_pt_2label,
        'long_cot': long_cot_pt_2label,
        'short_cot': short_cot_pt_2label,
    }

#### Define model and initiate llm agent

In [8]:
from llm_utils_claude import ClaudeAgent



In [41]:
model_name = 'claude-3-7-sonnet-latest'
agent = ClaudeAgent(model=model_name,temperature=0.0,max_tokens=4000,api_key=os.getenv("CLAUDE_API_KEY"))
agent.connection_test('hi')

Hello! How can I assist you today? Whether you have a question, need information, or just want to chat, I'm here to help. What's on your mind?


In [42]:
# Get predictions for validation and test sets
experiment = experiment_dict[model_name]
for promt_type in experiment.keys():
    val_results = get_climate_classifications(agent, test_data, experiment[promt_type])
    val_results.to_csv(os.path.join( data_folder,'training_eval_results','{}_{}_val_results_v2.csv'.format(model_name,promt_type)))

    print("\nValidation Results: {} ; {}".format(model_name,promt_type))
    print(f"Total samples: {len(val_results)}")
    print(f"Successfully processed: {len(val_results[val_results.predicted_label.notna()])}")
    val_accuracy = (val_results['true_label'] == val_results['predicted_label']).mean()
    print(f"Validation Accuracy: {val_accuracy:.2%}")


100%|██████████| 108/108 [03:44<00:00,  2.08s/it]



Validation Results: claude-3-7-sonnet-latest ; long_fewshot_cot
Total samples: 108
Successfully processed: 108
Validation Accuracy: 79.63%


100%|██████████| 108/108 [04:25<00:00,  2.46s/it]



Validation Results: claude-3-7-sonnet-latest ; long_cot
Total samples: 108
Successfully processed: 108
Validation Accuracy: 78.70%


100%|██████████| 108/108 [03:05<00:00,  1.71s/it]


Validation Results: claude-3-7-sonnet-latest ; short_cot
Total samples: 108
Successfully processed: 108
Validation Accuracy: 74.07%





#### Try run with asyc clent

In [11]:
from openai import OpenAI

client = OpenAI(
    api_key=os.getenv("GEMINI_API_KEY"),
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

response = client.chat.completions.create(
    model="gemini-2.0-flash",
    n=1,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "hi"
        }
    ]
)

print(response.choices[0].message)

ChatCompletionMessage(content='Hi there! How can I help you today?\n', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)
