In [1]:
import os,sys,time
sys.path.insert(0,'../../libs')
import openai
from llm_utils import BSAgent
from data_utils import train_val_test_split,load_split_climate_data
from utils import download_hf_model
import pandas as pd
import re,json,copy
from tqdm import tqdm
from prompts import short_cot_pt,short_cot_pt_2label,long_cot_pt,long_cot_pt_2label,long_fewshotcot_pt_2label
import pprint
from pydantic import BaseModel
from typing import Literal

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# set up async process
import nest_asyncio
import asyncio
nest_asyncio.apply()
from llm_utils_async import AsyncBSAgent

In [29]:
from dotenv import load_dotenv
env_path = '../../../.env'
load_dotenv(dotenv_path=env_path)
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables. Please check your .env file.")


#### Download all models for evaluation

In [4]:
## download models
model_name_list = ['Qwen/Qwen2.5-7B-Instruct','Qwen/Qwen2.5-32B-Instruct',
                   'meta-llama/Llama-3.1-8B-Instruct','meta-llama/Llama-3.1-70B-Instruct',
                   'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B','deepseek-ai/DeepSeek-R1-Distill-Llama-70B',
                   'Qwen/QwQ-32B','deepseek-ai/DeepSeek-R1-Distill-Llama-8B']
for model_name in model_name_list:
    # Create the target directory path
    target_dir = '/ephemeral/home/xiong/data/hf_cache/' + model_name
    # Check if model already exists before downloading
    if os.path.exists(target_dir) and os.listdir(target_dir):
        print(f"Model {model_name} already exists at {target_dir}, skipping download")
    else:
        print(f"Downloading model {model_name}...")
        download_hf_model(model_name, target_dir, hf_token=os.getenv('huggingface_token'))


Model Qwen/Qwen2.5-7B-Instruct already exists at /ephemeral/home/xiong/data/hf_cache/Qwen/Qwen2.5-7B-Instruct, skipping download
Model Qwen/Qwen2.5-32B-Instruct already exists at /ephemeral/home/xiong/data/hf_cache/Qwen/Qwen2.5-32B-Instruct, skipping download
Model meta-llama/Llama-3.1-8B-Instruct already exists at /ephemeral/home/xiong/data/hf_cache/meta-llama/Llama-3.1-8B-Instruct, skipping download
Model meta-llama/Llama-3.1-70B-Instruct already exists at /ephemeral/home/xiong/data/hf_cache/meta-llama/Llama-3.1-70B-Instruct, skipping download
Model deepseek-ai/DeepSeek-R1-Distill-Qwen-32B already exists at /ephemeral/home/xiong/data/hf_cache/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B, skipping download
Model deepseek-ai/DeepSeek-R1-Distill-Llama-70B already exists at /ephemeral/home/xiong/data/hf_cache/deepseek-ai/DeepSeek-R1-Distill-Llama-70B, skipping download
Model Qwen/QwQ-32B already exists at /ephemeral/home/xiong/data/hf_cache/Qwen/QwQ-32B, skipping download
Model deepseek-ai/D

#### Define output data model 

In [5]:
class ClimateClassification(BaseModel):
    justification: str
    classification: Literal["favorable", "unfavorable", "neutral"]

class ClimateClassification_2label(BaseModel):
    justification: str
    classification: Literal["favorable", "unfavorable"]

#### Define classification function ; sync and async

In [45]:
def get_climate_classifications(agent, dataset, prompt_template,wait_time=0,formated_output=True):
    results = []
    for i in tqdm(range(len(dataset))):
        structured_prompt = copy.deepcopy(prompt_template)
        structured_prompt['user'] = structured_prompt['user'].format(PARAGRAPH=dataset.iloc[i].paragraph)
        try:
            if formated_output:
                response = agent.get_response_content(prompt_template=structured_prompt, 
                                                    response_format=ClimateClassification_2label,
                                                    ) #max_tokens=4000
                results.append({
                    'paragraph': dataset.iloc[i].paragraph,
                    'true_label': dataset.iloc[i].label,
                    'predicted_label': response.classification,
                    'justification': response.justification
                })
            else:
                response = agent.get_response_content(prompt_template=structured_prompt)
                response = agent.parse_load_json_str(response)
                results.append({
                    'paragraph': dataset.iloc[i].paragraph,
                    'true_label': dataset.iloc[i].label,
                    'predicted_label': response.get('classification'),
                    'justification': response.get('justification')
                })                     

        except Exception as e:
            print(f"Error processing row {i}: {str(e)}")
            results.append({
                'paragraph': dataset.iloc[i].paragraph,
                'true_label': dataset.iloc[i].label,
                'predicted_label': None,
                'justification': f"Error: {str(e)}"
            })
    # Add a 2-second wait between API calls to avoid rate limiting
        time.sleep(wait_time)
    return pd.DataFrame(results)

In [7]:
async def async_get_climate_classifications(agent, dataset, prompt_template):
    async def process_row(i):
        structured_prompt = copy.deepcopy(prompt_template)
        structured_prompt['user'] = structured_prompt['user'].format(PARAGRAPH=dataset.iloc[i].paragraph)
        try:
            response = await agent.get_response_content(prompt_template=structured_prompt, response_format=ClimateClassification_2label)
            return {
                'paragraph': dataset.iloc[i].paragraph,
                'true_label': dataset.iloc[i].label,
                'predicted_label': response.classification,
                'justification': response.justification
            }
        except Exception as e:
            print(f"Error processing row {i}: {str(e)}")
            return {
                'paragraph': dataset.iloc[i].paragraph,
                'true_label': dataset.iloc[i].label,
                'predicted_label': None,
                'justification': f"Error: {str(e)}"
            }

    tasks = [process_row(i) for i in range(len(dataset))]
    results = await asyncio.gather(*tasks)
    return pd.DataFrame(results)

In [8]:
data_folder = '/ephemeral/home/xiong/data/Fund/Climate'
# data_path = os.path.join(data_folder,'Climate training paragraphs.csv')
# ds = load_split_climate_data(data_path,merge_neutral=True,verbose=True)
# ds['test'].to_csv(data_folder+'/test.csv')
# ds['validation'].to_csv(data_folder+'/validation.csv')
# ds['train'].to_csv(data_folder+'/train.csv')
test_data = pd.read_csv(data_folder+'/test.csv')
val_data = pd.read_csv(data_folder+'/validation.csv')
train_data = pd.read_csv(data_folder+'/train.csv')

#### Setup experiment scenarios

In [42]:
# Define model names, prompt templates, and API configuration
model_name_list = ['Qwen/Qwen2.5-7B-Instruct','Qwen/Qwen2.5-32B-Instruct',
                   'meta-llama/Llama-3.1-8B-Instruct','meta-llama/Llama-3.1-70B-Instruct',
                   'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B','deepseek-ai/DeepSeek-R1-Distill-Llama-70B',
                   'Qwen/QwQ-32B','deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
                   'openai/gpt-4o-mini','openai/gpt-4o','openai/o1-mini','openai/o3-mini',
                   'google/gemini-2.0-flash','anthropic/claude-3-7-sonnet-latest','deepseek/deepseek-R1']

prompt_template_list = [long_fewshotcot_pt_2label, long_cot_pt_2label, short_cot_pt_2label]
# Create dictionary with model names as keys and prompt templates as values
experiment_dict = {}
for model in model_name_list:
    # Extract model name after the '/' character
    model_short_name = model.split('/')[-1]
    # Create nested dictionary for each prompt template with API configuration
    experiment_dict[model_short_name] = {
        'long_fewshot_cot': long_fewshotcot_pt_2label,
        'long_cot': long_cot_pt_2label,
        'short_cot': short_cot_pt_2label,
    }

#### Define model and initiate llm agent

In [22]:
## test openai models 
model_name = 'o3-mini'
agent = BSAgent(model=model_name,
                base_url="https://api.openai.com/v1/",  
                api_key=os.getenv("OPENAI_API_KEY"),
                seed=42,
                temperature=1)
agent.connection_test('hi,who are you?')

Hi there! I'm ChatGPT, a language model developed by OpenAI. I'm here to help answer questions, provide information, and assist with whatever you need. How can I help you today?


In [11]:
## test claude models 
model_name = 'claude-3-7-sonnet-latest'
agent = BSAgent(model=model_name,
                base_url="https://api.anthropic.com/v1/",
                api_key=os.getenv("CLAUDE_API_KEY")
                )
agent.connection_test('hi,who are you?',max_tokens=4000)


I'm an AI assistant created by Anthropic to be helpful, harmless, and honest. I'm designed to have conversations, answer questions, provide information, and assist with various tasks through text-based communication. I don't have a personal identity or consciousness - I function as a large language model trained on a broad dataset of text. How can I help you today?


In [11]:
## test gemini models 
model_name = 'gemini-2.0-flash'
agent = BSAgent(model=model_name,
                base_url='https://generativelanguage.googleapis.com/v1beta/openai/',
                api_key=os.getenv("GEMINI_API_KEY"))
agent.connection_test('hi,who are you?')

I am a large language model, trained by Google.



In [31]:
## test deepseek r1 models 
model_name = 'deepseek-R1'
agent = BSAgent(model="deepseek-ai/DeepSeek-R1",
                base_url='https://api.netmind.ai/inference-api/openai/v1',
                api_key=os.getenv("NETMIND_API_KEY"))
agent.connection_test('hi,who are you?')

<think>
Okay, the user asked, "hi, who are you?" I need to respond in a friendly and informative way.

First, I should greet them back. "Hi there!" sounds good. Then, explain what I am. I'm an AI created by DeepSeek, so I should mention that. My purpose is to assist with information, answer questions, and help with tasks. I should keep it simple and welcoming. Maybe add a line about how I can help them today to invite further interaction. Let me check if that covers everything without being too technical. Yeah, that should work. Alright, time to put it all together.
</think>

Hi there! I'm DeepSeek-R1-Lite-Preview, an AI assistant created by DeepSeek. I'm here to help with answering questions, providing information, and assisting with tasks. How can I help you today?


In [12]:

# # Try other opens rousce modesl 
# # python -m vllm.entrypoints.openai.api_server --model /home/xiong/data/hf_cache/llama-3.1-8B-Instruct --dtype auto --servered_model_name llama-3.1-8b-Instruct

# base_url = 'http://localhost:8100/v1'
# api_key = 'abc'
# model_name = 'Qwen2.5-7B-Instruct'
# agent = BSAgent(base_url=base_url,
#                 api_key=api_key,
#                 model=model_name)
# agent.connection_test('hi')
# # agent.model = agent.client.models.list().data[0].id
# # print(agent.model) 

In [46]:
# Get predictions for validation and test sets
experiment = experiment_dict[model_name]
for promt_type in experiment.keys():
    if model_name == 'gemini-2.0-flash':
        wait_time=2
    else:
        wait_time=0
    if model_name == 'deepseek-R1':
        formated_output = False
    else:
        formated_output = True
    val_results = get_climate_classifications(agent, test_data, experiment[promt_type],wait_time,formated_output)
    val_results.to_csv(os.path.join( data_folder,'training_eval_results','{}_{}_val_results_v2.csv'.format(model_name,promt_type)))

    print("\nValidation Results: {} ; {}".format(model_name,promt_type))
    print(f"Total samples: {len(val_results)}")
    print(f"Successfully processed: {len(val_results[val_results.predicted_label.notna()])}")
    val_accuracy = (val_results['true_label'] == val_results['predicted_label']).mean()
    print(f"Validation Accuracy: {val_accuracy:.2%}")


100%|██████████| 108/108 [10:38<00:00,  5.91s/it]



Validation Results: deepseek-R1 ; long_fewshot_cot
Total samples: 108
Successfully processed: 108
Validation Accuracy: 81.48%


100%|██████████| 108/108 [10:17<00:00,  5.72s/it]



Validation Results: deepseek-R1 ; long_cot
Total samples: 108
Successfully processed: 108
Validation Accuracy: 75.00%


100%|██████████| 108/108 [09:23<00:00,  5.22s/it]


Validation Results: deepseek-R1 ; short_cot
Total samples: 108
Successfully processed: 108
Validation Accuracy: 75.00%





#### Try run with asyc clent

In [90]:
async_agent = AsyncBSAgent(model=model_name,
                     base_url=base_url,
                     api_key=api_key)
# Get predictions for validation and test sets
for promt_type in experiment.keys():
    train_results = asyncio.run(async_get_climate_classifications(async_agent, train_data, experiment[promt_type]))
    train_results.to_csv(os.path.join( data_folder,'training_eval_results','{}_{}_train_results_v2.csv'.format(model_name,promt_type)))
    print("\nTraining Results: {} ; {}".format(model_name,promt_type))
    print(f"Total samples: {len(train_results)}")
    print(f"Successfully processed: {len(train_results[train_results.predicted_label.notna()])}")
    train_accuracy = (train_results['true_label'] == train_results['predicted_label']).mean()
    print(f"Validation Accuracy: {train_accuracy:.2%}")


Training Results: Qwen2.5-7B-Instruct ; long_fewshot_cot
Total samples: 504
Successfully processed: 504
Validation Accuracy: 74.40%

Training Results: Qwen2.5-7B-Instruct ; long_cot
Total samples: 504
Successfully processed: 504
Validation Accuracy: 73.61%

Training Results: Qwen2.5-7B-Instruct ; short_cot
Total samples: 504
Successfully processed: 504
Validation Accuracy: 71.43%
