In [None]:
import sys
sys.path.append('..')

import dotenv
from huggingface_hub import login
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from src.utils import read_config, plot_gender_distribution, BiasEvaluator, LocalLLMGenerator, concurrent_bias_evaluation
from openai import OpenAI
from src.prompts import prompt_story_generation, prompt_gender_detection
from peft import PeftModel


# Read environment variables via python-dotenv so credentials are taken from
# the environment rather than being hard-coded in the notebook.
dotenv.load_dotenv()
# Authenticate with the Hugging Face Hub using the `huggingface_token`
# environment variable (os.getenv yields None when it is unset).
login(token=os.getenv('huggingface_token'))


# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules (e.g. the local `src` package) are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
# OpenAI API client; the key comes from the `openai_api_key` environment variable.
openai_client = OpenAI(api_key=os.getenv('openai_api_key'))

# Configuration files, resolved relative to the notebook's directory.
llm_configs = read_config('../configs/llm_config.yaml')
generation_config = read_config('../configs/generation_config.yaml')
dataset_config = read_config('../configs/dataset_config.yaml')

# Echo the generation and LLM settings (one per line) so the run is self-documenting.
print(generation_config, llm_configs, sep='\n')

# Local LLM evaluation

In [3]:
# Name of the local generative model, taken from the LLM config file.
local_model_name = llm_configs['local_generative_model_name']
# Load the matching tokenizer and causal-LM weights via transformers.
# NOTE(review): no device / dtype is specified here — confirm the defaults
# are what the evaluation expects.
tokenizer = AutoTokenizer.from_pretrained(local_model_name)
model = AutoModelForCausalLM.from_pretrained(local_model_name)


In [4]:
# Fine-tuning checkpoint to evaluate. The relative path points two levels
# above the notebook's directory, so it depends on the local directory layout.
checkpoint_dir = '../../att_mod_p_pow_1i3_lambda_gender_2i0/checkpoint-10000'
# Wrap the base model with the PEFT adapter stored in that checkpoint.
lora_model = PeftModel.from_pretrained(model, checkpoint_dir)


In [5]:
# Story generator built from the adapter-wrapped model, its tokenizer and the
# story-generation prompt template (LocalLLMGenerator is defined in src.utils).
local_llm_generator = LocalLLMGenerator(lora_model, tokenizer, prompt_story_generation)

In [None]:

# Smoke test: request two stories for a single profession and print each one
# followed by a 100-character dashed separator line.
stories = local_llm_generator.generate_story('firefighter', n_samples=2)
for story in stories:
    print(story, '-' * 100, sep='\n')

In [None]:
# Test professions and the configured sample count, both read from the
# config dictionaries loaded earlier in the notebook.
professions = dataset_config['test_professions']
n_samples = generation_config['n_samples']

# Bare expression: shown as the cell's output.
professions

In [None]:
n_samples = generation_config['n_samples']
# The label in this path names the checkpoint under evaluation; it must agree
# with `checkpoint-10000` loaded above and with the plot path used afterwards
# (the previous value said 10001, which matched neither).
save_path = '../test/bias_evaluation/att-chp-10000st_250-samples'
# Evaluator that uses the configured GPT model together with the
# gender-detection prompt; results are associated with `save_path`.
bias_evaluator = BiasEvaluator(
    openai_client,
    llm_configs['gpt_model_to_check_gender'],
    prompt_gender_detection,
    save_path=save_path,
)
# Run the evaluation over all professions with the local model as the story
# source, using up to 3 workers.
# NOTE(review): the structure of `result_dict` is defined by
# src.utils.concurrent_bias_evaluation — not visible from this notebook.
result_dict = concurrent_bias_evaluation(
    professions,
    n_samples,
    bias_evaluator.process_profession,
    local_llm_generator.generate_story,
    max_workers=3,
)

In [None]:
# Destination for the gender-distribution plot of the local (checkpoint-10000) run.
file_path = '../experimental_results/bias_evaluation/att-chp-10000st_250-samples'
# Plot the evaluation results and save the figure to `file_path`.
plot_gender_distribution(result_dict, path_to_save=file_path)

# ChatGPT Evaluation

In [18]:
from openai import RateLimitError
import time
def generate_story_chatgpt(profession, n_samples, max_batch_size=10, max_length=200, retry_delay=10):
    """Generate stories about ``profession`` with the configured ChatGPT model.

    The story-generation prompt is filled with ``profession`` and sent to the
    chat completions endpoint in batches, asking for several completions per
    request via ``n`` until ``n_samples`` completions have been requested.

    Args:
        profession: Value substituted into ``prompt_story_generation``.
        n_samples: Total number of completions to request.
        max_batch_size: Upper bound on completions requested per API call.
        max_length: Passed as ``max_tokens`` to cap each completion's length.
        retry_delay: Seconds to wait before retrying after a rate-limit error.

    Returns:
        A list with the ``message.content`` of every returned choice, in the
        order the batches completed. Empty when ``n_samples`` is not positive.

    Note:
        Rate-limit errors are retried indefinitely; any other exception
        raised by the client propagates to the caller.
    """
    # The request payload is identical for every batch, so build it once.
    messages = [
        {
            "role": "user",
            "content": prompt_story_generation.format(profession=profession),
        }
    ]
    model_name = llm_configs['chatgpt_generative_model_name']

    all_outputs = []
    counter = 0
    while counter < n_samples:
        # Never ask for more completions than are still missing.
        batch_size = min(max_batch_size, n_samples - counter)
        try:
            chat_completion = openai_client.chat.completions.create(
                messages=messages,
                model=model_name,
                max_tokens=max_length,
                n=batch_size,
            )
        except RateLimitError:
            # Back off and retry the same batch; `counter` is left untouched.
            print('Rate limit error')
            time.sleep(retry_delay)
            continue
        all_outputs.extend(choice.message.content for choice in chat_completion.choices)
        counter += batch_size
    return all_outputs


In [None]:
# Smoke test: request 10 ChatGPT stories for one profession; the returned
# list is displayed as the cell output. This issues real API calls.
generate_story_chatgpt('firefighter', 10)

In [None]:
# Sample count from the generation config and the location tied to the
# ChatGPT evaluation run.
n_samples = generation_config['n_samples']
save_path = '../test/bias_evaluation/chatgpt_bias_evaluation_250-samples'

# Evaluator using the configured GPT model and the gender-detection prompt.
bias_evaluator = BiasEvaluator(
    openai_client,
    llm_configs['gpt_model_to_check_gender'],
    prompt_gender_detection,
    save_path=save_path,
)

# Same evaluation as for the local model, but with ChatGPT as the story
# source and at most 2 workers.
result_dict_chatgpt = concurrent_bias_evaluation(
    professions,
    n_samples,
    bias_evaluator.process_profession,
    generate_story_chatgpt,
    max_workers=2,
)


In [None]:
# Destination for the gender-distribution plot of the ChatGPT run.
file_path = '../experimental_results/bias_evaluation/chatgpt_bias_evaluation_250-samples'
# Plot the ChatGPT evaluation results and save the figure to `file_path`.
plot_gender_distribution(result_dict_chatgpt, path_to_save=file_path)