In [None]:
import os

# Root of the local project checkout. Relative paths used later in this notebook
# (e.g. 'secrets.env', 'experiments/artifacts') resolve against it after the chdir.
# Set the PHD_CHATGPT_DIR environment variable to point at your own checkout
# instead of editing this cell; the original author's path is kept as the fallback.
PARENT_DIR = os.environ.get('PHD_CHATGPT_DIR', '/Users/henrygilbert/GitHub/phd_chatgpt')
os.chdir(PARENT_DIR)

In [None]:
from dotenv import load_dotenv
import mlflow
import numpy as np
import pandas as pd
from typing import List, Dict
import logging
import json

from chatgpt_util import ChatGPTUtil

# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. chatgpt_util) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

# Load environment variables from 'secrets.env' (path relative to the current working directory).
# NOTE(review): presumably holds the API credentials ChatGPTUtil needs — confirm.
load_dotenv('secrets.env')

In [None]:
def log_dict_artifact(d: Dict, artifact_name: str, artifact_path: str = 'experiments/artifacts'):
    """Write ``d`` to ``<artifact_path>/<artifact_name>.json`` and log that file to MLflow.

    Args:
        d: Dictionary to persist; it must be serializable by ``json.dump``.
        artifact_name: File name without the ``.json`` extension.
        artifact_path: Local directory for the JSON file. It is created
            (including parents) when it does not exist yet.

    Raises:
        TypeError: Propagated from ``json.dump`` when ``d`` contains values
            that are not JSON-serializable.
    """
    # A fresh checkout may not have the artifact directory yet; without this
    # the open() below fails with FileNotFoundError.
    if artifact_path:
        os.makedirs(artifact_path, exist_ok=True)

    # Build the path once so the file written and the file logged cannot drift apart.
    file_path = f'{artifact_path}/{artifact_name}.json'
    with open(file_path, 'w') as f:
        json.dump(d, f)
    mlflow.log_artifact(file_path)

In [None]:
# Select the MLflow experiment that runs from this notebook are recorded under.
mlflow.set_experiment("Experiment 1")
# Close any run left active by a previous execution of this cell, then open a fresh one;
# the params and artifacts logged in the cells below attach to this new run.
mlflow.end_run()
mlflow.start_run()

In [None]:
# Prompts chosen to span specific vs. abstract questions and specific vs. abstract answers.
prompts = [
    "How large is the earth?",  # Deliberately vague: the measure of "large" is left unspecified
    "How many grams are in a kilogram?",  # Specific question, specific answer
    "How many grams does the average dinner plate that is 12 inches in diameter weigh?",  # Specific question, more abstract answer
    "What is your definition of love?",  # Abstract question, abstract answer
    "If there are 5 apples and 3 oranges in a basket, how many fruits are in the basket?",  # Specific question, specific answer
    "What is the underlying, unknown probability distribution that drives economic markets?",  # Abstract question, abstract answer
]

# Record every prompt on the active MLflow run, keyed by its position in the list.
prompt_params = {
    f'prompt_{position}': text
    for position, text in enumerate(prompts)
}
mlflow.log_params(prompt_params)

In [None]:
# Sampling settings, recorded on the active MLflow run alongside the prompts.
num_samples = 100
max_tokens = 100
mlflow.log_param('num_samples', num_samples)
mlflow.log_param('max_tokens', max_tokens)

# Maps each prompt to the list of responses sampled for it.
prompt_to_response: Dict[str, List[str]] = {}
for prompt_idx, prompt in enumerate(prompts):
    print(f'Prompt {prompt_idx}: {prompt}')
    prompt_to_response[prompt] = ChatGPTUtil.get_text_response(
        prompt=prompt,
        samples=num_samples,
        max_tokens=max_tokens,
    )

# Persist the full prompt -> responses mapping as a JSON artifact of the run.
log_dict_artifact(prompt_to_response, 'prompt_to_response')

In [None]:
# Embed every sampled response, keyed by the response text itself.
response_to_embedding = {}
for prompt, responses in prompt_to_response.items():
    print(f'Prompt: {prompt}')
    for i, response in enumerate(responses):
        # Progress marker on every 10th response of the current prompt.
        if i % 10 == 0:
            print(f"        response {i}")
        # Identical response texts share one dict key, so re-embedding a repeated
        # text would only overwrite its own entry; skip the redundant request.
        # NOTE(review): assumes get_text_embedding depends only on the text — confirm.
        if response in response_to_embedding:
            continue
        response_to_embedding[response] = ChatGPTUtil.get_text_embedding(response)

# Persist the response -> embedding mapping as a JSON artifact of the run.
log_dict_artifact(response_to_embedding, 'response_to_embedding')