## LLM Prompt Engineering Pipeline
- Sampling data for prompt-engineering and few-shot examples
- Programming LLM prediction pipeline (per myth)

In [1]:
import numpy as np
import datetime
from tqdm import tqdm
import pandas as pd
import random
import json
import time
import os
from collections import defaultdict, Counter
from utils import prompts, EvaluatorHelper, GPTRequests
from openai import OpenAI, AzureOpenAI
from dotenv import dotenv_values
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Seed Python's built-in `random` module so anything drawn through it is repeatable.
# NOTE(review): only `random` is seeded here; numpy's RNG is not — confirm nothing downstream relies on it.
random.seed(42)
# Reload imported modules automatically before each cell runs, so edits to the local `utils` package are picked up.
%load_ext autoreload
%autoreload 2
# Do not truncate long cell values (e.g. long text columns) when displaying DataFrames.
pd.set_option('display.max_colwidth', None)

In [2]:
# Build the OpenAI client from the key stored in the local .env file.
# A missing OPENAI_KEY entry surfaces as a KeyError on the lookup below.
secrets = dotenv_values(".env")
api_key = secrets['OPENAI_KEY']
client = OpenAI(api_key=api_key)

In [3]:
# directories
# NOTE(review): absolute, user-specific path — the notebook only runs as-is on the machine that has it.
FULL_EVAL_DIR = '/home/hjung10/oud-audit/labeling-pipeline/myth-eval-data/evaluation_set/'

# evaluation CSV for each myth key M1..M8, named '<myth key>_evaluation_set.csv'
MYTH_TO_EVAL_FILE = {f'M{idx}': f'M{idx}_evaluation_set.csv' for idx in range(1, 9)}

In [4]:
def create_vid_to_label_eval(dataframe):
    """
    Returns the video_id to label mapping from the evaluation set.

    Args:
        dataframe: pandas DataFrame with a 'video_id' and a 'label' column.

    Returns:
        A defaultdict (without a default factory, so missing keys still raise
        KeyError) mapping each row's 'video_id' to its 'label'. When a
        video_id appears in several rows, the label of the last row wins.

    Raises:
        KeyError: if the frame has rows but lacks one of the two columns.
    """
    # zero rows -> empty mapping, without requiring the columns to exist
    if len(dataframe) == 0:
        return defaultdict()

    # Pair the two columns directly: iterrows() builds a Series per row and
    # can upcast values to a common dtype, which is slower and lossy.
    # The defaultdict type is kept so the return type is unchanged for callers.
    return defaultdict(None, zip(dataframe['video_id'], dataframe['label']))

In [None]:
### Videos to exclude from evaluation because they are used as few-shot examples
# Maps a myth key to the list of video ids passed to EvaluatorHelper.compute_results as the exclusion list.
# NOTE(review): the last key is 'M10', while MYTH_TO_EVAL_FILE and MYTH_TO_STATEMENT only define 'M1'..'M8';
# MYTH_TO_FEW_SHOT_EXCLUDE['M8'] therefore raises KeyError — confirm whether 'M10' is meant to be 'M8'.
MYTH_TO_FEW_SHOT_EXCLUDE = {
    'M1' : ['SjCZwqEE22Y', '7PT0gv6a97o', 'X3UKcHR-2uU', 'fTcGyWDDg5s', 'bMitni3tC-c'],
    'M2' : ['9TYr6sqDEfY', 'DyjRxf-aJN4', 'AnUN2Zs4Mnk', 'm_uV8UkTDKw', '-3G162dqVVI'],
    'M3' : ['Jc-buPCKisM', '0hR2Hwkhey8', 'UfQWOGOFNFA', 'JczoO7ogOS8', 'SjCZwqEE22Y'],
    'M4' : ['DyjRxf-aJN4', '7PT0gv6a97o', 'Qg7G0VTi3iY', 'OFGFeA6Ap7E', 'v4GnSSvcYys'],
    'M5' : ['zN9NDZ6lgaM', '7PT0gv6a97o', 'sZ5-i72Yl2Q', 'FmGalSsq63k', 'TnYHKxUHgCs'],
    'M6' : ['tzHKfZyevXo', 'eWdCJm9q1bw', '7gtWuoWGQWM', 'WNXieqey_iQ', 'SjCZwqEE22Y'],
    'M7' : ['QtRQ9UD7rpY', 'W-7_alg4I28', '0RkpSTlvvj0', '0y55ymuJ2K4', 'TP0ToVYXQ-k'],
    'M10' : ['6F6d10ggVDw', 'nmMCQ1y8l14', 'GI3blNNe56w', 'E9jKyHjPbUg', 'QtRQ9UD7rpY']
}

### Zero-Shot Evaluation

In [5]:
# myth statements
# Maps a myth key to the statement text that is inserted into the prompt for that myth.
MYTH_TO_STATEMENT = {
    'M1': 'Agonist therapy or medication-assisted treatment (MAT) for OUD is merely replacing one drug with another.',
    # NOTE(review): the M2 wording reads as if a conjunction is missing ("... treatable with medication [but] from a
    # self-imposed condition ..."). It is sent to the model verbatim, so confirm the intended text before editing it.
    'M2': 'People with OUD are not suffering from a medical DISEASE treatable with medication from a self-imposed condition maintained through the lack of moral fiber.', 
    'M3': 'The ultimate goal of treatment for OUD is abstinence from any opioid use (e.g., Taking medication is not true recovery).',
    'M4': 'Only patients with certain characteristics are vulnerable to addiction.',
    'M5': 'Physical dependence or tolerance is the same as addiction.',
    'M6': 'Detoxification for OUD is effective.',
    'M7': 'You should only take medication for a brief period of time.',
    'M8': 'Kratom is a non-addictive and safe alternative to opioids.'
}

# run configuration for the zero-shot evaluation
myth_key = 'M1'
model_name = 'gpt-4o-2024-08-06'  # e.g., gpt-4o-2024-08-06, gpt-4o-mini-2024-07-18
temperature = 0.2  # fixed based on prior works (which shows this as the optimal temperature for classification)
vid_to_output = defaultdict()

# prompt template: zero-shot here (the few-shot section uses prompts.few_shot_prompt instead)
prompt_chosen = prompts.zero_shot_prompt

# evaluation data for the selected myth and the JSON path the outputs are saved to
EVAL_DIR = f"{os.getcwd()}/evaluations/evaluation-set/"
df = pd.read_csv(f"{FULL_EVAL_DIR}{MYTH_TO_EVAL_FILE[myth_key]}")
save_file_dir_EVAL = f"{EVAL_DIR}{model_name}-{myth_key}-zero-shot-evaluation.json"

In [6]:
# reading the data & creating the evaluation dictionary
# (video_id -> label mapping built from `df`, the evaluation CSV loaded for the selected myth)
vid_to_label_eval = create_vid_to_label_eval(df)

In [None]:
# crafting the prompt
# Inputs to the helper: the evaluation rows, the chosen prompt template, the persona text and the myth statement.
# NOTE(review): the structure of `crafted_prompt` is defined in utils.EvaluatorHelper and is not visible here.
myth = MYTH_TO_STATEMENT[myth_key]
crafted_prompt = EvaluatorHelper.create_myth_specific_prompts(df, prompt_chosen, prompts.persona, myth)

In [None]:
# feeding the prompt into the LLM and collecting the output
# The call returns the per-video outputs plus the completion and prompt token totals.
# Arguments follow the same (client, prompts, model, temperature) form as the executed
# few-shot evaluation cell; the previous form here passed no client and was stale.
vid_to_output, total_completion_token, total_prompt_token = GPTRequests.evaluate_prompts(client, crafted_prompt, model_name, temperature)

In [1228]:
# Save the collected outputs to the JSON file at save_file_dir_EVAL
# NOTE(review): whatever extraction happens before saving is implemented in utils.GPTRequests — not visible here.
GPTRequests.extract_and_save_output(save_file_dir_EVAL, vid_to_output)

In [None]:
# rebuild the zero-shot output path for the current model / myth and load the saved outputs back in
save_file_dir_EVAL = f"{EVAL_DIR}{model_name}-{myth_key}-zero-shot-evaluation.json"
print(save_file_dir_EVAL)
with open(save_file_dir_EVAL, 'r') as json_file:
    vid_to_output = json.load(json_file)

In [None]:
# Score the loaded outputs against the evaluation labels, passing this myth's few-shot example videos as the exclusion list.
# NOTE(review): the meaning of the positional `False` flag is defined in utils.EvaluatorHelper — confirm.
EvaluatorHelper.compute_results(vid_to_output, vid_to_label_eval, False, MYTH_TO_FEW_SHOT_EXCLUDE[myth_key])

### Few-Shot Evaluation


In [7]:
# run configuration for the few-shot evaluation
myth_key = 'M2'
model_name = 'gpt-4o-2024-08-06'  # e.g., gpt-4o-2024-08-06
temperature = 0.2  # fixed based on prior works (which shows this as the optimal temperature for classification)
vid_to_output = defaultdict()

# prompt template: few-shot
prompt_chosen = prompts.few_shot_prompt

# directories
EVAL_DIR = f"{os.getcwd()}/evaluations/evaluation-set/"

# evaluation data for the selected myth and the JSON path the outputs are saved to
df = pd.read_csv(f"{FULL_EVAL_DIR}{MYTH_TO_EVAL_FILE[myth_key]}")
save_file_dir_EVAL = f"{EVAL_DIR}{model_name}-{myth_key}-few-shot-evaluation.json"

# video_id -> label mapping used when scoring
vid_to_label_eval = create_vid_to_label_eval(df)

In [8]:
# crafting the prompt
# Same helper call as the zero-shot section, with the few-shot examples passed as an extra argument.
myth = MYTH_TO_STATEMENT[myth_key]
print(myth)
# NOTE(review): the examples are hard-coded to M2 and must be changed by hand whenever myth_key changes.
few_shot_examples = prompts.M2_FEW_SHOT_EXAMPLES
crafted_prompt = EvaluatorHelper.create_myth_specific_prompts(df, prompt_chosen, prompts.persona, myth, few_shot_examples)


People with OUD are not suffering from a medical DISEASE treatable with medication from a self-imposed condition maintained through the lack of moral fiber.
The total input tokens: 1958069
The average input tokens: 6316.351612903226


In [9]:
# feeding the prompt into the LLM and collecting the output
# The call is timed with a wall-clock performance counter; it returns the per-video outputs
# plus the completion and prompt token totals.
start = time.perf_counter()
vid_to_output, total_completion_token, total_prompt_token = GPTRequests.evaluate_prompts(client, crafted_prompt, model_name, temperature)
end = time.perf_counter()
print(f"Elapsed time: {end - start:.6f} seconds")

0
30
60
90
120
150
180
210
240
270
300
Total Number of Input Token: 1931562
Total Number of Output Token: 61949
Elapsed time: 1042.112122 seconds


In [10]:
# sanity check: number of entries collected in vid_to_output
len(vid_to_output)

310

In [40]:
# Save the collected few-shot outputs to the JSON file at save_file_dir_EVAL
# NOTE(review): whatever extraction happens before saving is implemented in utils.GPTRequests — not visible here.
GPTRequests.extract_and_save_output(save_file_dir_EVAL, vid_to_output)

In [None]:
# rebuild the few-shot output path for the current model / myth and load the saved outputs back in
save_file_dir_EVAL = EVAL_DIR + model_name + '-' + myth_key + '-few-shot-evaluation.json'
with open(save_file_dir_EVAL, 'r') as json_file:
    vid_to_output = json.load(json_file)
print(save_file_dir_EVAL)

# Exclude the few-shot example videos of the myth actually being evaluated. A hard-coded
# M10 list was used here before, which left the current myth's own few-shot examples in
# the scored set whenever myth_key was anything else. An unknown myth_key raises KeyError.
# NOTE(review): the meaning of the positional `False` flag is defined in utils.EvaluatorHelper — confirm.
few_shot_exclude = MYTH_TO_FEW_SHOT_EXCLUDE[myth_key]
EvaluatorHelper.compute_results(vid_to_output, vid_to_label_eval, False, few_shot_exclude)