### GPT Batch Prompting for Synthetic Data Generation
- Gather video search results that have not been human-annotated (e.g., ~1450 videos)
- Employ GPT-4o (best-performing model) to generate synthetic labels over the video search results (Batch Prompting)
- Train DEBERTA model on the synthetic labels and evaluate over the expert human annotations

For recommendation results, we used this script to batch-prompt the many recommended videos, across all myths, whose labeling the DEBERTA models deferred to the LLM.

In [1]:
import numpy as np
import datetime
from tqdm import tqdm
import pandas as pd
import random
import math
import json
import re
import time
import os
from collections import defaultdict, Counter
from utils import prompts, EvaluatorHelper, GPTRequests
from openai import OpenAI, AzureOpenAI
from dotenv import dotenv_values
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

random.seed(42)
%load_ext autoreload
%autoreload 2
pd.set_option('display.max_colwidth', None)

In [610]:
# importing OpenAI Client
# The API key is looked up under 'OPENAI_KEY' in the values loaded from the local
# ".env" file, so the key itself never appears in the notebook.
secrets = dotenv_values(".env")
api_key = secrets['OPENAI_KEY']

# notebook-level client; the upload / batch-creation / status cells below use it directly
client = OpenAI(
    api_key=api_key    
)

### Gathering video recommendation results to label

In [611]:
def read_dict_from_json(filename):
    """
    Load a JSON file and verify that it maps string keys to list values.

    Args:
        filename (str): Path to the JSON file (read as UTF-8).

    Returns:
        dict: The parsed JSON object, returned unchanged once validated.

    Raises:
        ValueError: If the top-level JSON value is not an object, if a key is
            not a string, or if a value is not a list.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # Guard clause: anything other than a JSON object is rejected up front.
    if not isinstance(payload, dict):
        raise ValueError("The JSON content is not a dictionary.")

    # Entries are checked in file order; for each entry the key is checked before the value.
    for name, entries in payload.items():
        if not isinstance(name, str):
            raise ValueError("All keys in the dictionary must be strings.")
        if not isinstance(entries, list):
            raise ValueError("All values in the dictionary must be lists.")

    return payload

In [3]:
# directories containing the search results and annotated data
# NOTE(review): both values are absolute paths on one machine; consider deriving them from a configurable base directory.
# Despite the *_DIR name, RECOMMENDATION_DATA_DIR is a CSV file path (it is passed straight to pd.read_csv below).
RECOMMENDATION_DATA_DIR = '/home/hjung10/oud-audit/data-collection-pipeline/unpersonalized-recommendation-data/recommendation-video-metadata.csv'
# Folder prefix: the file name is appended by plain string concatenation, so the trailing '/' is required.
LLM_DEFERRED_DIR = '/home/hjung10/oud-audit/labeling-pipeline/recommendation-labels/'

In [630]:
# reading in the recommendation results & the videos that need to be labeled by the LLM
# drop_duplicates('video_id') keeps a single row per video id (the first occurrence, pandas' default)
recommendation_df = pd.read_csv(RECOMMENDATION_DATA_DIR).drop_duplicates('video_id')
# read_dict_from_json validates the file as {str: list}; judging by the file name this is
# presumably myth key -> video ids to send to the LLM (confirm against the file's producer)
loaded_data = read_dict_from_json(LLM_DEFERRED_DIR + "myth_to_video_id_for_llm.json")


### Synthetic Label Generation

In [612]:
def create_jsonl_batch(jsonl_filename, crafted_prompt, model_name, temperature, max_tokens=1000, mode="a"):
    """
    Write one chat-completion batch request per prompt to a JSONL file.

    Each line is a JSON object with 'custom_id', 'method', 'url' and 'body';
    the dictionary key of each prompt is used as the request's 'custom_id'.

    Args:
        jsonl_filename (str): Path of the JSONL file to write.
        crafted_prompt (dict): Maps a request id to the value placed under
            'messages' in the request body.
        model_name (str): Value placed under 'model' in the request body.
        temperature (float): Value placed under 'temperature' in the request body.
        max_tokens (int): Value placed under 'max_tokens'. Defaults to 1000,
            the value that used to be hard-coded.
        mode (str): File mode, "a" (append, the historical behaviour and
            default) or "w" (overwrite). With "a", calling this again for the
            same file adds the same custom ids a second time; pass "w" when
            regenerating a file from scratch.

    Raises:
        ValueError: If ``mode`` is neither "a" nor "w".
    """
    if mode not in ("a", "w"):
        raise ValueError("mode must be 'a' (append) or 'w' (overwrite).")

    with open(jsonl_filename, mode, encoding="utf-8") as f:
        # adding each JSONL line for prompt
        for vid, prompt in crafted_prompt.items():
            request_data = {
                'custom_id' : vid,
                'method' : 'POST',
                'url' : '/v1/chat/completions',
                'body' : {
                    'model': model_name,
                    'messages': prompt,
                    'max_tokens': max_tokens,
                    'temperature' : temperature
                }
            }

            f.write(json.dumps(request_data) + '\n')
    print(f"JSONL file '{jsonl_filename}' created successfully.")

In [689]:
def split_dict_into_chunks(data_dict, num_chunks):
    """
    Split a dictionary into at most N approximately equal-sized chunks.

    Items keep their insertion order. Every chunk except possibly the last
    holds ceil(len(data_dict) / num_chunks) items, so fewer than
    ``num_chunks`` chunks can come back (e.g. 5 items with num_chunks=4
    yields 3 chunks of sizes 2, 2, 1).

    Args:
        data_dict (dict): Dictionary to split.
        num_chunks (int): Upper bound on the number of chunks; must be >= 1.

    Returns:
        list[dict]: The chunks in order; an empty list for an empty input.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be a positive integer.")

    items = list(data_dict.items())
    # An empty input would give chunk_size == 0, and range() rejects a zero step.
    if not items:
        return []

    chunk_size = math.ceil(len(items) / num_chunks)
    return [dict(items[i:i + chunk_size]) for i in range(0, len(items), chunk_size)]

def create_jsonl_batches(base_filename, crafted_prompt, model_name, temperature, num_batches, max_tokens=1000):
    """
    Split the prompts into chunks and write one batch-request JSONL file per chunk.

    Files are named ``{base_filename}_batch{i}.jsonl`` with ``i`` starting at 1
    and are opened in write mode, so an existing file of the same name is
    replaced. The number of files equals the number of chunks returned by
    ``split_dict_into_chunks``.

    Args:
        base_filename (str): Prefix of every output file name.
        crafted_prompt (dict): Maps a request id (used as 'custom_id') to the
            value placed under 'messages' in the request body.
        model_name (str): Value placed under 'model' in the request body.
        temperature (float): Value placed under 'temperature' in the request body.
        num_batches (int): Number of chunks requested from ``split_dict_into_chunks``.
        max_tokens (int): Value placed under 'max_tokens'. Defaults to 1000,
            the value that used to be hard-coded.

    Returns:
        list[dict]: The chunks, in the same order as the files written.
    """
    chunks = split_dict_into_chunks(crafted_prompt, num_batches)

    for i, chunk in enumerate(chunks, start=1):
        batch_filename = f"{base_filename}_batch{i}.jsonl"
        # number of requests going into this file
        print(len(chunk))
        with open(batch_filename, "w", encoding="utf-8") as f:
            for vid, prompt in chunk.items():
                request_data = {
                    'custom_id': vid,
                    'method': 'POST',
                    'url': '/v1/chat/completions',
                    'body': {
                        'model': model_name,
                        'messages': prompt,
                        'max_tokens': max_tokens,
                        'temperature': temperature
                    }
                }
                f.write(json.dumps(request_data) + '\n')
        print(f"JSONL file '{batch_filename}' created successfully.")
    return chunks
        
def send_batch_requests(base_path, base_filename, num_batches, myth_key):
    """
    Upload the numbered batch files and create one batch job per file.

    For i in 1..num_batches the file ``{base_filename}_batch{i}.jsonl`` inside
    ``base_path`` is uploaded with purpose "batch" and a batch is created for
    the '/v1/chat/completions' endpoint. Uses the notebook-level ``client``.

    Args:
        base_path (str): Directory containing the batch files.
        base_filename (str): File-name prefix shared by the batch files.
        num_batches (int): Number of batch files to submit.
        myth_key (str): Myth identifier, used in the batch description and in
            the progress output.

    Returns:
        list: Ids of the created batches, in submission order. Earlier versions
        returned None, so callers that ignore the return value are unaffected.
    """
    batch_ids = []
    for i in range(1, num_batches + 1):
        print(i)

        batch_filename = f"{base_filename}_batch{i}.jsonl"
        batch_path = os.path.join(base_path, batch_filename)
        print(batch_path)

        # The context manager closes the handle once the upload call has returned.
        with open(batch_path, "rb") as batch_file:
            batch_input_file = client.files.create(
                file=batch_file,
                purpose="batch"
            )
        print(batch_input_file)

        batch_input_file_id = batch_input_file.id
        batch_result = client.batches.create(
            input_file_id=batch_input_file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "description": myth_key + ' recommendation labeling batch ' + str(i)
            })
        batch_ids.append(batch_result.id)

        print(batch_result.id)
        print(myth_key)
        # Report the batch id (needed to poll status); the file id is shown separately.
        print(f"Batch request submitted. ID: {batch_result.id} (input file: {batch_input_file_id})")
    return batch_ids

In [621]:
# myth statements
# Each statement is passed as-is to the prompt builder (EvaluatorHelper.create_myth_specific_prompts),
# so any edit to these strings changes the prompts.
# NOTE(review): the M2 statement reads as if a word is missing ("... treatable with medication
# [but] from a self-imposed condition ..."); left untouched here — confirm the intended wording.
MYTH_TO_STATEMENT = {
    'M1': 'Agonist therapy or medication-assisted treatment (MAT) for OUD is merely replacing one drug with another.',
    'M2': 'People with OUD are not suffering from a medical DISEASE treatable with medication from a self-imposed condition maintained through the lack of moral fiber.', 
    'M3': 'The ultimate goal of treatment for OUD is abstinence from any opioid use (e.g., Taking medication is not true recovery).',
    'M4': 'Only patients with certain characteristics are vulnerable to addiction.',
    'M5': 'Physical dependence or tolerance is the same as addiction.',
    'M6': 'Detoxification for OUD is effective.',
    'M7': 'You should only take medication for a brief period of time.',
    'M8': 'Kratom is a non-addictive and safe alternative to opioids.'
}

# Few-shot examples for each myth, taken from the `prompts` module.
# NOTE(review): the previous comment described these as videos to exclude because they appear in the
# few-shot examples; no exclusion step is visible in this notebook — confirm where (or whether) it happens.
MYTH_TO_FEW_SHOT = {
    'M1' : prompts.M1_FEW_SHOT_EXAMPLES,
    'M2' : prompts.M2_FEW_SHOT_EXAMPLES,
    'M3' : prompts.M3_FEW_SHOT_EXAMPLES,
    'M4' : prompts.M4_FEW_SHOT_EXAMPLES,
    'M5' : prompts.M5_FEW_SHOT_EXAMPLES,
    'M6' : prompts.M6_FEW_SHOT_EXAMPLES,
    'M7' : prompts.M7_FEW_SHOT_EXAMPLES,
    'M8' : prompts.M8_FEW_SHOT_EXAMPLES
}

# variables
model_name = 'gpt-4o-2024-08-06' 
temperature = 0.2   # fixed based on prior work (which reports this as the optimal temperature for classification)
# myth key -> id of the batch submitted for it; created without a default factory,
# so it behaves like a plain dict (missing keys raise KeyError)
myth_to_batch_id = defaultdict()

## Rerun starting from here (change myth_key)

In [None]:
# creating the dataframe containing only the recommendation results that were meant to be LLM-cascaded, i.e. handled by GPT-4o
# NOTE(review): diff_result_for_prompting is not defined in any cell visible in this notebook; it must map
# myth key -> collection of video ids for the .isin filter to work. Presumably it is derived from
# loaded_data — confirm and define it explicitly so the notebook survives Restart & Run All.
myth_key = 'M4'  # <---- ONLY CHANGE THIS
unannotated_recommendation_myth_df = recommendation_df.loc[recommendation_df['video_id'].isin(diff_result_for_prompting[myth_key])]
print(unannotated_recommendation_myth_df.shape)
print(unannotated_recommendation_myth_df.head(1)['video_id'])

In [703]:
# building the prompts for the selected myth (the JSONL file itself is written in a later cell)
# NOTE(review): the result is indexed by video id below and iterated with .items() by the JSONL writers,
# so it is presumably {video_id: chat messages} — confirm against EvaluatorHelper.
crafted_prompt = EvaluatorHelper.create_myth_specific_prompts(unannotated_recommendation_myth_df, prompts.few_shot_prompt, prompts.persona, MYTH_TO_STATEMENT[myth_key], MYTH_TO_FEW_SHOT[myth_key])
print(len(crafted_prompt))
# spot-check a single prompt; the video id is hard-coded, so this lookup only works when that
# video is part of the currently selected myth's set
crafted_prompt['V3ZXAxfLapM']

The total input tokens: 99290135
The average input tokens: 11869.71129707113
8365


[{'role': 'system',
  'content': 'You are a public health expert with specialized knowledge of opioid use disorder (OUD) and medication-assisted treatment (MAT). You are well-versed in the common myths and misconceptions about OUD and MAT that circulate on online platforms like YouTube. Your task is to analyze the provided YouTube video metadata and identify the presence of the specified OUD-related myth within the video metadata.'},
 {'role': 'user',
  'content': 'Using the provided LABEL DESCRIPTIONS, evaluate the YOUTUBE VIDEO METADATA based on its stance towards the  MYTH and justify your label with brief EXCERPT(s) from the metadata. While evaluating the video, assign one of three labels based on the video\'s stance towards the specified MYTH: label it as "Supports the OUD myth (1)" if it supports or promotes the MYTH, including subtle undertones and implications; "Opposes the OUD myth (-1)" if it debunks or opposes the MYTH; and "Neither (0)" if it is neither supporting nor oppos

In [None]:
# creating JSONL batch
# NOTE: create_jsonl_batch opens the file in append mode, so re-running this cell for the same
# myth adds the same requests to the existing file again; delete the file first when regenerating.
jsonl_filename = os.getcwd() + os.sep + 'batch_requests' + os.sep + 'batch_requests_recommendation_LLM_cascaded_' + myth_key + '.jsonl'
create_jsonl_batch(jsonl_filename, crafted_prompt, model_name, temperature)

# uploading the JSONL batch (the context manager closes the file handle after the upload)
with open(jsonl_filename, "rb") as jsonl_file:
    batch_input_file = client.files.create(
        file=jsonl_file,
        purpose="batch"
    )

# creating the batch
batch_input_file_id = batch_input_file.id
batch_result = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": myth_key + ' recommendation labeling retry'
    })
# remember the batch id per myth so the status cell can poll it
myth_to_batch_id[myth_key] = batch_result.id
print(myth_key)
# report the batch id (what the status check needs); the uploaded file id is shown separately
print(f"Batch request submitted. ID: {batch_result.id} (input file: {batch_input_file_id})")

In [None]:
# checking the status of batch
# The loop variable is deliberately not named `myth_key`: reusing that name would overwrite the
# notebook-level myth selector with the last key of myth_to_batch_id after this cell runs.
for submitted_myth, batch_id in myth_to_batch_id.items():
    print(submitted_myth)
    batch_status = client.batches.retrieve(batch_id)
    print(f"Batch status: {batch_status.status}")

### Checking files + extracting synthetic labels

In [805]:
def parse_json_string(json_string):
    """
    Parse model output into a Python object, tolerating code fences and one known formatting slip.

    Steps:
      1. Strip surrounding whitespace and remove a line-leading "```json" marker
         and any line-ending "```" marker.
      2. Try to parse the result as JSON.
      3. On failure, insert ': "' between a double-quoted token and a letter that
         directly follows it (optionally separated by whitespace), print the
         repaired text, and try once more.

    Returns:
        The parsed JSON value, or None when both attempts fail.
    """
    cleaned = re.sub(r'^```json|```$', '', json_string.strip(), flags=re.MULTILINE).strip()

    # First attempt: the cleaned text is already valid JSON.
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as first_error:
        print(f"JSON Decode Error: {first_error}. Attempting fix...")

    # Second attempt: repair a quoted key that is followed directly by an unquoted word.
    repaired = re.sub(r'("[^"]+")\s*([a-zA-Z])', r'\1: "\2', cleaned)
    print(repaired)

    try:
        parsed = json.loads(repaired)
    except json.JSONDecodeError:
        parsed = None
    return parsed

def extract_labels(json_output):
    """
    Turn a parsed model response into a numeric stance label.

    Args:
        json_output: Parsed response; either a mapping with a 'LABEL' entry or
            a list whose first element is such a mapping.

    Returns:
        int or None: -1, 0 or 1 depending on which of '-1', '0', '1' occurs in
        the label text (checked in that order, so '-1' is not mistaken for
        '1'). None when the label cannot be read or contains none of them.
    """
    # extracting the label
    try:
        if isinstance(json_output, list):
            json_output = json_output[0]

        label = str(json_output['LABEL'])
    except Exception as e:
        print("label extraction error")
        print(json_output)
        print(e)
        # Without this return the code below would reference an unbound `label`
        # and raise UnboundLocalError instead of reporting "no label".
        return None

    # standardizing the label
    if '-1' in label:
        return -1
    elif '0' in label:
        return 0
    elif '1' in label:
        return 1
    else:
        print("Error parsing the label")
        print(label)
        return None
    
def save_dict_to_jsonl(data: dict, filename: str):
    """
    Write a dictionary to disk in JSONL form.
    Every key-value pair becomes its own single-entry JSON object on its own line;
    an existing file with the same name is overwritten.

    :param data: Dictionary to save.
    :param filename: Name of the JSONL file.
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        for pair in data.items():
            # dict([pair]) rebuilds the one-entry mapping {key: value}
            out_file.write(json.dumps(dict([pair])))
            out_file.write("\n")

In [930]:
myth_key = 'M4'

# NOTE(review): vid_to_label is not created in this cell or in any cell visible in this notebook.
# It has to exist before this runs (presumably an empty dict that is reset for each myth and
# filled across that myth's batch files — confirm); otherwise every line below fails with a
# NameError, which is now printed instead of being silently swallowed.
file_path = myth_key + '-batch1-synthetic-recommendation-labels.jsonl'
with open(file_path, 'r', encoding='utf-8') as file:
    # extracting line by line
    for line in file:
        json_line = json.loads(line)
        generated_text = json_line['response']['body']['choices'][0]['message']['content']

        # obtaining vid and label
        vid = json_line['custom_id']
        try:
            json_generated_text = parse_json_string(generated_text)
            label = extract_labels(json_generated_text)
            vid_to_label[vid] = label
        except Exception as e:
            # `except Exception` (not a bare except) so that interrupting the kernel still stops
            # the loop; the error is printed so that skipped videos can be diagnosed.
            print(vid)
            print(f"{type(e).__name__}: {e}")

In [931]:
# sanity checks before saving: number of labeled videos and the label distribution
print(len(vid_to_label))
print(Counter(vid_to_label.values()))
# one {video_id: label} JSON object per line; an existing file with this name is overwritten
save_dict_to_jsonl(vid_to_label, myth_key + '-synthetic-labels-only.jsonl') 

8364
Counter({0: 7360, -1: 964, 1: 40})


### Extracting synthetic labels in file

In [477]:
# folders holding the evaluation sets and the saved synthetic labels
FULL_EVAL_DIR = '/home/hjung10/oud-audit/labeling-pipeline/myth-eval-data/evaluation_set/'
SYNTHETIC_LABEL_DIR = '/home/hjung10/oud-audit/labeling-pipeline/synthetic-labels-only/'

# per-myth file names for M1..M8, generated from the shared naming pattern
MYTH_TO_EVAL_FILE = {f'M{n}': f'M{n}_evaluation_set.csv' for n in range(1, 9)}

MYTH_TO_SYNTHETIC_LABEL_FILE = {f'M{n}': f'M{n}-synthetic-labels-only.jsonl' for n in range(1, 9)}

"""
Returns the video_id to label mapping from the evaluation set
"""
def create_vid_to_label_eval(dataframe):
    """
    Build the video_id -> label mapping from an evaluation-set dataframe.

    Reads the 'video_id' and 'label' columns. When a video id occurs in several
    rows, the label of the last such row is kept.
    """
    vid_to_label = defaultdict()
    # pair the two columns positionally rather than walking the frame row by row
    vid_to_label.update(zip(dataframe['video_id'], dataframe['label']))
    return vid_to_label

def load_jsonl_to_dict(file_path):
    """
    Merge every JSON object of a JSONL file into one dictionary.

    Each non-blank line is parsed as a JSON object and merged with
    ``dict.update``, so a key that appears on several lines keeps the value
    from the last one. Blank or whitespace-only lines (for example a trailing
    empty line) are skipped instead of raising a decode error.

    Args:
        file_path (str): Path to the JSONL file (read as UTF-8).

    Returns:
        dict: The merged key-value pairs.
    """
    data_dict = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            data_dict.update(json.loads(line))  # later lines override earlier ones
    return data_dict

In [None]:
# NOTE(review): SEARCH_RESULT_DATA_DIR is not defined in any cell visible in this notebook; it must be
# set (to a CSV path, since it goes straight into pd.read_csv) before this cell can run.
search_results_df = pd.read_csv(SEARCH_RESULT_DATA_DIR)

# myth key -> {video_id: label}, combining synthetic and evaluation-set labels
myth_to_mappings = dict()
for myth, file_name in MYTH_TO_EVAL_FILE.items():
    print(myth + ": " + file_name)
    
    # reading in the ground-truth annotations
    df = pd.read_csv(FULL_EVAL_DIR + file_name)
    vid_to_label_eval = create_vid_to_label_eval(df)
    
    # synthetic labels saved for this myth
    vid_to_output = load_jsonl_to_dict(SYNTHETIC_LABEL_DIR + MYTH_TO_SYNTHETIC_LABEL_FILE[myth])
        
    # update() runs after the synthetic labels are loaded, so for a video id present in both
    # sources the evaluation-set label replaces the synthetic one
    vid_to_output.update(vid_to_label_eval)
    myth_to_mappings[myth] = vid_to_output
    
    # adds one label column per myth; video ids missing from the mapping become NaN
    search_results_df[myth] = search_results_df['video_id'].map(myth_to_mappings[myth])

In [514]:
# keep only the first row for each video_id so the per-video tallies below count every video once
unique_df = search_results_df.drop_duplicates(subset='video_id')

In [493]:
# index=False: writing the row index would add one more 'Unnamed: 0'-style column every time the
# file is read back and saved again (the displayed table already carries two such columns).
search_results_df.to_csv('final-search-results-annotations.csv', index=False)

In [539]:
# For every unique video, look at its labels across the eight myth columns:
#   - both a supporting (1) and an opposing (-1) label -> "mixed"; its label counts are kept in vid_to_freq
#   - at least one supporting label and no opposing one -> counted as supporting
#   - at least one opposing label and no supporting one -> counted as opposing
vid_to_freq = defaultdict()
counter_support = counter_oppose = counter_mix = 0
myth_columns = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8']
for row_index, row in unique_df.iterrows():
    label_to_freq = Counter(row[column] for column in myth_columns)
    has_support = 1 in label_to_freq
    has_oppose = -1 in label_to_freq

    if has_support and has_oppose:
        counter_mix += 1
        vid_to_freq[row['video_id']] = label_to_freq
    elif has_support:
        counter_support += 1
    elif has_oppose:
        counter_oppose += 1

In [540]:
# number of videos that carry both a supporting (1) and an opposing (-1) label
len(vid_to_freq)

63

In [525]:
# spot check: rows for the query 'suboxone' sorted by 'rating', shown without the transcript column
search_results_df.loc[(search_results_df["query"] == 'suboxone') & (search_results_df["sort_by_filter"] == 'rating')].drop('transcript', axis=1)


Unnamed: 0.1,Unnamed: 0,ranking,query,sort_by_filter,video_id,video_url,video_title,video_description,channel_name,channel_id,...,fav,comments,M1,M2,M3,M4,M5,M6,M7,M10
2043,2043,1,suboxone,rating,0ilV-xKR3_Q,https://www.youtube.com/watch?v=0ilV-xKR3_Q,3 Reasons Why People Fail On Suboxone #Shorts,Recover from addiction by calling me: (800) 779-4715 Or visit: https://american-addiction.com/ ---- If you are new to this channel ...,Dr. B Addiction Recovery,UCNcMaWo7evcwxnQeErsFLmA,...,0,3,0,0,0,0,0,0,0,0
2044,2044,2,suboxone,rating,aGJA7WCNfcA,https://www.youtube.com/watch?v=aGJA7WCNfcA,"Fix Watery Glassy Eyes From Opiates, Suboxone, Kratom, 420, etc",I AM NOT A DR! This is NOT Medical advice. Please seek professional assistance if you are struggling with addiction. Everything ...,Kickin Addiction,UC8DGiYY6CTl9qhuU1KVgjvw,...,0,1,1,0,0,0,0,0,0,0
2045,2045,3,suboxone,rating,hLzJHTTgzfg,https://www.youtube.com/watch?v=hLzJHTTgzfg,Subs and Rockstar 😆 #fypシ #fy #comedy #wedorecover #recovery #addiction #suboxone,,Tyler Blaze,UCjfwov9dbVTtckSGHZEqigg,...,0,0,0,0,0,0,0,0,0,0
2046,2046,4,suboxone,rating,o8blbs71Kw8,https://www.youtube.com/watch?v=o8blbs71Kw8,SUBOXONE Two Ways To Get Off Without A Painful Withdrawal,I AM NOT A DR! This is NOT Medical advice. Please seek professional assistance if you are struggling with addiction. Everything ...,Kickin Addiction,UC8DGiYY6CTl9qhuU1KVgjvw,...,0,6,1,0,1,0,0,-1,1,0
2047,2047,5,suboxone,rating,B767CqpKw80,https://www.youtube.com/watch?v=B767CqpKw80,Suboxone &amp; Health Clinic TN,"Suboxone Services $150 Nashville, Tennessee.",Hope &' Healing LLC,UCws47A_EJGdc4J998ISajCw,...,0,0,0,0,0,0,0,0,0,0
2048,2048,6,suboxone,rating,vYFDRpzHO70,https://www.youtube.com/watch?v=vYFDRpzHO70,Kr8tom can get you off #suboxone when you’re ready #shorts,,ThaTeaBishhh,UCckDEdZw6rqReJbo9D2IAgA,...,0,0,0,0,0,0,0,0,0,0
2049,2049,7,suboxone,rating,tMPbazJJ_vw,https://www.youtube.com/watch?v=tMPbazJJ_vw,#Suboxone &amp; #methadone are -Bandaids on bullet wounds without a #RecoveryProgram -#daamnkam #dtdjo,,Nermak Black,UC834IpNx0zpaGbstVOxVCNQ,...,0,0,1,0,1,0,0,0,1,0
2050,2050,8,suboxone,rating,d4RoGdsOG1w,https://www.youtube.com/watch?v=d4RoGdsOG1w,Benzo vs Suboxone Which to quit first.,,Dr. B Addiction Recovery,UCNcMaWo7evcwxnQeErsFLmA,...,0,6,0,0,0,0,0,0,0,0
2051,2051,9,suboxone,rating,tzHKfZyevXo,https://www.youtube.com/watch?v=tzHKfZyevXo,YOU DO NOT NEED #suboxone TO GET OFF KR@TOM! That’s like throwing gas into the fire. #taper,WEDORECOVER 'In 2008. I was diagnosed with degenerative disc disease(L4-L5) and epilepsy. That began a whole slew of ...,ThaTeaBishhh,UCckDEdZw6rqReJbo9D2IAgA,...,0,1,1,1,1,1,1,1,1,1
2052,2052,10,suboxone,rating,mpxvJ0d0J9E,https://www.youtube.com/watch?v=mpxvJ0d0J9E,What Is Suboxone Abuse And Its Side Effects?,A person could misuse Suboxone by using it to relieve opioid withdrawal without a prescription and undergoing treatment for ...,Addiction 101,UCRO67vJS0M0pXZC-_xhsqSw,...,0,5,1,1,1,0,1,0,1,0


In [549]:
# For each mixed-label video, compare how many of its myth labels are opposing (-1)
# with how many are supporting (1).
leans_opposing = leans_supporting = tie = 0

for label_to_freq in vid_to_freq.values():
    n_oppose, n_support = label_to_freq[-1], label_to_freq[1]
    if n_oppose > n_support:
        leans_opposing += 1
    elif n_oppose < n_support:
        leans_supporting += 1
    else:
        tie += 1

print("Number of annotations leaning opposing: " + str(leans_opposing))
print("Number of annotations leaning supporting: " + str(leans_supporting))
print("Number of annotations tie: " + str(tie))

Number of annotations leaning opposing: 10
Number of annotations leaning supporting: 22
Number of annotations tie: 31
