This notebook calls an LLM to label YouTube comments using externally defined prompts.

It currently runs on sample data.

In [1]:
import pandas as pd
from multiprocessing import Pool, cpu_count


In [2]:
def parse_arguments(args=None):
    """Build the fully populated argument namespace for an LLM labeling run.

    When ``args`` is None the options are parsed from the command line;
    otherwise the given namespace is completed in place. Attributes that are
    already present on ``args`` are never overwritten.

    The namespace is completed with, in this order:

    * ``api_key`` -- read with ``read_api_key()`` when missing.
    * every non-dunder attribute of the module loaded from
      ``configs/constants_labeling.py`` (path relative to the working
      directory).
    * ``prompt_path`` / ``prompt_template`` -- when ``prompt_template`` is
      missing, the JSON file named by ``PROMPT_PATH`` is loaded and the entry
      stored under ``args.prompt_name`` becomes the template.
    * ``endpoint`` / ``api_version`` -- default to ``SANDBOX_ENDPOINT`` and
      ``SANDBOX_API_VERSION`` (``None`` when those are undefined).

    Args:
        args: Optional pre-built namespace (e.g. ``argparse.Namespace``).

    Returns:
        The completed namespace (the same object when ``args`` was given).

    Raises:
        ValueError: If a prompt template has to be loaded but no
            ``PROMPT_PATH`` is available.
        KeyError: If ``args.prompt_name`` is not a key of the prompt file.
    """
    import argparse
    import importlib.util
    import json

    if args is None:
        parser = argparse.ArgumentParser()
        parser.add_argument("--inp-col", type=str, required=True, help="Input column name in the DataFrame")
        parser.add_argument("--out-col", type=str, required=True, help="Output column name for LLM responses")
        parser.add_argument("--prompt-name", type=str, required=True, help="Name of the prompt to use")
        parser.add_argument("--llm-name", type=str, default="gpt-4o-mini", help="Name of the LLM to use")
        parser.add_argument("--debug-mode", action="store_true", help="Run in debug mode with sequential processing")

        args = parser.parse_args()

    # Read API key if not already set
    if not hasattr(args, "api_key"):
        args.api_key = read_api_key()

    # Load the config file as a throwaway module (it is not registered in
    # sys.modules) so its constants can be merged into args.
    config_path = "configs/constants_labeling.py"
    spec = importlib.util.spec_from_file_location("constants_labeling", config_path)
    config = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(config)

    # Explicitly supplied arguments take precedence over config values.
    for attr in dir(config):
        if not attr.startswith("__") and not hasattr(args, attr):
            setattr(args, attr, getattr(config, attr))

    # Load prompt template if not already set
    if not hasattr(args, "prompt_template"):
        args.prompt_path = getattr(args, "PROMPT_PATH", None)
        if args.prompt_path is None:
            # Fail with a clear message instead of the TypeError open(None) gives.
            raise ValueError(
                f"No prompt template given and PROMPT_PATH is not defined in {config_path!r}"
            )
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(args.prompt_path, "r", encoding="utf-8") as f:
            prompt_templates = json.load(f)
        try:
            args.prompt_template = prompt_templates[args.prompt_name]
        except KeyError:
            raise KeyError(
                f"Prompt {args.prompt_name!r} not found in {args.prompt_path!r}; "
                f"available prompts: {sorted(prompt_templates)}"
            ) from None

    # Set endpoint and API version if not already set
    if not hasattr(args, "endpoint"):
        args.endpoint = getattr(args, "SANDBOX_ENDPOINT", None)
    if not hasattr(args, "api_version"):
        args.api_version = getattr(args, "SANDBOX_API_VERSION", None)

    return args

def read_api_key(api_key_path="openai_api_key.txt"):
    """Return the contents of the API key file with surrounding whitespace removed.

    Args:
        api_key_path: Path of the text file holding the key.

    Returns:
        The file's text, stripped of leading and trailing whitespace.
    """
    with open(api_key_path, "r") as key_file:
        raw_key = key_file.read()
    return raw_key.strip()

def _worker(args_tuple):
    """Label a single sentence with one chat-completion request.

    The arguments arrive as one tuple so the function can be passed directly
    to ``Pool.map``.

    Args:
        args_tuple: ``(sentence, prompt_template, model_to_be_used, api_key,
            endpoint, api_version, prompt_name)``. ``prompt_template`` is
            filled in with ``str.format(sentence=sentence)``.

    Returns:
        The message content of the first choice in the response, or ``""``
        when anything fails. Failures are logged as warnings rather than
        raised so that one bad row does not abort the whole batch.
    """
    import logging

    sentence, prompt_template, model_to_be_used, api_key, endpoint, api_version, prompt_name = args_tuple
    try:
        # Imported here so the client is created inside the worker process.
        from openai import AzureOpenAI
        client = AzureOpenAI(api_key=api_key, azure_endpoint=endpoint, api_version=api_version)
        prompt = [
            {"role": "system", "content": "You are helping in scientific analysis, so please be precise."},
            {"role": "user", "content": prompt_template.format(sentence=sentence)}
        ]
        # Adjust max_tokens based on prompt_name and model_to_be_used:
        # "eval" prompts get a 1-token budget (5 for Mistral models),
        # everything else gets 1000.
        if "eval" in prompt_name and "Mistral" in model_to_be_used:
            max_tokens = 5
        elif "eval" in prompt_name:
            max_tokens = 1
        else:
            max_tokens = 1000
        response = client.chat.completions.create(
            model=model_to_be_used,
            temperature=0.0,
            max_tokens=max_tokens,
            top_p=1.0,  # Ensure top_p is 1.0 for greedy sampling
            messages=prompt
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort: keep going, but leave a trace so a systematic failure
        # (bad template, wrong endpoint, missing package) is not mistaken for
        # genuinely empty model output.
        logging.getLogger(__name__).warning(
            "LLM call failed for sentence %.80r: %s: %s", sentence, type(e).__name__, e
        )
        return ""

def call_llm_on_sentences(sentences, args, batch_size=32):
    """Run ``_worker`` on every sentence and return the results in input order.

    Args:
        sentences: Iterable of inputs, one per LLM call.
        args: Namespace providing ``prompt_template``, ``llm_name``,
            ``api_key``, ``endpoint``, ``api_version``, ``prompt_name`` and
            ``debug_mode``.
        batch_size: Upper bound on the number of worker processes; the actual
            count is also capped by ``cpu_count()`` and the number of
            sentences.

    Returns:
        A list with one result per sentence.

    Raises:
        ValueError: From ``Pool`` when ``batch_size`` is less than 1 and
            there is work to do in parallel mode.
    """
    args_list = [
        (sent, args.prompt_template, args.llm_name, args.api_key, args.endpoint, args.api_version, args.prompt_name)
        for sent in sentences
    ]
    if args.debug_mode:
        # Sequential processing keeps everything in this process for debugging.
        return [_worker(arg) for arg in args_list]
    if not args_list:
        # Nothing to label: do not start a process pool for zero tasks.
        return []
    # Parallel processing; never start more processes than there are tasks.
    n_procs = min(cpu_count(), batch_size, len(args_list))
    with Pool(processes=n_procs) as pool:
        return pool.map(_worker, args_list)

def process_comments_with_llm(comments_df, args, batch_size=32):
    """
    Process comments in comments_df using an LLM and store the responses in a new column.

    The input text is read from the column named by ``args.inp_col``; when
    ``args`` has no ``inp_col`` attribute the ``'Comments'`` column is used.

    Args:
        comments_df (pd.DataFrame): DataFrame containing comments to process.
            It is modified in place.
        args: Arguments required for the LLM call. ``args.out_col`` names the
            column that receives the responses.
        batch_size (int): Number of processes to run in parallel.

    Returns:
        pd.DataFrame: The same DataFrame object, with the responses stored in
        ``args.out_col``.
    """
    # Honour the configured input column instead of a hard-coded name.
    inp_col = getattr(args, "inp_col", "Comments")

    # Extract the comments as a list
    sentences = comments_df[inp_col].tolist()

    # Call the LLM on the sentences
    responses = call_llm_on_sentences(sentences, args, batch_size=batch_size)

    # Add the responses as a new column in the DataFrame
    comments_df[args.out_col] = responses

    return comments_df

In [3]:
# Load the sample comments and discard rows that have no comment text.
comments_df = pd.read_excel('data/Comments_Dataset.xlsx').dropna(subset=['Comments'])

# # iterate through df based on 'Song Name'
# for song_name, group in comments_df.groupby('Song Name'):
#     print(f"Processing comments for song: {song_name}")
#     print(group.shape)



In [None]:
import argparse

# Build the run configuration by hand instead of parsing the command line.
args = argparse.Namespace(
    **{
        "inp_col": "Comments",
        "prompt_name": "eval_music_rel",
        "llm_name": "gpt-4o-mini",
        "debug_mode": True,
    }
)
# The output column is named after the prompt and the model that produced it.
args.out_col = f"{args.prompt_name}_{args.llm_name}"
# Fill in API key, config constants, prompt template and endpoint settings.
args = parse_arguments(args)

# Label every comment and keep the responses in the DataFrame.
comments_df = process_comments_with_llm(comments_df, args, batch_size=16)