In [14]:
import requests
import pandas as pd
import json
import numpy as np
import concurrent.futures
from tqdm import tqdm
import time
import re
import logging
from openai import OpenAI
from dotenv import load_dotenv
import os
import torch
import argparse
import sys

In [15]:
### import the evaluation metrics from the helper open-instruct project ###
# NOTE(review): absolute, machine-specific path. The `eval.*` imports below
# presumably resolve through this entry, so they will fail on any machine where
# open-instruct is not checked out at this exact location.
eval_dir = r"D:\University of Illinois Chicago\Classes\CS421\GradResearch\open-instruct"
# The membership test keeps re-running this cell from appending duplicates.
if eval_dir not in sys.path:
    sys.path.append(eval_dir)

# Must come after the sys.path update above.
from eval.utils import load_hf_lm_and_tokenizer, generate_completions
from eval.templates import create_prompt_with_tulu_chat_format

print("Successfully imported modules from eval.")

Successfully imported modules from eval.


In [16]:
### Initialise logging
# Configure the root logger at INFO level, then create the module-level
# `logger` that the helper functions below use to report errors.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [17]:
### Initialize OpenAI client
# Load variables from a .env file into the process environment.
load_dotenv()

# The key is read from the environment so it never appears in the notebook.
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast with a clear message instead of failing at the first API call.
if not api_key:
    raise ValueError("API key not found! Please set the OPENAI_API_KEY environment variable.")

# Module-level client shared by get_answer().
client = OpenAI(api_key=api_key)

### Data Retrieval Functions

In [18]:
def get_sni_json(task_name, timeout=30):
    """Fetch a Natural Instructions task and return its instances as a DataFrame.

    Args:
        task_name: Task file name without the ``.json`` suffix, as found under
            ``tasks/`` in the allenai/natural-instructions repository.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A DataFrame built from the payload's ``"Instances"`` list, or an empty
        DataFrame if the request fails or the payload is not in that shape.
        Failures are logged, never raised.
    """
    url = f"https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/{task_name}.json"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        return pd.DataFrame(data["Instances"])
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data for task {task_name}: {e}")
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON (older requests versions raise it directly).
        # KeyError/TypeError: JSON parsed but has no "Instances" mapping entry.
        logger.error(f"Unexpected payload for task {task_name}: {e}")
    return pd.DataFrame()

In [19]:
def get_sni_json_wurl(url, timeout=30):
    """Fetch a Natural Instructions-style JSON document from ``url``.

    Args:
        url: Full URL of a JSON document that contains an ``"Instances"`` list.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A DataFrame built from the payload's ``"Instances"`` list, or an empty
        DataFrame if the request fails or the payload is not in that shape.
        Failures are logged, never raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        return pd.DataFrame(data["Instances"])
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from URL {url}: {e}")
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON (older requests versions raise it directly).
        # KeyError/TypeError: JSON parsed but has no "Instances" mapping entry.
        logger.error(f"Unexpected payload from URL {url}: {e}")
    return pd.DataFrame()

### Data Searching Functions

In [20]:
def search_dataframe_sw(df, query):
    """Return the rows of ``df`` whose 'input' value starts with ``query``.

    Rows whose 'input' is missing (NaN/None) are treated as non-matching.
    Without ``na=False`` the mask would contain missing values and boolean
    indexing with it would raise.

    Args:
        df: DataFrame with a string column named 'input'.
        query: Prefix to look for.

    Returns:
        The matching subset of ``df``, with the original index preserved.
    """
    mask = df['input'].str.startswith(query, na=False)
    return df[mask]

In [21]:
def search_dataframe_co(df, query):
    """Return the rows of ``df`` whose 'input' value contains ``query``.

    ``query`` is matched literally (``regex=False``). Rows whose 'input' is
    missing (NaN/None) are treated as non-matching. Without ``na=False`` the
    mask would contain missing values and boolean indexing with it would raise.

    Args:
        df: DataFrame with a string column named 'input'.
        query: Literal substring to look for.

    Returns:
        The matching subset of ``df``, with the original index preserved.
    """
    mask = df['input'].str.contains(query, regex=False, na=False)
    return df[mask]

### Utility Functions

In [22]:
def read_json_file(file_path):
    """Read a JSON file and return the decoded data.

    The file is opened as UTF-8 explicitly. The platform default (e.g. cp1252
    on Windows) would otherwise corrupt or reject non-ASCII content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or an empty dict if the file cannot be opened
        or parsed. Failures are logged, never raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        # Deliberately best-effort: callers get {} and a log entry.
        logger.error(f"Error reading JSON file {file_path}: {e}")
        return {}

In [23]:
def parse_answer(s, N):
    """Extract the text between ``<taskN>`` and ``<taskN/>`` in ``s``.

    ``s`` is converted with ``str()`` first, so non-string values are accepted.
    Only the first occurrence is used, and the match may span several lines.

    Returns:
        The enclosed text with surrounding whitespace removed, or None when
        the tag pair is not present.
    """
    tag_regex = re.compile(f'<task{N}>(.*?)<task{N}/>', re.DOTALL)
    found = tag_regex.search(str(s))
    if found is None:
        return None
    return found.group(1).strip()

In [24]:
def norm_text(text):
    """Normalize an answer string by evaluating it when it is a Python expression.

    Strings that contain "/" or start with "(" are returned untouched. Anything
    else is evaluated, and the ``str()`` of the result is returned (so "1+2"
    becomes "3" and "1.50" becomes "1.5"). Text that fails to evaluate is
    returned unchanged.

    Args:
        text: Answer string to normalize.

    Returns:
        The normalized string.
    """
    if "/" in text or text.startswith("("):
        return text
    try:
        # SECURITY NOTE(review): `text` typically comes from model output.
        # Emptying __builtins__ is NOT a sandbox: eval() can still execute
        # crafted expressions or burn CPU/memory (e.g. huge exponents).
        # Kept as-is because arithmetic normalization depends on it; consider
        # a restricted arithmetic parser if the input is untrusted.
        return str(eval(text, {"__builtins__": {}}, {}))
    except Exception:
        # `except Exception` rather than a bare `except` lets
        # KeyboardInterrupt and SystemExit propagate.
        return text

In [25]:
def calculate_accuracy(answer, prediction):
    """Score a sequence of predictions against the reference answers.

    Returns:
        A pair of arrays with one entry per step: a 1/0 flag for whether that
        step is correct, and the running product of those flags (1 only while
        every step so far has been correct).

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(answer) != len(prediction):
        raise ValueError("Answer and prediction must have the same length.")
    hits = []
    for expected, predicted in zip(answer, prediction):
        hits.append(int(expected == predicted))
    per_step = np.array(hits)
    return per_step, np.cumprod(per_step)

In [26]:
def get_score(df, data, tuid):
    """Average the per-step and cumulative accuracies over every row of ``df``.

    Each row supplies a ``uid`` (key into ``data[tuid]["instance"]``) and a
    ``generation`` from which one answer per step is parsed and normalized.

    Returns:
        Two arrays (step-wise accuracy, conditional accuracy). Their length is
        the number of answers of the first instance of the task.
    """
    instances = data[tuid]["instance"]
    first_instance = next(iter(instances.values()))
    width = len(first_instance["answers"])
    step_sum = np.zeros(width)
    cond_sum = np.zeros(width)

    for _, row in df.iterrows():
        gold = list(map(str, instances[row.uid]["answers"]))
        predicted = []
        for task_no in range(1, len(gold) + 1):
            # A missing tag counts as an empty prediction.
            raw = parse_answer(row.generation, task_no) or ""
            predicted.append(str(norm_text(raw)))
        per_step, per_prefix = calculate_accuracy(gold, predicted)
        step_sum += per_step
        cond_sum += per_prefix

    total_rows = len(df)
    return step_sum / total_rows, cond_sum / total_rows

### Instruction Reconstruction Functions

In [27]:
def reconstruct_instruction(data, num, full=True):
    """Build the instruction text for one benchmark instance.

    Args:
        data: Instance dict with optional 'query', 'instruction' and 'context'
            entries. 'instruction' and 'context' are keyed by strings.
        num: Step number. -1 means "all steps" and is replaced by the number
            of entries in ``data['instruction']``.
        full: When True, emit the query, the numbered instructions 1..num and
            every context whose key starts with one of those step numbers.
            When False, emit only instruction ``num`` and its contexts.

    Returns:
        The assembled prompt text.
    """
    if num == -1:
        num = len(data['instruction'])
    query = data.get('query', '')
    instructions = data.get('instruction', {})
    contexts = data.get('context', {})

    if not full:
        step_key = str(num)
        matching = [text for key, text in contexts.items() if key.startswith(step_key)]
        single_instruction = instructions.get(step_key, '')
        return f"{single_instruction}\n\n" + '\n'.join(matching)

    numbered_lines = []
    context_parts = []
    for step in range(1, num + 1):
        step_key = str(step)
        numbered_lines.append(f"#{step}: {instructions.get(step_key, '')}")
        # Contexts are matched by key prefix, so key "10" also matches step 1.
        for key, text in contexts.items():
            if key.startswith(step_key):
                context_parts.append(text)

    instruction_block = '\n'.join(numbered_lines)
    context_block = '\n'.join(context_parts)
    return f"{query}\n{instruction_block}\n\n{context_block}"

### GPT Interaction Functions

In [28]:
def get_answer(instance, model):
    """Send ``instance`` as a single user message to ``model`` and return the reply.

    Generation stops at the "### Task:" marker and samples at temperature 0.7.
    Any failure is logged and reported to the caller as an empty string.
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": instance}],
        "stop": "### Task:",
        "temperature": 0.7,
    }
    try:
        response = client.chat.completions.create(**request)
        first_choice = response.choices[0]
        return first_choice.message.content
    except Exception as e:
        logger.error(f"Error in get_answer: {e}")
        return ""

In [29]:
def gpt_multithread(gpt_input_list, model):
    """Run ``get_answer`` over all prompts concurrently, preserving input order.

    ``answers[i]`` is always the response for ``gpt_input_list[i]``. Appending
    results in ``as_completed`` order would scramble that alignment, so each
    future is mapped back to the index of its prompt.

    Prompts whose call raised are retried, up to ``MAX_RETRIES`` rounds in
    total, with a pause between rounds. Prompts that never succeed yield ""
    (the same value ``get_answer`` returns on failure), so the result always
    has the same length as the input.

    Args:
        gpt_input_list: Prompts to send.
        model: Model name forwarded to ``get_answer``.

    Returns:
        List of responses aligned with ``gpt_input_list``.
    """
    MAX_RETRIES = 3
    RETRY_DELAY_SECONDS = 10
    # Unique sentinel: a legitimate response may itself be None or "".
    missing = object()
    answers = [missing] * len(gpt_input_list)

    for attempt in range(MAX_RETRIES):
        pending = [i for i, value in enumerate(answers) if value is missing]
        if not pending:
            break
        if attempt > 0:
            time.sleep(RETRY_DELAY_SECONDS)
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            index_of = {
                executor.submit(get_answer, gpt_input_list[i], model): i
                for i in pending
            }
            for future in tqdm(concurrent.futures.as_completed(index_of), total=len(index_of)):
                try:
                    answers[index_of[future]] = future.result()
                except Exception as e:
                    # Leave the slot marked missing so the next round retries it.
                    logger.error(f"Error during GPT calls: {e}")

    return ["" if value is missing else value for value in answers]

In [30]:
def get_instances(idx_list, data, CoT, tuid):
    """Build one prompt per uid in ``idx_list``.

    Each prompt is the worked example ``CoT`` followed by the instance's full
    instruction text (all steps), under "### Example:" and "### Task:" headers.
    """
    task_instances = data[tuid]["instance"]
    return [
        "### Example:\n\n"
        f"{CoT}\n\n"
        "### Task:\n\n"
        f"{reconstruct_instruction(task_instances[uid], -1)}"
        for uid in idx_list
    ]

### Similarity Functions

In [31]:
def similarity(list1, list2):
    """Return the fraction of positions at which the two lists agree.

    Two empty lists are considered fully similar (1.0), matching the
    convention ``similarity_ef`` uses when there is nothing to compare.
    Dividing by the length would otherwise raise ZeroDivisionError.

    Raises:
        ValueError: If the lists differ in length.
    """
    if len(list1) != len(list2):
        raise ValueError("Both lists must have the same length.")
    if not len(list1):
        return 1.0
    return sum(a == b for a, b in zip(list1, list2)) / len(list1)

In [32]:
def similarity_ef(list1, list2):
    """Agreement rate restricted to positions where either value is truthy.

    Positions where both values are falsy are ignored. If no position
    qualifies, the result is 1.0.

    Raises:
        ValueError: If the lists differ in length.
    """
    if len(list1) != len(list2):
        raise ValueError("Both lists must have the same length.")
    considered = 0
    agreeing = 0
    for left, right in zip(list1, list2):
        if not (left or right):
            continue
        considered += 1
        agreeing += left == right
    if considered == 0:
        return 1.0
    return agreeing / considered

### Additional Functions

In [33]:
def reconstruct_batch_instruction(data):
    """Pair up consecutive instances and build two-instruction prompts.

    Keys are paired in order: (1st, 2nd), (3rd, 4th), and so on. With an odd
    number of keys the last one is left out. For each pair and each step in
    1..3, the single-step instructions of both instances are combined into one
    prompt.

    Returns:
        ``(instruction_lists, paired_keys)`` where ``instruction_lists[step]``
        holds one prompt per pair, in the same order as ``paired_keys``.
    """
    all_keys = list(data.keys())
    paired_keys = list(zip(all_keys[0::2], all_keys[1::2]))

    steps = (1, 2, 3)
    instruction_lists = {step: [] for step in steps}
    for uid_0, uid_1 in paired_keys:
        for step in steps:
            first = reconstruct_instruction(data[uid_0], step, full=False)
            second = reconstruct_instruction(data[uid_1], step, full=False)
            # NOTE(review): the second instruction's <task1> tags are renamed to
            # <task{step}>, which is a no-op for step 1. If the intent is to
            # distinguish the second answer from the first, confirm the target
            # tag against how the generations are scored.
            second = second.replace("<task1>", f"<task{step}>")
            second = second.replace("<task1/>", f"<task{step}/>")
            instruction_lists[step].append(
                "Read the following passage, and solve both of the instructions provided.\n"
                f"### Instruction 1: {first}\n\n"
                f"### Instruction 2: {second}"
            )
    return instruction_lists, paired_keys

In [34]:
def get_batch_score(df, data, tuid, return_id=False):
    """Score batched (two-instruction) generations.

    Each row's ``uid`` identifies a pair of instances, either as a tuple or as
    the string form of one (e.g. after a CSV round-trip). The answers parsed
    from ``<task1>`` and ``<task2>`` are compared with the first reference
    answer of the first and second instance respectively.

    Args:
        df: DataFrame with 'uid' and 'generation' columns. Rows containing any
            missing value are ignored.
        data: Benchmark data; instances are read from ``data[tuid]["instance"]``.
        tuid: Task identifier.
        return_id: When True, return the uids answered correctly instead of
            the accuracies.

    Returns:
        ``(accuracy_1, accuracy_2)`` over the usable rows, or the list of
        correctly answered uids when ``return_id`` is True. With no usable
        rows the accuracies are ``(0.0, 0.0)`` rather than a ZeroDivisionError.
    """
    import ast

    df = df.dropna()
    s1_score = 0
    s2_score = 0
    uid_list = []
    for uid_str, generation in df[["uid", "generation"]].values:
        # literal_eval only parses Python literals. The previous eval() would
        # execute arbitrary code stored in the uid column.
        uid = ast.literal_eval(uid_str) if isinstance(uid_str, str) else uid_str
        instances = data[tuid]["instance"]
        p1 = str(norm_text(parse_answer(generation, 1) or ""))
        p2 = str(norm_text(parse_answer(generation, 2) or ""))
        a1 = str(instances[uid[0]]["answers"][0])
        a2 = str(instances[uid[1]]["answers"][0])
        if a1 == p1:
            s1_score += 1
            uid_list.append(uid[0])
        if a2 == p2:
            s2_score += 1
            uid_list.append(uid[1])
    if return_id:
        return uid_list
    n = len(df)
    if n == 0:
        return 0.0, 0.0
    return s1_score / n, s2_score / n

## STI functionality

In [35]:
def save_generations(task_id, model_name, generations, step):
    """
    Saves the generated outputs to a CSV file.

    The file is written to ``data/{model_name}-MTI-{task_id}-s{step}.csv``
    relative to the current working directory. Missing parent directories are
    created first, so the call also works on a fresh checkout and with model
    names that contain "/" (e.g. "org/model"), which map to sub-directories.

    Args:
        task_id: Task identifier used in the file name.
        model_name: Model name used in the file name.
        generations: Records (list of dicts) to store, one row each.
        step: Step number used in the file name.
    """
    df = pd.DataFrame(generations)
    filename = f"data/{model_name}-MTI-{task_id}-s{step}.csv"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    df.to_csv(filename, index=False)
    print(f"Saved step {step} generations to {filename}")

In [36]:
def create_prompt(cot_sample, s1, s2, s3, instruction1, instruction2, instruction3, gen1, gen2, step):
    """
    Builds the chat-formatted prompt for one step of a task.

    The prompt starts with the worked example, then lists the instructions up
    to ``step``, each earlier one followed by the answer generated for it
    (``gen1`` after instruction 1, ``gen2`` after instruction 2), and ends with
    an open "### Answer:" header. ``s1``, ``s2`` and ``s3`` are accepted but
    not used.
    """
    parts = [
        f"### Example:\n\n### Instruction: {cot_sample}\n\n",
        "### Task:\n\n",
        f"### (1) Instruction: {instruction1}\n\n",
    ]
    if step >= 2:
        parts.append(f"### Answer:\n\n{gen1}\n\n")
        parts.append(f"### (2) Instruction: {instruction2}\n\n")
    if step == 3:
        parts.append(f"### Answer:\n\n{gen2}\n\n")
        parts.append(f"### (3) Instruction: {instruction3}\n\n")
    # Open header for the model to complete.
    parts.append("### Answer:\n\n")

    user_message = {"role": "user", "content": "".join(parts)}
    return create_prompt_with_tulu_chat_format([user_message])

In [37]:
def _split_previous_generations(previous_generations, step):
    """Return (step-1 records, step-2 records) from either accepted layout."""
    if step < 2:
        return None, None
    if previous_generations is None:
        raise TypeError("previous_generations is required when step >= 2")
    if len(previous_generations) > 0 and not isinstance(previous_generations[0], dict):
        # Nested layout: (step-1 records, step-2 records), as passed for step 3.
        step1_records = previous_generations[0]
        step2_records = previous_generations[1] if len(previous_generations) > 1 else step1_records
        return step1_records, step2_records
    # Flat layout: a single list of per-instance records.
    return previous_generations, previous_generations


def generate_step_outputs(model, tokenizer, task_data, cot_sample, s1, batch_size, step, previous_generations=None, s2=None, s3=None):
    """
    Generates outputs for a specific step.

    Args:
        model, tokenizer: Forwarded to ``generate_completions``.
        task_data: Task dict; its "instance" mapping (uid -> instance) is used.
        cot_sample: Worked example placed at the top of every prompt.
        s1, s2, s3: Forwarded to ``create_prompt``.
        batch_size: Generation batch size.
        step: 1, 2 or 3.
        previous_generations: Required for step >= 2. Either a flat list of
            records (dicts with a 'generation' key) aligned with the instance
            order, or a pair ``(step1_records, step2_records)``. The pair form
            is what step 3 needs. Indexing it as a flat list raised a
            TypeError/IndexError and could not tell step-1 answers from
            step-2 answers.

    Returns:
        One dict per instance with 'uid', 'generation' and 'generation_time'.
    """
    input_list = []
    instances = task_data["instance"]
    step1_records, step2_records = _split_previous_generations(previous_generations, step)

    # For each instance in the task
    for idx, (uid, instance) in enumerate(instances.items()):
        # Reconstruct instructions for the current and previous steps
        instruction1 = reconstruct_instruction(instance, 1, False)
        instruction2 = reconstruct_instruction(instance, 2, False) if step >= 2 else None
        instruction3 = reconstruct_instruction(instance, 3, False) if step == 3 else None

        # Answers generated at earlier steps for this same instance
        gen1 = step1_records[idx]['generation'] if step >= 2 else None
        gen2 = step2_records[idx]['generation'] if step == 3 else None

        # Create the prompt for the current step
        prompt = create_prompt(
            cot_sample, s1, s2, s3,
            instruction1, instruction2, instruction3,
            gen1, gen2, step
        )
        input_list.append(prompt)

    # Generate completions
    # NOTE(review): assumes this project's generate_completions returns a
    # (generation_time, generated_texts) pair. Confirm against eval.utils.
    generation_time, generated_texts = generate_completions(
        model,
        tokenizer,
        input_list,
        batch_size=batch_size,
        stop_id_sequences=None,
        add_special_tokens=True,
        disable_tqdm=False,
        max_new_tokens=2048,
        min_new_tokens=32,
        do_sample=True,
        temperature=0.7,
        top_p=1.0
    )

    # Package the results; every record carries the same generation_time value.
    return [
        {
            "uid": uid,
            "generation": gen_text,
            "generation_time": generation_time
        }
        for uid, gen_text in zip(instances.keys(), generated_texts)
    ]

In [38]:
def main_STI(
    model_name=None,
    batch_size=None,
    argv=None,
    bench_path=r"D:\University of Illinois Chicago\Classes\CS421\GradResearch\MTI-Bench\data\MTI_BENCH.json",
    cot_path=r"D:\University of Illinois Chicago\Classes\CS421\GradResearch\MTI-Bench\data\cot_breakdown.xlsx",
):
    """Run the three-step evaluation over every task of the MTI benchmark.

    For each task, generations are produced for steps 1, 2 and 3 (each later
    step sees the earlier answers) and saved with ``save_generations``.

    Args:
        model_name: Model to load. When this or ``batch_size`` is None, the
            missing value is read from the command line (``--model_name`` /
            ``--batch_size``).
        batch_size: Generation batch size.
        argv: Argument list for the parser; None means ``sys.argv[1:]``. Inside
            a Jupyter kernel ``sys.argv`` holds the kernel's own arguments, so
            pass ``model_name``/``batch_size`` (or ``argv``) explicitly there.
        bench_path: Path of the benchmark JSON file.
        cot_path: Path of the Excel sheet with the per-task CoT breakdown
            (a 'tuid' column followed by three step columns).
    """
    if model_name is None or batch_size is None:
        # Parse only what was not supplied by the caller.
        parser = argparse.ArgumentParser(description="Run the script with a specified model and batch size.")
        parser.add_argument("--model_name", type=str, required=model_name is None,
                            default=model_name, help="Name of the model to load")
        parser.add_argument("--batch_size", type=int, required=batch_size is None,
                            default=batch_size, help="Batch size for generation")
        args = parser.parse_args(argv)
        model_name, batch_size = args.model_name, args.batch_size

    # Load the MTI benchmark data (explicit UTF-8: the platform default may differ)
    with open(bench_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Load the language model and tokenizer
    model, tokenizer = load_hf_lm_and_tokenizer(
        model_name,
        torch_dtype=torch.float16
    )

    # Load the Chain-of-Thought (CoT) breakdown data
    cot_df = pd.read_excel(cot_path)

    print("Starting evaluation")

    # Iterate over each task in the data
    for task_id, task_data in data.items():
        print(f"Processing task {task_id}")

        # Get the sample CoT steps for the current task
        cot_row = cot_df[cot_df["tuid"] == int(task_id)]
        if cot_row.empty:
            print(f"No CoT data found for task {task_id}")
            continue
        # Assumes exactly four columns: tuid followed by the three steps.
        _, s1, s2, s3 = cot_row.values[0]

        # Get the sample CoT from the task data
        cot_sample = task_data.get("sample", "")

        # Generate outputs for each step
        generated_s1 = generate_step_outputs(
            model, tokenizer, task_data, cot_sample, s1, batch_size, step=1
        )
        save_generations(task_id, model_name, generated_s1, step=1)

        generated_s2 = generate_step_outputs(
            model, tokenizer, task_data, cot_sample, s1, batch_size, step=2,
            previous_generations=generated_s1, s2=s2
        )
        save_generations(task_id, model_name, generated_s2, step=2)

        generated_s3 = generate_step_outputs(
            model, tokenizer, task_data, cot_sample, s1, batch_size, step=3,
            previous_generations=(generated_s1, generated_s2), s2=s2, s3=s3
        )
        save_generations(task_id, model_name, generated_s3, step=3)

        # Clean up to free memory
        del generated_s1, generated_s2, generated_s3
        torch.cuda.empty_cache()

In [40]:
# run the main sti function
# NOTE: main_STI() reads --model_name/--batch_size from sys.argv via argparse.
# Inside a Jupyter kernel sys.argv holds the kernel launcher's arguments, so
# the required options are missing and argparse exits with status 2 (this is
# the SystemExit recorded in this cell's output).
main_STI()

usage: ipykernel_launcher.py [-h] --model_name MODEL_NAME --batch_size
                             BATCH_SIZE
ipykernel_launcher.py: error: the following arguments are required: --model_name, --batch_size


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
