In [None]:
import re
import random
import html
import numpy as np 
import signal
from tqdm import tqdm
import json
from lxml import etree
from io import StringIO
from collections import Counter
from concurrent.futures import TimeoutError
from langchain_openai import ChatOpenAI

from nltk.corpus import wordnet as wn
import spacy 
from string import punctuation

import torch
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM,\
    AutoTokenizer
    
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0

spacy_nlp = spacy.load('en_core_web_sm')
random.seed(0)

numbering_list = ['3', '7)', '7.', '4', 'iii.', 'iii-', '8.', '4-', 'v:', 'I:', 'ii.', 'i.', 'V)', 'E)', 'I)', 'III.', 'III)', '2-', '1)', 'v-', 'III', 'I.', 'c)', '1.', 'V-', 'iv)', 'A)', 'v)', 'IV', 'C.', 'ii)', 'I', 'IV.', 'C)', 'II-', '2.', 'III-', 'IV)', 'd)', 'iii', 'i-', 'iii:', 'A.', 'B.', '1', '6)', 'ii', '8)', '3)', 'e)', 'ii-', '5-', 'II)', 'iv-', '2)', 'e.', 'IV:', 'III:', 'i)', '10.', 'V', 'V.', 'v.', 'D)', 'E.', 'iv:', 'B)', 'II', 'ii:', 'V:', 'a.', '5.', 'IV-', '9.', 'D.', '3.', '4:', '2:', 'i', 'II.', '3-', '2', 'c.', 'a)', '3:', '10)', 'd.', 'i:', 'iv.', '1-', '4.', '5', 'iv', 'iii)', 'b.', '1:', 'II:', 'v', '5:', '6.', 'b)', 'I-', '9)', '4)', '5)']

stopwords_list = ['es', 'ing', 'ed', 'include', 'includes', 'also', 'haven', 'are', 'why', 'most', "won't", 'against', 'with', 'needn', 'couldn', 'now', 'mustn', 'who', 'under', 'doing', 'am', 'aren', 'they', "didn't", 'd', 'doesn', 'if', 'he', 'her', "haven't", 'isn', 'own', 'does', 'such', 'until', 'into', 'had', 'again', 'over', "hadn't", "you'll", 't', 'by', 'be', "wasn't", 'so', 'yours', 'both', 'any', 'did', "you've", 'these', 'myself', 'o', 'hasn', "isn't", 'you', 'other', 'shan', 'being', 'yourselves', 'was', 'no', 'm', 'those', 'will', 'its', 'itself', 'have', 'down', 'weren', 'having', 'wouldn', 'herself', "mustn't", 'very', 'do', "should've", 'him', "you'd", 'below', 'just', 'that', 'for', 'which', 'but', 'nor', 'all', 'then', 'i', 'whom', 'it', 'once', 'here', 've', "you're", 'ours', "that'll", 'a', 'won', 'himself', 'where', 'this', 'your', "hasn't", 'same', 'when', 'ourselves', 'because', "needn't", 'theirs', 'from', 'mightn', 'my', 'while', 'yourself', "she's", 'each', "doesn't", 'only', 'at', 's', 'their', "wouldn't", 'shouldn', 'and', 'themselves', 'hers', 'has', 'up', 'ma', 'in', 'll', 'we', 're', 'y', 'of', 'after', 'our', "shan't", 'before', 'wasn', 'can', 'should', 'been', 'through', 'as', 'further', 'during', 'between', 'there', 'me', 'on', 'don', "shouldn't", 'more', 'out', "don't", 'the', "weren't", "aren't", "it's", 'what', 'or', "couldn't", 'hadn', "mightn't", 'his', 'above', 'to', 'how', 'few', 'off', 'them', 'didn', 'ain', 'not', 'she', 'an', 'than', 'too', 'is', 'some', 'were', 'about']

common_title_words_set = {'introduction', 'conclusion', 'section', 'chapter', 'works', 'notes', 'note', 'further', 'see', 'references', 'reference', 'section', 'title', 'conclusion', 'intro', 'introduction', 'executive', 'summary', 'key', 'plot', 'theme'}
stopwords_set = set(stopwords_list + numbering_list)

In [2]:
import os
from getpass import getpass

os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [3]:
def get_tokenizer_with_new_chat_template(tokenizer):
    to_delete = "{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}"
    new_template = tokenizer.get_chat_template().replace(to_delete, "")
    return AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B", chat_template=new_template)

In [97]:
from datasets import load_dataset 
dataset = load_dataset("bespokelabs/Bespoke-Stratos-17k")

In [5]:
global tokenizer, model, tokenizer_nli, model_nli
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
tokenizer = get_tokenizer_with_new_chat_template(tokenizer)
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B", device_map="auto").half().eval()

tokenizer_nli = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7")
model_nli = AutoModelForSequenceClassification.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7").eval().cuda()

llm_openai = ChatOpenAI(
    model="meta-llama/Llama-3.3-70B-Instruct",
    temperature=0,
    max_tokens=15000,
    openai_api_base="https://fmapi.swissai.cscs.ch"
)

# global seed_instructions, generated_eval_functions, filtered_generated_eval_functions, generated_instructions, filtered_generated_instructions, generated_responses, filtered_generated_responses, filtered2_generated_responses
# seed_instructions = [each.strip() for each in open("/share/u/harshraj/CotIF/data/seed_instruction.txt").readlines()]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def timeout_handler(signum, frame):
    raise TimeoutError("Function execution timed out")

In [139]:
llm_openai = ChatOpenAI(
    model="meta-llama/Llama-3.3-70B-Instruct",
    temperature=0,
    max_tokens=15000,
    openai_api_base="https://fmapi.swissai.cscs.ch",
)
llm_openai.invoke([{"role": "user", "content": "Hello!"}])

AIMessage(content='Hello. How can I help you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 37, 'total_tokens': 47, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/Llama-3.3-70B-Instruct', 'system_fingerprint': None, 'id': '09f719d09ca34dba9cd0a6512d3778cf', 'finish_reason': 'stop', 'logprobs': None}, id='run-51c47479-9f3f-4d82-be9b-8f468055cab1-0', usage_metadata={'input_tokens': 37, 'output_tokens': 10, 'total_tokens': 47, 'input_token_details': {}, 'output_token_details': {}})

In [149]:
import requests

response = requests.get("https://fmapi.swissai.cscs.ch")
print(response.headers)

{'Date': 'Sat, 19 Apr 2025 00:30:29 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Content-Length': '38066', 'Connection': 'keep-alive', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains'}


In [27]:
import time
# import openai

# Replace with your actual API key and desired model
# openai.api_key = "YOUR_API_KEY"
# MODEL_NAME = "gpt-3.5-turbo"
llm_openai = ChatOpenAI(
            model="meta-llama/Llama-3.3-70B-Instruct",
            temperature=0,
            max_tokens=15000,
            openai_api_base="https://fmapi.swissai.cscs.ch",
        )
def test_frequency(prompt, n_calls=15, delays=[0, 0.5, 1, 2]):
    """
    Test API behavior under different delays between calls.
    Returns a list of dicts with delay, successes, failures, and times.
    """
    results = []
    for delay in delays:
        successes = 0
        failures = 0
        times = []
        for _ in range(n_calls):
            try:
                start = time.monotonic()
                resp = llm_openai.invoke([{"role": "user", "content": prompt}],)
                elapsed = time.monotonic() - start
                times.append(elapsed)
                successes += 1
            except Exception as e:
                failures += 1
                times.append(None)
            time.sleep(delay)
        results.append({
            "delay_s": delay,
            "successes": successes,
            "failures": failures,
            "times_s": times
        })
    return results

def test_prompt_length(base_prompt, lengths=[18000, 18000, 18000, 18000, 18000, 18000, 18000, 18000, 18000]):
    """
    Test API behavior with increasing prompt lengths.
    Returns a list of dicts with prompt length, status, and time taken.
    """
    results = []
    for length in lengths:
        prompt = "Hello"
        try:
            start = time.monotonic()
            resp = llm_openai.invoke([{"role": "user", "content": prompt}],)
            elapsed = time.monotonic() - start
            status = "success"
        except Exception as e:
            elapsed = None
            status = f"error: {type(e).__name__}"
        results.append({
            "prompt_length": length,
            "status": status,
            "time_s": elapsed
        })
    return results

if __name__ == "__main__":
    base_prompt = "Hello, please respond briefly."
    # print("=== Frequency Test ===")
    # freq_results = test_frequency(base_prompt)
    # for r in freq_results:
    #     print(r)
    print("\n=== Prompt Length Test ===")
    prompt_results = test_prompt_length(base_prompt)
    for r in prompt_results:
        print(r)



=== Prompt Length Test ===
{'prompt_length': 18000, 'status': 'success', 'time_s': 0.737207107944414}
{'prompt_length': 18000, 'status': 'success', 'time_s': 0.4344358800444752}
{'prompt_length': 18000, 'status': 'success', 'time_s': 0.43144977395422757}
{'prompt_length': 18000, 'status': 'success', 'time_s': 0.4359107951167971}
{'prompt_length': 18000, 'status': 'success', 'time_s': 0.43513693287968636}
{'prompt_length': 18000, 'status': 'success', 'time_s': 0.43246272788383067}
{'prompt_length': 18000, 'status': 'success', 'time_s': 0.4315934849437326}
{'prompt_length': 18000, 'status': 'success', 'time_s': 0.4318384879734367}
{'prompt_length': 18000, 'status': 'success', 'time_s': 0.4325141590088606}


In [6]:
import time
import random
import httpx
from langchain.schema import HumanMessage, AIMessage

def call_api(
    llm,
    messages: list[HumanMessage] = None,
    *,
    prompt: str = None,
    max_retries: int = 8,
    backoff_factor: float = 4.0,
    max_sleep: float = 260.0,
    **llm_kwargs
) -> str:

    for attempt in range(1, max_retries + 1):
        try:
            ai_msg: AIMessage = llm(messages=messages, **llm_kwargs)
            return ai_msg.content

        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            headers = e.response.headers

            # Rate limit
            if status == 429:
                raw = headers.get("Retry-After")
                if raw:
                    sleep_t = min(float(raw), max_sleep)
                else:
                    sleep_t = min(backoff_factor * 2 ** (attempt - 1) + random.random(), max_sleep)
                print(f"[429] rate limited – sleeping {sleep_t:.1f}s (try {attempt})")
                time.sleep(sleep_t)
                continue

            # Server errors (including 504)
            if 500 <= status < 600:
                sleep_t = min(backoff_factor * 2 ** (attempt - 1) + random.random(), max_sleep)
                print(f"[{status}] server error – sleeping {sleep_t:.1f}s (try {attempt})")
                time.sleep(sleep_t)
                continue

            # Other HTTP errors are not retriable
            raise

        except (httpx.TransportError, httpx.TimeoutException) as e:
            # network glitch or timeout
            sleep_t = min(backoff_factor * 2 ** (attempt - 1) + random.random(), max_sleep)
            print(f"[Network/Timeout] {e!r} – sleeping {sleep_t:.1f}s (try {attempt})")
            time.sleep(sleep_t)
            continue

        except Exception:
            # anything else, give up
            raise

    raise RuntimeError(f"API call failed after {max_retries} attempts")


In [None]:
import time
class AutoIf:
    def __init__(self):
        # self.tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
        # self.model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B", device_map="auto").half().eval()
        
        self.tokenizer_nli = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7")
        self.model_nli = AutoModelForSequenceClassification.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7").eval().cuda()
        
        # self.llm_openai = ChatOpenAI(
        #     model="meta-llama/Llama-3.3-70B-Instruct",
        #     temperature=0,
        #     max_tokens=15000,
        #     openai_api_base="https://fmapi.swissai.cscs.ch",
        # )
        self.llm_openai = ChatOpenAI(
            model="gpt-4o-mini",
            temperature=0.8,
            max_tokens=15000,
        )

        self.seed_instructions = [each.strip() for each in open("/share/u/harshraj/CotIF/data/seed_instruction.txt").readlines()]
        self.generated_eval_functions = []
        self.filtered_generated_eval_functions = []
        self.generated_instructions = []
        self.filtered_generated_instructions = []
        self.generated_responses = []
        self.filtered_generated_responses = []
        self.filtered2_generated_responses = []
        
    def compile(self, dataset):
        output = []
        for datum in dataset:
            messages = datum["conversations"]
            if len(messages) > 1 and messages[0]["from"] == "user" and messages[1]["from"] == "assistant":
                query, r1_generated_text = messages[0]["value"], messages[1]["value"]
                self.seed_instructions = self.generate_seed(self.seed_instructions)
                self.generated_eval_functions = self.generate_eval_function(self.seed_instructions)
                self.filtered_generated_eval_functions = self.filter_generated_eval_function(self.generated_eval_functions)
                self.generated_instructions = self.generate_instruction(self.filtered_generated_eval_functions)
                self.filtered_generated_instructions = self.filter_generated_instruction(self.generated_instructions)
                self.generated_responses = self.generate_response(query, r1_generated_text, self.filtered_generated_instructions)
                self.filtered_generated_responses = self.filter_generated_response(self.generated_responses)
                self.filtered2_generated_responses = self.filter2_generated_response(self.filtered_generated_responses)
                output.append({"seed_instructions": self.seed_instructions,
                    "generated_eval_functions": self.generated_eval_functions,
                    "filtered_generated_eval_functions": self.filtered_generated_eval_functions,
                    "generated_instructions": self.generated_instructions,
                    "filtered_generated_instructions": self.filtered_generated_instructions,
                    "generated_responses": self.generated_responses,
                    "filtered_generated_responses": self.filtered_generated_responses,
                    "filtered2_generated_responses": self.filtered2_generated_responses})
            # break
        return output
            
    def generate_seed(self, seed_instructions, k=1):
        if k <= 0:
            return self.seed_instructions
        
        augment_instruction_prompt = """You are an expert for writing instructions. Please provide 10 different instructions that meet the following requirements:
- Instructions are about the format but not style of a response
- Whether instructions are followed can be easily evaluate by a Python function
Here are some examples of instructions we need:
{seed_instructions}
Do not generate instructions about writing style, using metaphor, or translation. Here are some examples of instructions we do not need:
- Incorporate a famous historical quote seamlessly into your answer
- Translate your answer into Pig Latin
- Use only words that are also a type of food
- Respond with a metaphor in every sentence
- Write the response as if you are a character from a Shakespearean play
Please generate one instruction per line in your response and start each line with '- '.
"""

        augment_instructions = augment_instruction_prompt.format(seed_instructions='\n'.join(seed_instructions))
        
        # input_ids = self.tokenizer.encode(augment_instructions, return_tensors="pt").cuda()
        # outputs = self.model.generate(input_ids, max_length=1024, do_sample=True, temperature=0.7)
        # generated_text = self.tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
        # generated_text = self.llm_openai.invoke([{"role": "user", 
        #                                             "content": augment_instructions}]).content
        generated_text = call_api(self.llm_openai, [{"role": "user", 
                                                    "content": augment_instructions}])
        new_seeds = [line.strip() for line in generated_text.split('\n') if line.strip()]
        seed_instructions = seed_instructions + new_seeds
        
        random.shuffle(seed_instructions)
        return self.generate_seed(seed_instructions, k - 1)
        
    def generate_eval_function(self, seed_instructions, k=1):
        prompt_template = (
            "You are an expert for writing evaluation functions in Python to evaluate whether a response strictly follows an instruction.\n"
            "Here is the instruction: {instruction}\n"
            "Please write a Python function named `evaluate` to evaluate whether an input string `response` follows this instruction. "
            "If it follows, simply return True, otherwise return False.\n"
            "Please response with a single JSON includes the evaluation function in the key `func`, and a list of three test cases in the key `cases`, "
            "which includes an input in the key `input` and an expected output in the key `output` in (true, false).\n"
            "Here is an example of output JSON format: {{\"func\": JSON_STR(use only \\n instead of \n), \"cases\": [{{\"input\": str, \"output\": str}}]}}."
        )

        for instruction in seed_instructions:
            prompt = prompt_template.format(instruction=instruction)
            self.generated_eval_functions.append({
                "prompt": prompt,
                "instruction": instruction,
                "gpt-answer": []
            })
            for _ in range(k):
                # tokens = encoding.encode(prompt)
                # print("Number of tokens:", len(tokens))
                # generated_text = self.llm_openai.invoke([{"role": "user", "content": prompt}]).content
                generated_text = call_api(self.llm_openai, [{"role": "user", "content": prompt}])
                self.generated_eval_functions[-1]['gpt-answer'].append(generated_text)
        return self.generated_eval_functions
                
    def filter_generated_eval_function(self, generated_eval_functions):
        collect_packages, count = [], 0
        for result in generated_eval_functions:
            res = result['gpt-answer']
            eval_funcs, test_cases = [], []
            for each in res:
                try:
                    json_dict = re.findall(r'```json(.*?)```', each, re.DOTALL)[0].strip().replace("\n", "")
                except Exception as e:
                    count += 1
                    print("Skipping because of index error:", e)
                    continue
                try:
                    res_dict = json.loads(json_dict)
                except Exception as e:
                    count += 1
                    print("skipping because of JSON load error:", e)
                    continue
                func = res_dict['func']
                if '\\n' in func:
                    func = func.replace('\\n', '\n')
                try:
                    exec(func)
                except Exception as e:
                    count += 1
                    print("Error executing eval function:", e)
                    continue
                for line in func.split('\n'):
                    if 'import' in line or 'download' in line or 'requests' in line:
                        collect_packages.append(line)
        print(list(set(collect_packages)))

        for result in tqdm(generated_eval_functions):
            res = result['gpt-answer']
            eval_funcs, test_cases = [], []
            for each in tqdm(res):
                try:
                    json_dict = re.findall(r'```json(.*?)```', each, re.DOTALL)[0].strip().replace("\n", "")
                except Exception as e:
                    print("Skipping because of index error:", e)
                    continue
                try:
                    res_dict = json.loads(json_dict)
                except Exception as e:
                    print("Skipping as JSON load error:", e)
                    continue

                # func rejection and cleaning
                func = res_dict['func'].strip()
                func = '\n'.join([line for line in func.split('\n') if 'download' not in line and 'requests' not in line])
                try:
                    exec(func)
                except Exception as e:
                    print("Error executing eval function:", e)
                    continue
                eval_funcs.append(func)

                for each_case in res_dict['cases']:
                    try:
                        test_cases.append((each_case['input'], each_case['output']))
                    except KeyError:
                        print(each_case)

            eval_funcs = list(set(eval_funcs))
            test_cases = list(map(json.loads, set(map(json.dumps, test_cases))))
            
            # if len(eval_funcs) < 3 or len(test_cases) < 10:
            #     continue
            
            filtered_test_cases = []
            for each in tqdm(test_cases):
                flag = False
                for func in eval_funcs:
                    local_vars = {}
                    try:
                        exec(func, globals(), local_vars)
                    except Exception as e:
                        print("Error executing eval function:", e)
                        continue
                    if 'evaluate' not in local_vars:
                        print("skipping because evaluate not in local_vars")
                        continue
                    eval_func = local_vars['evaluate']
                    try:
                        signal.signal(signal.SIGALRM, timeout_handler)
                        signal.alarm(5)
                        res_val = eval_func(each[0])
                    except Exception as e:
                        print("Error executing eval function:", e)
                        res_val = None
                    finally:
                        signal.alarm(0)
                    if res_val is not None and res_val == each[1]:
                        flag = True
                if flag:
                    filtered_test_cases.append(each)
                else:
                    print("skipping because flag is False")
            scored_funcs = []
            for func in tqdm(eval_funcs):
                local_vars = {}
                try:
                    exec(func, globals(), local_vars)
                except Exception as e:
                    print("Error executing eval function:", e)
                    continue
                if 'evaluate' not in local_vars:
                    print("skipping because evaluate not in local_vars")
                    continue
                eval_func = local_vars['evaluate']
                acc = []
                for inp, out in filtered_test_cases:
                    try:
                        signal.signal(signal.SIGALRM, timeout_handler)
                        signal.alarm(5)
                        res_val = eval_func(inp)
                    except Exception as e:
                        print("Error executing eval function:", e)
                        res_val = None
                    finally:
                        signal.alarm(0)
                    if res_val is None or res_val != out:
                        acc.append(0)
                    else:
                        acc.append(1)
                acc = np.mean(acc) if acc else 0
                scored_funcs.append((func, acc))
            valid_funcs = [each for each in scored_funcs if each[1] >= 0.5]
            if not valid_funcs:
                print("not valid funcs")
                continue
            else:
                print("valid funcs")
            
            self.filtered_generated_eval_functions.append({
                "instruction": result['instruction'],
                "eval_func": valid_funcs,
                "cases": filtered_test_cases
            })
        return self.filtered_generated_eval_functions

    def generate_instruction(self, filtered_generated_eval_functions, k=2):
        count = 0
        filter_count = 0

        for line in tqdm(filtered_generated_eval_functions, desc="Generating back-translated instructions"):
            funcs = line["eval_func"][:3]

            instruction_prompt = f"""You are an expert in converting the Python eval function code into the corresponding instruction text. I will provide the eval function code. Please strictly follow the code to convert it into the corresponding instruction text. Here's an example: 

[["def evaluate(response):\n    return 'e' not in response.lower()", 1.0], ["def evaluate(response):\n    words = response.split()\n    for word in response.split():\n        if 'e' in word.lower():\n            return False\n    return True", 1.0], ["def evaluate(response):\n    return all('e' not in word.lower() for word in response.split())", 1.0]] 

["Answer without using any words that contain the letter 'E'.","Answer with words that do not contain the letter 'E'.","Answer with words that do not contain the letter 'E'."] Please convert the following eval function into instructions stored in a list: 

{funcs}"""
            for _ in range(k):
                # input_ids = self.tokenizer.encode(instruction_prompt, return_tensors="pt").cuda()
                # outputs = self.model.generate(input_ids, max_length=1024, do_sample=True, temperature=0.7)
                # generated_text = self.tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
                # generated_text = self.llm_openai.invoke([{"role": "user", "content": instruction_prompt}]).content
                generated_text= call_api(self.llm_openai, [{"role": "user", "content": instruction_prompt}])
                try:
                    back_instruction = json.loads(generated_text)
                    break
                except Exception:
                    filter_count += 1
                    continue
            line["back_instruction"] = back_instruction
            self.generated_instructions.append(line)
            count += 1
        return self.generated_instructions
            
    def filter_generated_instruction(self, generated_instructions):
        count = 0 
        filter_count = 0

        for line in tqdm(generated_instructions, desc="Filtering back-translated instructions"):
            back_instructions = line["back_instruction"]
            ori_ins = line["instruction"]

            nli_scores = []
            for back_ins in back_instructions[:3]:
                premise = ori_ins
                hypothesis = back_ins
                inputs = self.tokenizer_nli(premise, hypothesis, truncation=True, return_tensors="pt")
                output = self.model_nli(inputs["input_ids"].cuda())
                prediction = torch.softmax(output["logits"][0], -1).tolist()
                label_names = ["entailment", "neutral", "contradiction"]
                prediction_dict = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
                max_label = max(prediction_dict, key=prediction_dict.get)
                nli_scores.append(max_label)

            line["nli_scores"] = nli_scores
            if "contradiction" in nli_scores:
                filter_count += 1
                continue
            else:
                self.filtered_generated_instructions.append(line)
            count += 1
        return self.filtered_generated_instructions
            
    def generate_response(self, query, r1_generated_text, filtered_generated_instructions, k=2):
        for instruction in filtered_generated_instructions:
            prompt = (
                f"Please answer the query strictly following the instruction.\n"
                f"[instruction] {instruction['instruction']}\n"
                f"[Query] {query}"
            )
            responses = []
            for _ in range(k):
                generated_text = call_api(self.llm_openai, [{"role": "user", 
                                                          "content": (f"{r1_generated_text}\n"
                                                                    f"Re-write the above text following: {instruction['instruction']}\n\n"
                                                                    f"Note: Use the same words and sentences but re-arrange them in a way that strictly follows the instruction.\n")}])
                # generated_text = self.llm_openai.invoke([{"role": "user", 
                #                                           "content": (f"{r1_generated_text}\n"
                #                                                     f"Re-write the above text following: {instruction['instruction']}\n\n"
                #                                                     f"Note: Use the same words and sentences but re-arrange them in a way that strictly follows the instruction.\n")}]).content
                responses.append(generated_text)
            self.generated_responses.append({
                "instruction": instruction['instruction'],
                "prompt": prompt,
                "gpt-answer": responses,
                "eval_func": instruction["eval_func"],
            })
        return self.generated_responses
              
    def filter_generated_response(self, generated_responses):
        filtered_samples = []
        for result in tqdm(generated_responses, desc="Filtering back translated responses"):
            eval_funcs = []
            for func, score in result['eval_func']:
                local_vars = {}
                try:
                    exec(func, globals(), local_vars)
                except Exception as e:
                    print("Error executing eval function:", e)
                    continue
                if 'evaluate' in local_vars:
                    eval_funcs.append(local_vars['evaluate'])
            
            filter_responses = []
            for response in result['gpt-answer']:
                acc = []
                for eval_func in eval_funcs:
                    try:
                        signal.signal(signal.SIGALRM, timeout_handler)
                        signal.alarm(5)
                        res = eval_func(response)
                    except Exception as e:
                        print(e)
                        res = None
                    finally:
                        signal.alarm(0)
                    if res is not None:
                        try:
                            acc.append(int(res))
                        except Exception:
                            continue
                acc = np.mean(acc) if acc else 0
                if acc > 0:
                    filter_responses.append(response)
            
            for each in filter_responses:
                try:
                    query_match = re.findall(r'\[Query\](.*)$', result['prompt'], re.DOTALL)
                    query = query_match[0].strip() if query_match else ""
                    filtered_samples.append({
                        'instruction': result['instruction'],
                        'query': query,
                        'response': each,
                        'prompt': result['prompt'],
                        "eval_func": result['eval_func'],
                    })
                except IndexError:
                    print("Prompt extraction error:", result['prompt'])
        
        self.filtered_generated_responses = list(map(json.loads, set(map(json.dumps, filtered_samples))))
        return self.filtered_generated_responses
        
    def filter2_generated_response(self, filtered_generated_responses, k=2): 
        prompt_template = (
            "You are an expert that is good at judging whether a response is following the instruction and query.\n"
            "[Instruction] {instruction}\n"
            "[Query] {query}\n"
            "[Response] {response}\n"
            "Please notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\n"
            "You need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\n"
            "Scoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\n"
            "Please only provide a score in the format `Score: {{score}}` without any other contents at the last line."
        )
        for each in filtered_generated_responses:
            each['gen'] = []
            for _ in range(k):
                # input_ids = self.tokenizer.encode(each['prompt'], return_tensors="pt").cuda()
                # outputs = self.model.generate(input_ids, max_length=1024, do_sample=True, temperature=0.7)
                # generated_text = self.tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
                generated_text = call_api(self.llm_openai, [{"role": "user", "content": prompt_template.format(
                                                                                            instruction=each['instruction'],
                                                                                            query=each['query'],
                                                                                            response=each['response']
                                                                                        )}])
                # generated_text = self.llm_openai.invoke([{"role": "user", "content": each['prompt']}])
                each['gen'].append(generated_text)
            
            scores = []
            for gen in each['gen']:
                score = re.findall(r'Score: (\d+?)$', gen)
                if score:
                    scores.append(int(score[0]))
            score = np.mean(scores) if scores else 0
            if score > 5: # quality score
                self.filtered2_generated_responses.append(each)
        return self.filtered2_generated_responses

In [41]:
x[0]["filtered_generated_response"]

[{'instruction': 'Include at least three rhyming pairs',
  'query': 'Return your final response within \\boxed{}. The operation $\\otimes$ is defined for all nonzero numbers by $a\\otimes b =\\frac{a^{2}}{b}$. Determine $[(1\\otimes 2)\\otimes 3]-[1\\otimes (2\\otimes 3)]$.\n$\\text{(A)}\\ -\\frac{2}{3}\\qquad\\text{(B)}\\ -\\frac{1}{4}\\qquad\\text{(C)}\\ 0\\qquad\\text{(D)}\\ \\frac{1}{4}\\qquad\\text{(E)}\\ \\frac{2}{3}$',
  'response': 'To find the value of \\([(1 \\otimes 2) \\otimes 3] - [1 \\otimes (2 \\otimes 3)]\\), we start with the operation defined as \\(a \\otimes b = \\frac{a^2}{b}\\). \n\nFirst, we compute \\(1 \\otimes 2\\):\n\\[\n1 \\otimes 2 = \\frac{1^2}{2} = \\frac{1}{2}\n\\]\nNext, we take this result to compute \\((1 \\otimes 2) \\otimes 3\\):\n\\[\n\\left(\\frac{1}{2}\\right) \\otimes 3 = \\frac{\\left(\\frac{1}{2}\\right)^2}{3} = \\frac{\\frac{1}{4}}{3} = \\frac{1}{12}\n\\]\n\nNow, we compute \\(2 \\otimes 3\\):\n\\[\n2 \\otimes 3 = \\frac{2^2}{3} = \\frac{4}{3}

In [99]:
autoif = AutoIf()

In [100]:
dataset = dataset['train'].select(range(2))

In [115]:
print(x[0]["filtered2_generated_responses"][0]["prompt"])

Please answer the query strictly following the instruction.
[instruction] Incorporate a famous movie quote seamlessly into your answer
[Query] Return your final response within \boxed{}. Doug constructs a square window using $8$ equal-size panes of glass, as shown. The ratio of the height to width for each pane is $5 : 2$, and the borders around and between the panes are $2$ inches wide. In inches, what is the side length of the square window?
[asy] fill((0,0)--(2,0)--(2,26)--(0,26)--cycle,gray); fill((6,0)--(8,0)--(8,26)--(6,26)--cycle,gray); fill((12,0)--(14,0)--(14,26)--(12,26)--cycle,gray); fill((18,0)--(20,0)--(20,26)--(18,26)--cycle,gray); fill((24,0)--(26,0)--(26,26)--(24,26)--cycle,gray); fill((0,0)--(26,0)--(26,2)--(0,2)--cycle,gray); fill((0,12)--(26,12)--(26,14)--(0,14)--cycle,gray); fill((0,24)--(26,24)--(26,26)--(0,26)--cycle,gray); [/asy]
$\textbf{(A)}\ 26\qquad\textbf{(B)}\ 28\qquad\textbf{(C)}\ 30\qquad\textbf{(D)}\ 32\qquad\textbf{(E)}\ 34$


In [116]:
print(x[1]["filtered2_generated_responses"][0]["response"])

Doug constructs a square window using 8 equal-size panes of glass, with a height to width ratio of 5:2 for each pane. The borders around and between the panes are 2 inches wide. We need to determine the side length of the square window.

1. **Arrangement of Panes**: The panes are arranged in 2 rows and 4 columns. This arrangement ensures the window is square. As Yoda would say, "Do or do not, there is no try," and in this case, we will do the calculations.

2. **Dimensions of Each Pane**: Let the width of each pane be \(2k\) and the height be \(5k\). The ratio of height to width is \(5:2\).

3. **Total Width Calculation**:
   - Left border: 2 inches
   - Right border: 2 inches
   - 4 panes: \(4 \times 2k\)
   - 3 borders between panes: \(3 \times 2\) inches
   - Total width: \(2 + 2 + 4 \times 2k + 3 \times 2 = 10 + 8k\)

4. **Total Height Calculation**:
   - Top border: 2 inches
   - Bottom border: 2 inches
   - 2 panes: \(2 \times 5k\)
   - 1 border between panes: 2 inches
   - Total

In [None]:
x = autoif.compile(dataset)

Error executing eval function: unterminated string literal (detected at line 2) (<string>, line 2)
Error executing eval function: unterminated string literal (detected at line 3) (<string>, line 3)
Error executing eval function: unterminated string literal (detected at line 2) (<string>, line 2)
skipping because of JSON load error: Expecting value: line 1 column 163 (char 162)
Error executing eval function: unterminated string literal (detected at line 4) (<string>, line 4)
Error executing eval function: closing parenthesis '}' does not match opening parenthesis '(' (<string>, line 2)
['    import re']


100%|██████████| 1/1 [00:00<00:00, 4500.33it/s]
100%|██████████| 3/3 [00:00<00:00, 7214.97it/s]
100%|██████████| 1/1 [00:00<00:00, 3603.35it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4419.71it/s]
100%|██████████| 3/3 [00:00<00:00, 11781.75it/s]
100%|██████████| 1/1 [00:00<00:00, 6087.52it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6269.51it/s]
100%|██████████| 3/3 [00:00<00:00, 11224.72it/s]
100%|██████████| 1/1 [00:00<00:00, 5957.82it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5817.34it/s]
100%|██████████| 3/3 [00:00<00:00, 9724.04it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 7943.76it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 11428.62it/s]
100%|██████████| 3/3 [00:00<00:00, 20068.44it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 10255.02it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6000.43it/s]
100%|██████████| 3/3 [00:00<00:00, 7830.06it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6061.13it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6213.78it/s]
100%|██████████| 3/3 [00:00<00:00, 9716.53it/s]
100%|██████████| 1/1 [00:00<00:00, 6034.97it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 8128.50it/s]
100%|██████████| 3/3 [00:00<00:00, 14997.51it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 8456.26it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 8774.69it/s]
100%|██████████| 3/3 [00:00<00:00, 14580.43it/s]
100%|██████████| 1/1 [00:00<00:00, 7653.84it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 8305.55it/s]
100%|██████████| 3/3 [00:00<00:00, 12722.86it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 8240.28it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6297.75it/s]
100%|██████████| 3/3 [00:00<00:00, 8427.94it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6123.07it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 8128.50it/s]
100%|██████████| 3/3 [00:00<00:00, 12396.96it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 7639.90it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5229.81it/s]
100%|██████████| 3/3 [00:00<00:00, 6064.05it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4476.31it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4293.04it/s]
100%|██████████| 3/3 [00:00<00:00, 5877.12it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5949.37it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 3887.21it/s]
100%|██████████| 3/3 [00:00<00:00, 6269.51it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4771.68it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4832.15it/s]
100%|██████████| 3/3 [00:00<00:00, 6978.88it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5171.77it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 8224.13it/s]
100%|██████████| 3/3 [00:00<00:00, 19448.09it/s]
100%|██████████| 1/1 [00:00<00:00, 9238.56it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5035.18it/s]
100%|██████████| 3/3 [00:00<00:00, 7923.75it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5974.79it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 9446.63it/s]
100%|██████████| 3/3 [00:00<00:00, 9439.54it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 8577.31it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 10381.94it/s]
100%|██████████| 5/5 [00:00<00:00, 18444.61it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 9320.68it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 3953.16it/s]
100%|██████████| 5/5 [00:00<00:00, 5887.57it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4006.02it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 3160.74it/s]
100%|██████████| 3/3 [00:00<00:00, 4627.77it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5071.71it/s]
 61%|██████    | 22/36 [00:00<00:00, 206.66it/s]

not valid funcs


100%|██████████| 1/1 [00:00<00:00, 4675.92it/s]
100%|██████████| 3/3 [00:00<00:00, 6584.46it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4563.99it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5841.65it/s]
100%|██████████| 3/3 [00:00<00:00, 12409.18it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 7653.84it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7884.03it/s]
100%|██████████| 3/3 [00:00<00:00, 12839.71it/s]
100%|██████████| 1/1 [00:00<00:00, 7108.99it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 8272.79it/s]
100%|██████████| 3/3 [00:00<00:00, 12087.33it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6061.13it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5915.80it/s]
100%|██████████| 3/3 [00:00<00:00, 8949.44it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5667.98it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 12446.01it/s]


Skipping as JSON load error: Expecting value: line 1 column 163 (char 162)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 6034.97it/s]
100%|██████████| 3/3 [00:00<00:00, 9799.78it/s]
100%|██████████| 1/1 [00:00<00:00, 6141.00it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 8830.11it/s]
100%|██████████| 3/3 [00:00<00:00, 14027.77it/s]
100%|██████████| 1/1 [00:00<00:00, 7489.83it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5932.54it/s]
100%|██████████| 3/3 [00:00<00:00, 7989.15it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 7810.62it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 5468.45it/s]
100%|██████████| 3/3 [00:00<00:00, 6743.25it/s]
100%|██████████| 1/1 [00:00<00:00, 4301.85it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6462.72it/s]
100%|██████████| 3/3 [00:00<00:00, 7345.54it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5229.81it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 8542.37it/s]


Error executing eval function: closing parenthesis '}' does not match opening parenthesis '(' (<string>, line 2)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 6574.14it/s]
100%|██████████| 3/3 [00:00<00:00, 14106.40it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 8224.13it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7371.36it/s]
100%|██████████| 3/3 [00:00<00:00, 10773.04it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 7049.25it/s]
100%|██████████| 36/36 [00:00<00:00, 212.47it/s]


valid funcs


Generating back-translated instructions: 100%|██████████| 32/32 [00:31<00:00,  1.01it/s]
Filtering back-translated instructions: 100%|██████████| 32/32 [00:00<00:00, 62.76it/s]
Filtering back translated responses: 100%|██████████| 24/24 [00:00<00:00, 4867.43it/s]


Error executing eval function: unterminated string literal (detected at line 2) (<string>, line 2)
Error executing eval function: unterminated string literal (detected at line 3) (<string>, line 3)
Error executing eval function: unterminated string literal (detected at line 2) (<string>, line 2)
skipping because of JSON load error: Expecting value: line 1 column 163 (char 162)
Error executing eval function: unterminated string literal (detected at line 4) (<string>, line 4)
Error executing eval function: closing parenthesis '}' does not match opening parenthesis '(' (<string>, line 2)
Error executing eval function: No module named 'emoji'
Error executing eval function: unterminated string literal (detected at line 2) (<string>, line 2)
Error executing eval function: unterminated string literal (detected at line 3) (<string>, line 3)
Error executing eval function: unterminated string literal (detected at line 2) (<string>, line 2)
Skipping because of index error: list index out of range

100%|██████████| 1/1 [00:00<00:00, 4391.94it/s]
100%|██████████| 3/3 [00:00<00:00, 11683.30it/s]
100%|██████████| 1/1 [00:00<00:00, 3765.08it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5197.40it/s]
100%|██████████| 3/3 [00:00<00:00, 16008.79it/s]
100%|██████████| 1/1 [00:00<00:00, 8128.50it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 8305.55it/s]
100%|██████████| 3/3 [00:00<00:00, 14315.03it/s]
100%|██████████| 1/1 [00:00<00:00, 7639.90it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7516.67it/s]
100%|██████████| 3/3 [00:00<00:00, 12288.00it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5817.34it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 8507.72it/s]
100%|██████████| 3/3 [00:00<00:00, 14463.12it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 7570.95it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5785.25it/s]
100%|██████████| 3/3 [00:00<00:00, 8224.13it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6087.52it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6087.52it/s]
100%|██████████| 3/3 [00:00<00:00, 9656.88it/s]
100%|██████████| 1/1 [00:00<00:00, 5841.65it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7463.17it/s]
100%|██████████| 3/3 [00:00<00:00, 15160.13it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6288.31it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6452.78it/s]
100%|██████████| 3/3 [00:00<00:00, 10941.66it/s]
100%|██████████| 1/1 [00:00<00:00, 5817.34it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6944.21it/s]
100%|██████████| 3/3 [00:00<00:00, 11376.95it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 7626.01it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6141.00it/s]
100%|██████████| 3/3 [00:00<00:00, 7269.16it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5461.33it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7781.64it/s]
100%|██████████| 3/3 [00:00<00:00, 10837.99it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 7710.12it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4899.89it/s]
100%|██████████| 3/3 [00:00<00:00, 7507.70it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5533.38it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5440.08it/s]
100%|██████████| 3/3 [00:00<00:00, 7210.84it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5753.50it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4275.54it/s]
100%|██████████| 3/3 [00:00<00:00, 7653.84it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5236.33it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6114.15it/s]
100%|██████████| 3/3 [00:00<00:00, 7298.67it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5833.52it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 9799.78it/s]
100%|██████████| 3/3 [00:00<00:00, 18613.78it/s]
100%|██████████| 1/1 [00:00<00:00, 5777.28it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 3945.72it/s]
100%|██████████| 3/3 [00:00<00:00, 4549.14it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5924.16it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5675.65it/s]
100%|██████████| 3/3 [00:00<00:00, 14364.05it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 8128.50it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 9962.72it/s]
100%|██████████| 5/5 [00:00<00:00, 19134.60it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 9300.01it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 3823.43it/s]
100%|██████████| 5/5 [00:00<00:00, 4391.94it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 3823.43it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4922.89it/s]
100%|██████████| 3/3 [00:00<00:00, 6410.04it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6374.32it/s]
 31%|███       | 22/72 [00:00<00:00, 215.12it/s]

not valid funcs


100%|██████████| 1/1 [00:00<00:00, 5398.07it/s]
100%|██████████| 3/3 [00:00<00:00, 8416.66it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5660.33it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7463.17it/s]
100%|██████████| 3/3 [00:00<00:00, 10951.19it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5769.33it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6204.59it/s]
100%|██████████| 3/3 [00:00<00:00, 9861.22it/s]
100%|██████████| 1/1 [00:00<00:00, 6978.88it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7738.57it/s]
100%|██████████| 3/3 [00:00<00:00, 11554.56it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4899.89it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5629.94it/s]
100%|██████████| 3/3 [00:00<00:00, 6936.56it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6831.11it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7612.17it/s]


Skipping as JSON load error: Expecting value: line 1 column 163 (char 162)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 4148.67it/s]
100%|██████████| 3/3 [00:00<00:00, 7667.83it/s]
100%|██████████| 1/1 [00:00<00:00, 4744.69it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7839.82it/s]
100%|██████████| 3/3 [00:00<00:00, 10875.46it/s]
100%|██████████| 1/1 [00:00<00:00, 6413.31it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5282.50it/s]
100%|██████████| 3/3 [00:00<00:00, 7092.96it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6150.01it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 4739.33it/s]
100%|██████████| 3/3 [00:00<00:00, 7223.26it/s]
100%|██████████| 1/1 [00:00<00:00, 4364.52it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4987.28it/s]
100%|██████████| 3/3 [00:00<00:00, 7302.91it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5127.51it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7626.01it/s]


Error executing eval function: closing parenthesis '}' does not match opening parenthesis '(' (<string>, line 2)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 9404.27it/s]
100%|██████████| 3/3 [00:00<00:00, 15592.21it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 8456.26it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7898.88it/s]
100%|██████████| 3/3 [00:00<00:00, 11781.75it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 8112.77it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7884.03it/s]
100%|██████████| 3/3 [00:00<00:00, 12169.16it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5785.25it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4588.95it/s]
100%|██████████| 3/3 [00:00<00:00, 8536.58it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 3685.68it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 2968.37it/s]
100%|██████████| 3/3 [00:00<00:00, 8701.88it/s]
100%|██████████| 1/1 [00:00<00:00, 5433.04it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6260.16it/s]
100%|██████████| 3/3 [00:00<00:00, 8671.89it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6374.32it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 9686.61it/s]
100%|██████████| 3/3 [00:00<00:00, 16556.46it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 8422.30it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5184.55it/s]
100%|██████████| 3/3 [00:00<00:00, 7157.52it/s]
100%|██████████| 1/1 [00:00<00:00, 4728.64it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4686.37it/s]
100%|██████████| 3/3 [00:00<00:00, 6750.49it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4815.50it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 7025.63it/s]
100%|██████████| 3/3 [00:00<00:00, 11748.75it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 7384.34it/s]
 61%|██████    | 44/72 [00:00<00:00, 215.06it/s]

valid funcs


100%|██████████| 1/1 [00:00<00:00, 7025.63it/s]
100%|██████████| 3/3 [00:00<00:00, 11125.47it/s]
100%|██████████| 1/1 [00:00<00:00, 6523.02it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 1366.22it/s]


Error executing eval function: No module named 'emoji'


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 5210.32it/s]
100%|██████████| 3/3 [00:00<00:00, 6553.60it/s]

skipping because flag is False
skipping because flag is False



100%|██████████| 1/1 [00:00<00:00, 5090.17it/s]

valid funcs



100%|██████████| 1/1 [00:00<00:00, 6842.26it/s]
100%|██████████| 3/3 [00:00<00:00, 9861.22it/s]

skipping because flag is False
skipping because flag is False



100%|██████████| 1/1 [00:00<00:00, 6223.00it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4240.95it/s]
100%|██████████| 3/3 [00:00<00:00, 6925.10it/s]

skipping because flag is False
skipping because flag is False



100%|██████████| 1/1 [00:00<00:00, 4341.93it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4452.55it/s]


Error executing eval function:

100%|██████████| 3/3 [00:00<00:00, 4644.85it/s]


 name 'count_syllables' is not defined
skipping because flag is False
Error executing eval function: name 'count_syllables' is not defined
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4826.59it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4877.10it/s]
100%|██████████| 3/3 [00:00<00:00, 5123.34it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 3336.76it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 3754.97it/s]
100%|██████████| 3/3 [00:00<00:00, 5212.47it/s]

skipping because flag is False
skipping because flag is False



100%|██████████| 1/1 [00:00<00:00, 3701.95it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5178.15it/s]
100%|██████████| 3/3 [00:00<00:00, 11076.51it/s]
100%|██████████| 1/1 [00:00<00:00, 4793.49it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 2896.62it/s]
100%|██████████| 3/3 [00:00<00:00, 5356.71it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 2987.40it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 1599.05it/s]

Skipping because of index error: list index out of range



0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 5146.39it/s]
100%|██████████| 3/3 [00:00<00:00, 4867.66it/s]

skipping because flag is False
skipping because flag is False



100%|██████████| 1/1 [00:00<00:00, 5419.00it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 2227.46it/s]
100%|██████████| 4/4 [00:00<00:00, 2996.47it/s]

skipping because flag is False



100%|██████████| 1/1 [00:00<00:00, 3294.82it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 4471.54it/s]
100%|██████████| 3/3 [00:00<00:00, 5150.60it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5223.29it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 5698.78it/s]
100%|██████████| 3/3 [00:00<00:00, 7061.12it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5035.18it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6678.83it/s]
100%|██████████| 3/3 [00:00<00:00, 9686.61it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6186.29it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6853.44it/s]
100%|██████████| 3/3 [00:00<00:00, 10547.29it/s]
100%|██████████| 1/1 [00:00<00:00, 5991.86it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 12520.31it/s]


Skipping as JSON load error: Expecting value: line 1 column 170 (char 169)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 6647.07it/s]
100%|██████████| 3/3 [00:00<00:00, 9467.95it/s]

skipping because flag is False



100%|██████████| 1/1 [00:00<00:00, 6087.52it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 3072.75it/s]
100%|██████████| 3/3 [00:00<00:00, 3869.28it/s]
100%|██████████| 1/1 [00:00<00:00, 2849.39it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5077.85it/s]
100%|██████████| 3/3 [00:00<00:00, 6978.88it/s]
100%|██████████| 1/1 [00:00<00:00, 4750.06it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6786.90it/s]
100%|██████████| 3/3 [00:00<00:00, 10082.46it/s]
100%|██████████| 1/1 [00:00<00:00, 5957.82it/s]
 92%|█████████▏| 66/72 [00:00<00:00, 200.18it/s]

valid funcs


100%|██████████| 1/1 [00:00<00:00, 4963.67it/s]
100%|██████████| 3/3 [00:00<00:00, 6156.02it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5890.88it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 4534.38it/s]
100%|██████████| 3/3 [00:00<00:00, 7239.88it/s]
100%|██████████| 1/1 [00:00<00:00, 4324.02it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 5146.39it/s]
100%|██████████| 3/3 [00:00<00:00, 7494.29it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5518.82it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 13357.66it/s]


Skipping as JSON load error: Expecting value: line 1 column 164 (char 163)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 1/1 [00:00<00:00, 6775.94it/s]
100%|██████████| 3/3 [00:00<00:00, 10903.74it/s]
100%|██████████| 1/1 [00:00<00:00, 6689.48it/s]


valid funcs


100%|██████████| 1/1 [00:00<00:00, 6452.78it/s]
100%|██████████| 3/3 [00:00<00:00, 8726.01it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6232.25it/s]
100%|██████████| 72/72 [00:00<00:00, 203.94it/s]


valid funcs


Generating back-translated instructions: 100%|██████████| 94/94 [01:34<00:00,  1.00s/it]
Filtering back-translated instructions: 100%|██████████| 126/126 [00:02<00:00, 62.01it/s]
Filtering back translated responses: 100%|██████████| 143/143 [00:00<00:00, 7680.89it/s]


In [94]:
print(x[2]["filtered2_generated_responses"][17])

The response provided does follow the instruction to include at least three rhyming pairs, as it contains several lines that rhyme throughout the explanation. The response also addresses the query by detailing the construction of the square window, the dimensions of the panes, and the calculations needed to determine the side length of the window. 

1. **Rhyming Pairs**: The response includes multiple rhyming pairs, such as "behold" and "told," "right" and "delight," and "fine" and "divine." This fulfills the instruction effectively.
  
2. **Content Relevance**: The response thoroughly explains the problem, breaking down the calculations for the width and height of the window based on the given dimensions and ratios. It correctly concludes that the side length of the square window is 26 inches, which corresponds to option (A).

3. **Clarity and Structure**: The response is well-structured, with clear steps that guide the reader through the problem-solving process. Each step logically f

In [4]:
global seed_instructions
seed_instructions = []
def generate_seed(k=1):
    global seed_instructions
    if k <= 0:
        return seed_instructions
    
    augment_instruction_prompt = """You are an expert for writing instructions. Please provide 10 different instructions that meet the following requirements:
    - Instructions are about the format but not style of a response
    - Whether instructions are followed can be easily evaluate by a Python function
    Here are some examples of instructions we need:
    {seed_instructions}
    Do not generate instructions about writing style, using metaphor, or translation. Here are some examples of instructions we do not need:
    - Incorporate a famous historical quote seamlessly into your answer
    - Translate your answer into Pig Latin
    - Use only words that are also a type of food
    - Respond with a metaphor in every sentence
    - Write the response as if you are a character from a Shakespearean play
    Please generate one instruction per line in your response and start each line with '- '.
    """

    augment_instructions = augment_instruction_prompt.format(seed_instructions='\n'.join(seed_instructions))
    
    # input_ids = self.tokenizer.encode(augment_instructions, return_tensors="pt").cuda()
    # outputs = self.model.generate(input_ids, max_length=1024, do_sample=True, temperature=0.7)
    # generated_text = self.tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
    generated_text = llm_openai.invoke([{"role": "user", 
                                                "content": augment_instructions}]).content
    new_seeds = [line.strip() for line in generated_text.split('\n') if line.strip()]
    seed_instructions = seed_instructions + new_seeds
    
    random.shuffle(seed_instructions)
    return generate_seed(k - 1)

In [195]:
seed_instructions = generate_seed(1)

In [196]:
len(seed_instructions)

10

In [51]:
global seed_instructions
seed_instructions = x[0]['seed_instructions ']

In [52]:
global generated_eval_functions
generated_eval_functions = []
def generate_eval_function(k=1):
    global seed_instructions, generated_eval_functions
    prompt_template = (
        "You are an expert for writing evaluation functions in Python to evaluate whether a response strictly follows an instruction.\n"
        "Here is the instruction: {instruction}\n"
        "Please write a Python function named `evaluate` to evaluate whether an input string `response` follows this instruction. "
        "If it follows, simply return True, otherwise return False.\n"
        "Please response with a single JSON includes the evaluation function in the key `func`, and a list of three test cases in the key `cases`, "
        "which includes an input in the key `input` and an expected output in the key `output` in (true, false).\n"
        "Here is an example of output JSON format: {{\"func\": JSON_STR(use only \\n instead of \n), \"cases\": [{{\"input\": str, \"output\": str}}]}}."
    )

    for instruction in seed_instructions:
        prompt = prompt_template.format(instruction=instruction)
        generated_eval_functions.append({
            "prompt": prompt,
            "instruction": instruction,
            "gpt-answer": []
        })
        for _ in range(k):
            # input_ids = self.tokenizer.encode(prompt, return_tensors="pt").cuda()
            # outputs = self.model.generate(input_ids, max_length=1024, do_sample=True, temperature=0.7)
            # generated_text = self.tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
            generated_text = llm_openai.invoke([{"role": "user", "content": prompt}]).content
            generated_eval_functions[-1]['gpt-answer'].append(generated_text)
        print("len(generated_eval_functions[-1]['gpt-answer']):", len(generated_eval_functions[-1]['gpt-answer']))
    return generated_eval_functions

In [53]:
generated_eval_functions = generate_eval_function(6)

len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1]['gpt-answer']): 6
len(generated_eval_functions[-1

In [54]:
generated_eval_functions[5]['gpt-answer']

['```json\n{\n  "func": "\ndef evaluate(response):\\n\n  words = response.split()\\n\n  for word in words:\\n\n    if not any(letter * 2 in word for letter in set(word)):\\n\n      return False\\n\n  return True",\n  "cases": [\n    {"input": "bookkeeper", "output": true},\n    {"input": "hello world", "output": false},\n    {"input": "bubble doodle", "output": true}\n  ]\n}\n```',
 '```json\n{\n  "func": "\ndef evaluate(response):\\n\n  words = response.split()\\n\n  for word in words:\\n\n    if not any(letter * 2 in word for letter in set(word)):\\n\n      return False\\n\n  return True",\n  "cases": [\n    {"input": "bookkeeper", "output": true},\n    {"input": "hello world", "output": false},\n    {"input": "bubble doodle", "output": true}\n  ]\n}\n```',
 '```json\n{\n  "func": "\ndef evaluate(response):\\n\n  words = response.split()\\n\n  for word in words:\\n\n    if not any(letter * 2 in word for letter in set(word)):\\n\n      return False\\n\n  return True",\n  "cases": [\n 

In [42]:
# with open("/share/u/harshraj/CotIF/data-v2/generated_eval_functions.json", "w") as f:
#     json.dump(generated_eval_functions, f, indent=4)

with open("/share/u/harshraj/CotIF/data-v2/generated_eval_functions.json", "r") as f:
    generated_eval_functions = json.load(f)

In [55]:
generated_eval_functions[0]

{'prompt': 'You are an expert for writing evaluation functions in Python to evaluate whether a response strictly follows an instruction.\nHere is the instruction: Answer with words that begin with the letter \'B\'\nPlease write a Python function named `evaluate` to evaluate whether an input string `response` follows this instruction. If it follows, simply return True, otherwise return False.\nPlease response with a single JSON includes the evaluation function in the key `func`, and a list of three test cases in the key `cases`, which includes an input in the key `input` and an expected output in the key `output` in (true, false).\nHere is an example of output JSON format: {"func": JSON_STR(use only \\n instead of \n), "cases": [{"input": str, "output": str}]}.',
 'instruction': "Answer with words that begin with the letter 'B'",
 'gpt-answer': ['```json\n{\n  "func": "\ndef evaluate(response):\\n\n  words = response.split()\\n\n  for word in words:\\n\n    if not word.startswith(\'B\')

In [56]:
len(generated_eval_functions[0]['gpt-answer'])

6

In [45]:
collect_packages = []
count = 0
for result in generated_eval_functions:
    res = result['gpt-answer']
    eval_funcs, test_cases = [], []
    for each in res:
        try:
            json_dict = re.findall(r'```json(.*?)```', each, re.DOTALL)[0].strip()
        except Exception as e:
            count += 1
            print("Skipping because of index error:", e)
            continue
        try:
            res_dict = eval(json_dict)
        except Exception as e:
            count += 1
            print("skipping because of JSON load error:", e)
            continue
        func = res_dict['func']
        if '\\n' in func:
            func = func.replace('\\n', '\n')
        try:
            exec(func)
        except Exception as e:
            count += 1
            print("Error executing eval function:", e)
            continue
        for line in func.split('\n'):
            if 'import' in line or 'download' in line or 'requests' in line:
                collect_packages.append(line)
print(list(set(collect_packages)))



skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
skipping because of JSON load error: name 'true' is not defined
Error executing eval function: unterminated string literal (detected at line 11) (<string>, line 11)
skipping because of JSON load error: name 'true' is not defined
Err

In [57]:
filtered_generated_eval_functions = []

In [58]:
def timeout_handler(signum, frame):
    raise TimeoutError("Function execution timed out")

In [82]:
import re
import json

def correct_json_string(raw):
    """
    Fixes a JSON-like string where the "func" field contains unescaped newlines
    and quotes. Returns a properly escaped JSON string.
    """
    # Extract the raw func block (between the first "func": " and the next ")
    func_match = re.search(r'"func"\s*:\s*"([\s\S]*?)",\s*"cases"', raw)
    if not func_match:
        raise ValueError("Could not find the func field in the input string.")
    func_body = func_match.group(1)

    # Escape backslashes, quotes, and newlines for JSON embedding
    escaped_func = (
        func_body.replace('\\', '\\\\')
                 .replace('"', '\\"')
                 .replace('\n', '\\n')
    )

    # Extract the cases block
    cases_match = re.search(r'"cases"\s*:\s*(\[\s*{[\s\S]*}\s*])', raw)
    if not cases_match:
        raise ValueError("Could not find the cases field in the input string.")
    cases_body = cases_match.group(1)
    cases = json.loads(cases_body)

    # Build a proper Python dict and dump to JSON
    fixed_dict = {
        "func": escaped_func,
        "cases": cases
    }
    return json.dumps(fixed_dict, indent=2)

def escape_json_func_field(raw_json_str):
    """
    Escapes the value of the "func" field in a JSON-like string so that
    newlines, quotes, and backslashes are properly encoded for json.loads().
    Returns the modified string; you can then call json.loads() on it.
    """
    def replacer(match):
        func_body = match.group(1)
        # Escape backslashes first, then quotes, then convert real newlines to \n
        escaped = (
            func_body.replace('\\', '\\\\')
                     .replace('"', '\\"')
                     .replace('\n', '\\n')
        )
        return f'"func":"{escaped}",'

    # This regex captures everything between the first "func": " and the next ",
    # including literal newlines.
    pattern = r'"func"\s*:\s*"([\s\S]*?)"\s*,'
    fixed = re.sub(pattern, replacer, raw_json_str)

    # Return the string ready for json.loads()
    return fixed

json_dict = escape_json_func_field(json_dict)
json_dict_ = correct_json_string(json_dict)

JSONDecodeError: Invalid \escape: line 2 column 68 (char 69)

In [96]:
json_dict= json_dict.replace("\n", "")
len(generated_eval_functions)

36

In [102]:
filtered_generated_eval_functions =[]

In [103]:
for result in tqdm(generated_eval_functions):
    res = result['gpt-answer']
    eval_funcs, test_cases = [], []
    for each in tqdm(res):
        try:
            json_dict = re.findall(r'```json(.*?)```', each, re.DOTALL)[0].strip().replace("\n", "")
        except Exception as e:
            print("Skipping because of index error:", e)
            continue
        try:
            res_dict = json.loads(json_dict)
        except Exception as e:
            print("Skipping as JSON load error:", e)
            continue

        # func rejection and cleaning
        func = res_dict['func'].strip()
        func = '\n'.join([line for line in func.split('\n') if 'download' not in line and 'requests' not in line])
        try:
            exec(func)
        except Exception as e:
            print("Error executing eval function:", e)
            continue
        eval_funcs.append(func)

        for each_case in res_dict['cases']:
            try:
                test_cases.append((each_case['input'], each_case['output']))
            except KeyError:
                print(each_case)

    eval_funcs = list(set(eval_funcs))
    test_cases = list(map(json.loads, set(map(json.dumps, test_cases))))
    
    # if len(eval_funcs) < 3 or len(test_cases) < 10:
    #     continue
    
    filtered_test_cases = []
    for each in tqdm(test_cases):
        flag = False
        for func in eval_funcs:
            local_vars = {}
            try:
                exec(func, globals(), local_vars)
            except Exception as e:
                print("Error executing eval function:", e)
                continue
            if 'evaluate' not in local_vars:
                print("skipping because evaluate not in local_vars")
                continue
            eval_func = local_vars['evaluate']
            try:
                signal.signal(signal.SIGALRM, timeout_handler)
                signal.alarm(5)
                res_val = eval_func(each[0])
            except Exception as e:
                print("Error executing eval function:", e)
                res_val = None
            finally:
                signal.alarm(0)
            if res_val is not None and res_val == each[1]:
                flag = True
        if flag:
            filtered_test_cases.append(each)
        else:
            print("skipping because flag is False")
    scored_funcs = []
    for func in tqdm(eval_funcs):
        print('-'*100)
        print(func)
        print()
        local_vars = {}
        try:
            exec(func, globals(), local_vars)
        except Exception as e:
            print("Error executing eval function:", e)
            continue
        if 'evaluate' not in local_vars:
            print("skipping because evaluate not in local_vars")
            continue
        eval_func = local_vars['evaluate']
        acc = []
        for inp, out in filtered_test_cases:
            print("inp:", inp)
            print("out:", out)
            try:
                signal.signal(signal.SIGALRM, timeout_handler)
                signal.alarm(5)
                res_val = eval_func(inp)
            except Exception as e:
                print("Error executing eval function:", e)
                res_val = None
            finally:
                signal.alarm(0)
            if res_val is None or res_val != out:
                acc.append(0)
            else:
                acc.append(1)
            print("res_val:", res_val)
        acc = np.mean(acc) if acc else 0
        scored_funcs.append((func, acc))
    valid_funcs = [each for each in scored_funcs if each[1] >= 0.8]
    if not valid_funcs:
        print("not valid funcs")
        continue
    else:
        print("valid funcs")
    
    filtered_generated_eval_functions.append({
        "instruction": result['instruction'],
        "eval_func": valid_funcs,
        "cases": filtered_test_cases
    })

100%|██████████| 6/6 [00:00<00:00, 17873.45it/s]
100%|██████████| 3/3 [00:00<00:00, 15553.66it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 5363.56it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
  words = response.split()
  for word in words:
    if not word.startswith('B'):
      return False
  return True

inp: Amazing beings bounce
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 17949.95it/s]
100%|██████████| 3/3 [00:00<00:00, 7626.01it/s]
100%|██████████| 1/1 [00:00<00:00, 4040.76it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
    return response.endswith(' STOP')

inp: Hello world
out: False
res_val: False
inp: Hello world STOP
out: True
res_val: True
inp: Hello world STOP 
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 8911.41it/s]
100%|██████████| 3/3 [00:00<00:00, 8892.52it/s]
100%|██████████| 1/1 [00:00<00:00, 3318.28it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
    
    def is_palindrome(s):
        return s == s[::-1]
    
    words = response.split()
    for word in words:
        if not is_palindrome(word):
            return False
    return True

inp: madam
out: True
res_val: True
inp: radar level
out: True
res_val: True
inp: hello world
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 2954.08it/s]


Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 6/6 [00:00<00:00, 16644.06it/s]
100%|██████████| 3/3 [00:00<00:00, 11135.32it/s]


Error executing eval function: evaluate() missing 1 required positional argument: 'original'
skipping because flag is False
Error executing eval function: evaluate() missing 1 required positional argument: 'original'
skipping because flag is False
Error executing eval function: evaluate() missing 1 required positional argument: 'original'
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 7667.83it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response, original):
  return response == original[::-1]

not valid funcs


100%|██████████| 6/6 [00:00<00:00, 8481.91it/s]
100%|██████████| 3/3 [00:00<00:00, 7729.06it/s]
100%|██████████| 1/1 [00:00<00:00, 3010.99it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
  words = response.split()
  for word in words:
    if not any(letter * 2 in word for letter in set(word)):
      return False
  return True

inp: bookkeeper
out: True
res_val: True
inp: bubble doodle
out: True
res_val: True
inp: hello world
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 3472.59it/s]
100%|██████████| 3/3 [00:00<00:00, 3671.70it/s]
100%|██████████| 1/1 [00:00<00:00, 2041.02it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):

    # Define a list of onomatopoeia words
    onomatopoeia = ['bang', 'boom', 'pow', 'meow', 'woof', 'moo', 'oink', 'buzz', 'beep', 'chirp', 'tweet', 'splash', 'splosh', 'plop', 'drip', 'drop', 'splat', 'thud', 'thump', 'clunk', 'clink', 'cling', 'clang', 'honk', 'toot', 'vroom', 'zoom', 'zip', 'zap', 'ping', 'pong', 'ding', 'dong', 'ring', 'ting', 'tingling', 'sizzle', 'crackle', 'pop', 'fizz', 'whir', 'whizz', 'whoosh', 'swish', 'swirl', 'gurgle', 'glug', 'bloop', 'bleep', 'blip', 'bong', 'dingdong', 'tinkle', 'jingle', 'chime', 'clangclang', 'rattle', 'clatter', 'clinkclink', 'tap', 'ratatap', 'tapity', 'tip', 'top', 'tot', 'tut', 'tapping', 'beating', 'drumming', 'thumping', 'pounding', 'splatting', 'splashing', 'sploshing', 'plopping', 'dripping', 'dropping', 'sizzling', 'crackling', 'popping', 'fizzling', 'whirring', 'whizzing', 'whooshing', 'swishing', 's

100%|██████████| 6/6 [00:00<00:00, 11140.25it/s]
100%|██████████| 3/3 [00:00<00:00, 10082.46it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4009.85it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
    words = response.split()
    return len(response) == 100 and response.count(' ') == 99 and response.endswith('.')

inp: This is another test sentence that is not exactly one hundred words long
out: False
res_val: False
inp: This is a test sentence that is exactly one hundred words long but does not have proper punctuation
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 20052.45it/s]


Skipping as JSON load error: Invalid \escape: line 1 column 38 (char 37)
Skipping as JSON load error: Invalid \escape: line 1 column 38 (char 37)
Skipping as JSON load error: Invalid \escape: line 1 column 38 (char 37)
Skipping as JSON load error: Invalid \escape: line 1 column 38 (char 37)
Skipping as JSON load error: Invalid \escape: line 1 column 38 (char 37)
Skipping as JSON load error: Invalid \escape: line 1 column 38 (char 37)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 6/6 [00:00<00:00, 29262.59it/s]


Skipping as JSON load error: Invalid \escape: line 1 column 139 (char 138)
Skipping as JSON load error: Invalid \escape: line 1 column 139 (char 138)
Skipping as JSON load error: Invalid \escape: line 1 column 139 (char 138)
Skipping as JSON load error: Invalid \escape: line 1 column 139 (char 138)
Skipping as JSON load error: Invalid \escape: line 1 column 139 (char 138)
Skipping as JSON load error: Invalid \escape: line 1 column 139 (char 138)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 6/6 [00:00<00:00, 28859.89it/s]


Skipping as JSON load error: Invalid \escape: line 1 column 246 (char 245)
Skipping as JSON load error: Invalid \escape: line 1 column 246 (char 245)
Skipping as JSON load error: Invalid \escape: line 1 column 246 (char 245)
Skipping as JSON load error: Invalid \escape: line 1 column 246 (char 245)
Skipping as JSON load error: Invalid \escape: line 1 column 246 (char 245)
Skipping as JSON load error: Invalid \escape: line 1 column 246 (char 245)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 6/6 [00:00<00:00, 12186.84it/s]
100%|██████████| 3/3 [00:00<00:00, 11893.11it/s]
100%|██████████| 1/1 [00:00<00:00, 4860.14it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
      words = response.split()
      for word in words:
        if not word.endswith('ing'):
          return False
      return True

inp: running jumping
out: True
res_val: True
inp: swimming diving flying
out: True
res_val: True
inp: running jump
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 2078.27it/s]
100%|██████████| 5/5 [00:00<00:00, 693.78it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 3/3 [00:00<00:00, 2660.80it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):

    # Define a list of military lingo terms
    military_lingo = ['roger', 'over', 'out', 'bravo', 'charlie', 'delta', 'echo', 'foxtrot', 'golf', 'hotel', 'india', 'juliet', 'kilo', 'lima', 'mike', 'november', 'oscar', 'papa', 'quebec', 'romeo', 'sierra', 'tango', 'uniform', 'victor', 'whiskey', 'x-ray', 'yankee', 'zulu', 'alpha', 'beta', 'sitrep', 'opsec', 'tactical', 'strategic', 'recon', 'intel', 'comms', 'command', 'control', 'logistics', 'supply', 'chain', 'of', 'command', 'hooah', 'oorah', 'semper', 'fi', 'honor', 'country', 'corps', 'battalion', 'company', 'platoon', 'squad', 'fire', 'team', 'safety', 'protocol', 'procedure', 'regulation', 'directive', 'order', 'instruction', 'briefing', 'debriefing', 'mission', 'objective', 'parameter', 'perimeter', 'sector', 'grid', 'coordinate', 'map', 'chart', 'navigation', 'orient', 'elevation', 'azimuth', 'bearing',

100%|██████████| 6/6 [00:00<00:00, 30393.51it/s]


Skipping as JSON load error: Invalid \escape: line 1 column 1609 (char 1608)
Skipping as JSON load error: Invalid \escape: line 1 column 439 (char 438)
Skipping as JSON load error: Invalid \escape: line 1 column 439 (char 438)
Skipping as JSON load error: Invalid \escape: line 1 column 439 (char 438)
Skipping as JSON load error: Invalid \escape: line 1 column 439 (char 438)
Skipping as JSON load error: Invalid \escape: line 1 column 439 (char 438)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 6/6 [00:00<00:00, 18170.27it/s]


Skipping as JSON load error: Invalid \escape: line 1 column 63 (char 62)
Skipping as JSON load error: Invalid \escape: line 1 column 63 (char 62)
Skipping as JSON load error: Invalid \escape: line 1 column 63 (char 62)
Skipping as JSON load error: Invalid \escape: line 1 column 63 (char 62)
Skipping as JSON load error: Invalid \escape: line 1 column 63 (char 62)
Skipping as JSON load error: Invalid \escape: line 1 column 63 (char 62)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 6/6 [00:00<00:00, 8805.40it/s]
100%|██████████| 3/3 [00:00<00:00, 10600.60it/s]
100%|██████████| 1/1 [00:00<00:00, 4355.46it/s]
 44%|████▍     | 16/36 [00:00<00:00, 153.83it/s]

----------------------------------------------------------------------------------------------------
def evaluate(response):
  words = response.split()
  for word in words:
    if len(word) > 3 or word.count('a') + word.count('e') + word.count('i') + word.count('o') + word.count('u') > 1:
      return False
  return True

inp: run pet
out: True
res_val: True
inp: yes no
out: True
res_val: True
inp: hello world
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 21076.90it/s]
100%|██████████| 3/3 [00:00<00:00, 16622.08it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 7781.64it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
    words = response.split()
    return words == sorted(words)

not valid funcs


100%|██████████| 6/6 [00:00<00:00, 23786.22it/s]


Skipping as JSON load error: Expecting value: line 1 column 594 (char 593)
Skipping as JSON load error: Expecting value: line 1 column 594 (char 593)
Skipping as JSON load error: Expecting value: line 1 column 594 (char 593)
Skipping as JSON load error: Expecting value: line 1 column 594 (char 593)
Skipping as JSON load error: Expecting value: line 1 column 594 (char 593)
Skipping as JSON load error: Expecting value: line 1 column 594 (char 593)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 6/6 [00:00<00:00, 8063.38it/s]
100%|██████████| 3/3 [00:00<00:00, 2870.19it/s]
100%|██████████| 1/1 [00:00<00:00, 1572.67it/s]


----------------------------------------------------------------------------------------------------
import nltk
from nltk import pos_tag, word_tokenize

def evaluate(response):
  tokens = word_tokenize(response)
  tagged = pos_tag(tokens)
  for word, tag in tagged:
    if tag in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:
      return False
  return True

inp: I go to the store
out: True
res_val: True
inp: She eats a sandwich
out: True
res_val: True
inp: The big red car drives quickly
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 20593.96it/s]
100%|██████████| 3/3 [00:00<00:00, 17331.83it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4332.96it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
    words = response.split()
    return len(words) == 6

inp: I am
out: False
res_val: False
inp: This is a very long sentence that does not follow the instruction
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 21864.31it/s]


Skipping as JSON load error: Invalid \escape: line 1 column 426 (char 425)
Skipping as JSON load error: Invalid \escape: line 1 column 426 (char 425)
Skipping as JSON load error: Invalid \escape: line 1 column 426 (char 425)
Skipping as JSON load error: Invalid \escape: line 1 column 426 (char 425)
Skipping as JSON load error: Invalid \escape: line 1 column 426 (char 425)
Skipping as JSON load error: Invalid \escape: line 1 column 426 (char 425)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 6/6 [00:00<00:00, 7096.96it/s]
100%|██████████| 3/3 [00:00<00:00, 6808.94it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4934.48it/s]


----------------------------------------------------------------------------------------------------
import re

def evaluate(response):
    # Split the response into lines
    lines = response.split('\n')
    # Define a pattern for an iambic pentameter line
    pattern = re.compile('^(da-DUM )*(da-DUM|da-DUM da)$', re.IGNORECASE)
    # Check each line against the pattern
    for line in lines:
        syllables = line.split()
        # Check if the line has 10 syllables (5 feet)
        if len(syllables) != 10:
            return False
        # Check if the line follows the iambic pattern
        if not pattern.match(line):
            return False
    return True

inp: The sun is shining very brightly today
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 11688.72it/s]
100%|██████████| 3/3 [00:00<00:00, 10373.38it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4341.93it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
      words = response.split()
      for word in words:
        if not word:
          continue
        first_letter = word[0].lower()
        for char in word:
          if char.isalpha() and char.lower() != first_letter:
            return False
      return True

inp: Hello world
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 7609.87it/s]
100%|██████████| 3/3 [00:00<00:00, 7445.51it/s]
100%|██████████| 1/1 [00:00<00:00, 2926.94it/s]


----------------------------------------------------------------------------------------------------
import re

def evaluate(response):
  # Define future tense verbs pattern
  future_tense_pattern = re.compile(r'\b(will|shall|is going to)\b', re.IGNORECASE)
  # Check if any word in the response matches the future tense pattern
  has_future_tense = bool(future_tense_pattern.search(response))
  # Check if the response contains any verb that is not in future tense
  verbs = re.findall(r'\b\w+ing\b|\b\w+ed\b|\b\w+s\b', response, re.IGNORECASE)
  for verb in verbs:
    if not future_tense_pattern.search(verb):
      return False
  # If no non-future tense verb is found and there is at least one future tense verb, return True
  return has_future_tense

inp: I went to the store yesterday
out: False
res_val: False
inp: I am going to the store now
out: False
res_val: False
inp: I will go to the store tomorrow
out: True
res_val: True
valid funcs


100%|██████████| 6/6 [00:00<00:00, 10468.31it/s]
100%|██████████| 3/3 [00:00<00:00, 9258.95it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6808.94it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
    # Convert the response to uppercase to handle case-insensitivity
    response = response.upper()
    # Iterate over each character in the response
    for char in response:
        # Check if the character is an alphabet letter and if it's in the second half of the alphabet (N-Z)
        if char.isalpha() and char > 'M':
            # If it is, return False
            return False
    # If no characters in the second half of the alphabet are found, return True
    return True

not valid funcs


100%|██████████| 6/6 [00:00<00:00, 11052.18it/s]
100%|██████████| 3/3 [00:00<00:00, 10582.77it/s]
100%|██████████| 1/1 [00:00<00:00, 3539.50it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
    sentences = response.split('. ')
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence.endswith('?'):
            return False
    return True

inp: Are you ready? Are you set?
out: True
res_val: True
inp: This is a statement.
out: False
res_val: False
inp: Is this a question? Is that a question?
out: True
res_val: True
valid funcs


100%|██████████| 6/6 [00:00<00:00, 9795.96it/s]
100%|██████████| 3/3 [00:00<00:00, 9568.75it/s]
100%|██████████| 1/1 [00:00<00:00, 3276.80it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
  words = response.split()
  for word in words:
    if not (word[0].lower() == word[-1].lower()) or len(word) == 0:
      return False
  return True

inp: Madam
out: True
res_val: True
inp: Hello
out: False
res_val: False
inp: Dad
out: True
res_val: True
valid funcs


100%|██████████| 6/6 [00:00<00:00, 5990.44it/s]


Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 6/6 [00:00<00:00, 7492.06it/s]
100%|██████████| 3/3 [00:00<00:00, 7800.94it/s]
100%|██████████| 1/1 [00:00<00:00, 3111.50it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):

  # Define a list of color words
  color_words = ['red', 'blue', 'green', 'yellow', 'orange', 'purple', 'pink', 'black', 'white', 'brown', 'gray', 'turquoise', 'silver', 'gold', 'beige', 'ivory', 'cream', 'cyan', 'magenta', 'violet', 'indigo']

  # Split the response into words
  words = response.split()

  # Check if all words are color words
  for word in words:
    if word.lower() not in color_words:
      return False

  return True

inp: red blue green
out: True
res_val: True
inp: Yellow Orange Purple
out: True
res_val: True
inp: hello world
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 9742.87it/s]
100%|██████████| 3/3 [00:00<00:00, 8160.12it/s]


skipping because flag is False
skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 6533.18it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
    # Convert the response to uppercase to handle case-insensitivity
    response = response.upper()
    # Iterate over each character in the response
    for char in response:
        # Check if the character is an alphabet letter and if it's in the first half of the alphabet
        if char.isalpha() and ord(char) < ord('N'):
            # If it is, return False
            return False
    # If no characters in the first half of the alphabet are found, return True
    return True

not valid funcs


100%|██████████| 6/6 [00:00<00:00, 6808.94it/s]
100%|██████████| 3/3 [00:00<00:00, 8112.77it/s]


skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4017.53it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
  sentences = response.split('. ')
  if len(sentences) < 2:
    return False
  word_counts = [len(sentence.split()) for sentence in sentences]
  return all(word_counts[i] > word_counts[i+1] for i in range(len(word_counts)-1))

inp: This is a test sentence. This is a test. This is.
out: True
res_val: True
inp: This is a test sentence. This is a test. This is a test sentence.
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 12282.00it/s]
100%|██████████| 3/3 [00:00<00:00, 8943.08it/s]
100%|██████████| 1/1 [00:00<00:00, 3243.85it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
    lines = response.split('\n')
    if len(lines) < 1:
        return False
    for line in lines:
        if not line.strip().startswith('-') and not line.strip().startswith('*'):
            return False
    return True

inp: item1
item2
out: False
res_val: False
inp: - item1
- item2
out: True
res_val: True
inp: * item1
* item2
out: True
res_val: True
valid funcs


100%|██████████| 6/6 [00:00<00:00, 8707.90it/s]


Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)
Error executing eval function: unexpected indent (<string>, line 2)


0it [00:00, ?it/s]
0it [00:00, ?it/s]


not valid funcs


100%|██████████| 6/6 [00:00<00:00, 10578.32it/s]
100%|██████████| 3/3 [00:00<00:00, 9954.84it/s]


skipping because flag is False
skipping because flag is False


100%|██████████| 1/1 [00:00<00:00, 4588.95it/s]
 94%|█████████▍| 34/36 [00:00<00:00, 165.25it/s]

----------------------------------------------------------------------------------------------------
import re

def evaluate(response):
  # Check if the response contains any non-emoji characters
  if re.search('[^\u2639-\u2696\u26A0-\u26FF\u2700-\u27BF\u1F600-\u1F64F\u1F300-\u1F5FF\u1F680-\u1F6FF\u2600-\u26FF\u2700-\u27BF]', response):
    return False
  # Check if the response is empty
  if not response:
    return False
  return True

inp: 
out: False
res_val: False
valid funcs


100%|██████████| 6/6 [00:00<00:00, 9485.80it/s]
100%|██████████| 3/3 [00:00<00:00, 11115.65it/s]
100%|██████████| 1/1 [00:00<00:00, 3545.48it/s]


----------------------------------------------------------------------------------------------------
def evaluate(response):
  words = response.split()
  for word in words:
    if 'x' not in word.lower():
      return False
  return True

inp: Hello world
out: False
res_val: False
inp: Maximize exuberant excess
out: True
res_val: True
inp: Exact examples exceed expectations
out: True
res_val: True
valid funcs


100%|██████████| 6/6 [00:00<00:00, 12925.44it/s]
100%|██████████| 3/3 [00:00<00:00, 12421.43it/s]
100%|██████████| 1/1 [00:00<00:00, 4481.09it/s]

----------------------------------------------------------------------------------------------------
def evaluate(response):
      words = response.split()
      alphabet = 'abcdefghijklmnopqrstuvwxyz'
      for i, word in enumerate(words):
        if word[0].lower() != alphabet[i % 26]:
          return False
      return True

inp: Able bodied creatures deliberately establish bad goals
out: False
res_val: False
inp: Able bodied creatures deliberately establish fresh
out: True
res_val: True
inp: Able bodied creatures deliberately establish fresh goals
out: True
res_val: True



100%|██████████| 36/36 [00:00<00:00, 163.17it/s]


valid funcs


In [104]:
len(filtered_generated_eval_functions)

22

In [None]:
# global filtered_generated_eval_functions
# filtered_generated_eval_functions = []
# def filter_generated_eval_function():
#     global generated_eval_functions, filtered_generated_eval_functions
#     collect_packages = []
#     for result in generated_eval_functions:
#         res = result['gpt-answer']
#         eval_funcs, test_cases = [], []
#         for each in res:
#             try:
#                 json_dict = re.findall(r'```json(.*?)```', each, re.DOTALL)[0].strip()
#             except Exception as e:
#                 # print("Skipping because of index error:", e)
#                 continue
#             try:
#                 res_dict = json.loads(json_dict)
#             except Exception as e:
#                 # print("skipping because of JSON load error:", e)
#                 continue
#             func = res_dict['func']
#             if '\\n' in func:
#                 func = func.replace('\\n', '\n')
#             try:
#                 exec(func)
#             except Exception as e:
#                 # print("Error executing eval function:", e)
#                 continue
#             for line in func.split('\n'):
#                 if 'import' in line or 'download' in line or 'requests' in line:
#                     collect_packages.append(line)
#     print(list(set(collect_packages)))

#     for result in tqdm(generated_eval_functions):
#         res = result['gpt-answer']
#         eval_funcs, test_cases = [], []
#         for each in tqdm(res):
#             try:
#                 json_dict = re.findall(r'```json(.*?)```', each, re.DOTALL)[0].strip()
#             except Exception as e:
#                 # print("Skipping because of index error:", e)
#                 continue
#             try:
#                 res_dict = json.loads(json_dict)
#             except Exception as e:
#                 # print("Skipping as JSON load error:", e)
#                 continue

#             # func rejection and cleaning
#             func = res_dict['func'].strip()
#             func = '\n'.join([line for line in func.split('\n') if 'download' not in line and 'requests' not in line])
#             try:
#                 exec(func)
#             except Exception as e:
#                 # print("Error executing eval function:", e)
#                 continue
#             eval_funcs.append(func)

#             for each_case in res_dict['cases']:
#                 try:
#                     test_cases.append((each_case['input'], each_case['output']))
#                 except KeyError:
#                     print(each_case)

#         eval_funcs = list(set(eval_funcs))
#         test_cases = list(map(json.loads, set(map(json.dumps, test_cases))))
#         # if len(eval_funcs) < 3 or len(test_cases) < 10:
#         #     continue
#         # print("len(test_cases):", len(test_cases))
#         filtered_test_cases = []
#         for each in tqdm(test_cases):
#             flag = False
#             for func in eval_funcs:
#                 local_vars = {}
#                 try:
#                     exec(func, globals(), local_vars)
#                 except Exception as e:
#                     # print("Error executing eval function:", e)
#                     continue
#                 if 'evaluate' not in local_vars:
#                     # print("skipping because evaluate not in local_vars")
#                     continue
#                 eval_func = local_vars['evaluate']
#                 try:
#                     signal.signal(signal.SIGALRM, timeout_handler)
#                     signal.alarm(5)
#                     res_val = eval_func(each[0])
#                 except Exception:
#                     res_val = None
#                 finally:
#                     signal.alarm(0)
#                 if res_val is not None and res_val == each[1]:
#                     flag = True
#             if flag:
#                 filtered_test_cases.append(each)
#             else:
#                 print("skipping because flag is False")
#         # print("len(eval_funcs):", len(eval_funcs))
#         scored_funcs = []
#         for func in tqdm(eval_funcs):
#             local_vars = {}
#             try:
#                 exec(func, globals(), local_vars)
#             except Exception as e:
#                 # print("Error executing eval function:", e)
#                 continue
#             if 'evaluate' not in local_vars:
#                 # print("skipping because evaluate not in local_vars")
#                 continue
#             eval_func = local_vars['evaluate']
#             acc = []
#             for inp, out in filtered_test_cases:
#                 try:
#                     signal.signal(signal.SIGALRM, timeout_handler)
#                     signal.alarm(5)
#                     res_val = eval_func(inp)
#                 except Exception:
#                     res_val = None
#                 finally:
#                     signal.alarm(0)
#                 if res_val is None or res_val != out:
#                     acc.append(0)
#                 else:
#                     acc.append(1)
#             acc = np.mean(acc) if acc else 0
#             scored_funcs.append((func, acc))
#         valid_funcs = [each for each in scored_funcs if each[1] >= 0.8]
#         if not valid_funcs:
#             print("not valid funcs")
#             continue
#         else:
#             print("valid funcs")
        
#         filtered_generated_eval_functions.append({
#             "instruction": result['instruction'],
#             "eval_func": valid_funcs,
#             "cases": filtered_test_cases
#         })
#     return filtered_generated_eval_functions


In [15]:
filtered_generated_eval_functions

[{'instruction': '- Ensure your response is exactly 100 characters long.',
  'eval_func': [('def evaluate(response):\n    return len(response) == 100',
    np.float64(1.0))],
  'cases': [['This is a test string that is definitely not one hundred characters long.',
    False],
   ['bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb',
    False],
   ['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
    True],
   ['This string is definitely not one hundred characters long.', False],
   ['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
    False],
   ['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
    False],
   ['This response is definitely not one hundred characters long.', False]]},
 {'instruction': '- Respond with a list of exactly five items.',
  'eval_func': [("def evalua

In [16]:
global generated_instructions
generated_instructions = []
def generate_instruction(k=2):
    global filtered_generated_eval_functions, generated_instructions
    count = 0
    filter_count = 0

    for line in tqdm(filtered_generated_eval_functions, desc="Generating back-translated instructions"):
        funcs = line["eval_func"][:3]

        instruction_prompt = f"""You are an expert in converting the Python eval function code into the corresponding instruction text. I will provide the eval function code. Please strictly follow the code to convert it into the corresponding instruction text. Here's an example: 

[["def evaluate(response):\n    return 'e' not in response.lower()", 1.0], ["def evaluate(response):\n    words = response.split()\n    for word in response.split():\n        if 'e' in word.lower():\n            return False\n    return True", 1.0], ["def evaluate(response):\n    return all('e' not in word.lower() for word in response.split())", 1.0]] 

["Answer without using any words that contain the letter 'E'.","Answer with words that do not contain the letter 'E'.","Answer with words that do not contain the letter 'E'."] Please convert the following eval function into instructions stored in a list: 

{funcs}"""
        line["back_instruction"] = []
        for _ in range(k):
            # input_ids = self.tokenizer.encode(instruction_prompt, return_tensors="pt").cuda()
            # outputs = self.model.generate(input_ids, max_length=1024, do_sample=True, temperature=0.7)
            # generated_text = self.tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
            generated_text = llm_openai.invoke([{"role": "user", "content": instruction_prompt}]).content
            try:
                back_instruction = json.loads(generated_text)
                break
            except Exception:
                filter_count += 1
                continue
        line["back_instruction"] = back_instruction
        generated_instructions.append(line)
        count += 1
    return generated_instructions

In [17]:
generated_instructions = generate_instruction()

Generating back-translated instructions: 100%|██████████| 9/9 [00:13<00:00,  1.46s/it]


In [18]:
generated_instructions[0]

{'instruction': '- Ensure your response is exactly 100 characters long.',
 'eval_func': [('def evaluate(response):\n    return len(response) == 100',
   np.float64(1.0))],
 'cases': [['This is a test string that is definitely not one hundred characters long.',
   False],
  ['bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb',
   False],
  ['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
   True],
  ['This string is definitely not one hundred characters long.', False],
  ['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
   False],
  ['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
   False],
  ['This response is definitely not one hundred characters long.', False]],
 'back_instruction': ['Provide a response that is exactly 100 characters long.']}

In [19]:
global filtered_generated_instructions
filtered_generated_instructions = []
def filter_generated_instruction():
    global generated_instructions, filtered_generated_instructions
    count = 0 
    filter_count = 0

    for line in tqdm(generated_instructions, desc="Filtering back-translated instructions"):
        back_instructions = line["back_instruction"]
        ori_ins = line["instruction"]

        nli_scores = []
        for back_ins in back_instructions[:3]:
            premise = ori_ins
            hypothesis = back_ins
            inputs = tokenizer_nli(premise, hypothesis, truncation=True, return_tensors="pt")
            output = model_nli(inputs["input_ids"].cuda())
            prediction = torch.softmax(output["logits"][0], -1).tolist()
            label_names = ["entailment", "neutral", "contradiction"]
            prediction_dict = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
            max_label = max(prediction_dict, key=prediction_dict.get)
            nli_scores.append(max_label)

        line["nli_scores"] = nli_scores
        if "contradiction" in nli_scores:
            filter_count += 1
            continue
        else:
            filtered_generated_instructions.append(line)
        count += 1
    return filtered_generated_instructions

In [20]:
filtered_generated_instructions = filter_generated_instruction()

Filtering back-translated instructions: 100%|██████████| 9/9 [00:01<00:00,  8.88it/s]


In [21]:
filtered_generated_instructions

[{'instruction': '- Ensure your response is exactly 100 characters long.',
  'eval_func': [('def evaluate(response):\n    return len(response) == 100',
    np.float64(1.0))],
  'cases': [['This is a test string that is definitely not one hundred characters long.',
    False],
   ['bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb',
    False],
   ['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
    True],
   ['This string is definitely not one hundred characters long.', False],
   ['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
    False],
   ['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
    False],
   ['This response is definitely not one hundred characters long.', False]],
  'back_instruction': ['Provide a response that is exactly 100 characters long.'],
  'nli_scor

In [None]:
global generated_responses
generated_responses = []
def generate_response(text, k=2):
    global filtered_generated_instructions, generated_responses
    for instruction in filtered_generated_instructions:
        prompt = (
            f"Please answer the query strictly following the instruction.\n"
            f"[instruction] {instruction['instruction']}\n"
            f"[Query] {text}"
        )
        
        responses = []
        for _ in range(k):
            input_ids = tokenizer.encode(prompt, return_tensors="pt").cuda()
            outputs = model.generate(input_ids, max_length=1024, do_sample=True, temperature=0.7)
            r1_generated_text = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True) # using r1
            
            generated_text = llm_openai.invoke([
                    {"role": "system", "content": "Strictly follow user's instructions."},
                    {"role": "user", "content": f"Given below is an answer from an AI Assistant. You have to modify/reformat the answer such that it strictly follows a given instruction.\n"
                                f"--Example--\n"
                                f"Response: "
                                f"Here is the answer from AI Assistant:\n\n"
                                f"{r1_generated_text}\n"
                                f"Here is the instruction:\n"
                                f"{instruction['instruction']}\n\n"
                                f"Do not use any other words or sentences in your response. Just modify the answer from AI Assistant and return it.\n"}
                ]).content
            responses.append(generated_text)
            
            
        
        generated_responses.append({
            "instruction": instruction['instruction'],
            "prompt": prompt,
            "gpt-answer": responses,
            "eval_func": instruction["eval_func"],
        })
    return generated_responses

In [24]:
text = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"

In [25]:
import random
random.shuffle(filtered_generated_instructions)

In [26]:
filtered_generated_instructions[0]

{'instruction': '- Format your response as a valid XML document.',
 'eval_func': [('def evaluate(response):\n    import xml.etree.ElementTree as ET\n    try:\n        ET.fromstring(response)\n        return True\n    except ET.ParseError:\n        return False',
   np.float64(1.0))],
 'cases': [["<note><to>Tove</to><from>Jani</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>",
   True],
  ["<note><to>Tove</to><from>Jani</from><heading>Reminder</heading><body>Don't forget me this weekend!</body>",
   False],
  ['Just a plain text, not XML.', False]],
 'back_instruction': ['Provide a valid XML response.',
  'Submit a response that is well-formed XML.'],
 'nli_scores': ['entailment', 'entailment']}

In [None]:
global filtered_generated_instructions, generated_responses, r1_responses
r1_responses = []
k = 1
for instruction in filtered_generated_instructions:
    # prompt = (f"Please answer the query strictly following the instruction.\n"
    #          f"[instruction] {instruction['instruction']}\n"
    #          f"[Query] {text}")
    prompt = text
    
    responses = []
    for _ in range(k):
        input_ids = tokenizer.encode(prompt, return_tensors="pt").cuda()
        outputs = model.generate(input_ids, max_length=1024, do_sample=True, temperature=0.7)
        r1_generated_text = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True) # using r1
        
        user_prompt = (f"{r1_generated_text}\n"
                      f"Re-write the above text following: {instruction['instruction']}\n\n"
                      f"Note: Use the same words and sentences but re-arrange them in a way that strictly follows the instruction.\n")
        generated_text = llm_openai.invoke([
                {"role": "user", "content": user_prompt}
            ]).content
        r1_responses.append(r1_generated_text)
        responses.append(generated_text)
    #     break
    # break
        
    
    generated_responses.append({
        "instruction": instruction['instruction'],
        "prompt": prompt,
        "r1-answer": r1_responses,
        "gpt-answer": responses,
        "eval_func": instruction["eval_func"],
    })

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

In [52]:
generated_responses

[{'instruction': '- Format your response as a valid XML document.',
  'prompt': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
  'gpt-answer': ['```xml\n<salesReport>\n    <month name="April">\n        <sales>\n            <amount>48</amount>\n            <description>Natalia sold 48 clips in April.</description>\n        </sales>\n    </month>\n    <month name="May">\n        <sales>\n            <amount>24</amount>\n            <description>She sold half as many clips in May as she did in April.</description>\n            <calculation>48 / 2 = 24 clips</calculation>\n        </sales>\n    </month>\n    <totalSales>\n        <amount>72</amount>\n        <description>Add April and May sales: 48 + 24 = 72 clips.</description>\n    </totalSales>\n    <finalAnswer>\n        <text>Natalia sold a total of <value>72</value> clips in April and May.</text>\n    </finalAnswer>\n</sale

In [53]:
# generated_responses = generate_response("Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?")

In [54]:
global generated_responses, filtered_generated_responses
filtered_generated_responses = []
def filter_generated_response():
    global generated_responses, filtered_generated_responses
    filtered_samples = []
    for result in tqdm(generated_responses, desc="Filtering back translated responses"):
        eval_funcs = []
        for func, score in result['eval_func']:
            local_vars = {}
            try:
                exec(func, globals(), local_vars)
            except Exception as e:
                print("Error executing eval function:", e)
                continue
            if 'evaluate' in local_vars:
                eval_funcs.append(local_vars['evaluate'])
        
        filter_responses = []
        for response in result['gpt-answer']:
            acc = []
            for eval_func in eval_funcs:
                try:
                    signal.signal(signal.SIGALRM, timeout_handler)
                    signal.alarm(5)
                    res = eval_func(response)
                except Exception as e:
                    print(e)
                    res = None
                finally:
                    signal.alarm(0)
                if res is not None:
                    try:
                        acc.append(int(res))
                    except Exception:
                        continue
            acc = np.mean(acc) if acc else 0
            if acc > 0:
                filter_responses.append(response)
        
        for each in filter_responses:
            try:
                query_match = re.findall(r'\[Query\](.*)$', result['prompt'], re.DOTALL)
                query = query_match[0].strip() if query_match else ""
                filtered_samples.append({
                    'instruction': result['instruction'],
                    'query': query,
                    'response': each,
                    "eval_func": result['eval_func'],
                })
            except IndexError:
                print("Prompt extraction error:", result['prompt'])
    
    filtered_generated_responses = list(map(json.loads, set(map(json.dumps, filtered_samples))))
    return filtered_generated_responses

global filtered2_generated_responses
filtered2_generated_responses = []
def filter2_generated_response(k=2): 
    global filtered_generated_responses, filtered2_generated_responses
    prompt_template = (
        "You are an expert that is good at judging whether a response is following the instruction and query.\n"
        "[Instruction] {instruction}\n"
        "[Query] {query}\n"
        "[Response] {response}\n"
        "Please notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\n"
        "You need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\n"
        "Scoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\n"
        "Please only provide a score in the format `Score: {{score}}` without any other contents at the last line."
    )
    for each in filtered_generated_responses:
        each['prompt'] = prompt_template.format(
            instruction=each['instruction'],
            query=each['query'],
            response=each['response']
        )
        each['gen'] = []
        for _ in range(k):
            # input_ids = self.tokenizer.encode(each['prompt'], return_tensors="pt").cuda()
            # outputs = self.model.generate(input_ids, max_length=1024, do_sample=True, temperature=0.7)
            # generated_text = self.tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
            generated_text = llm_openai.invoke({"role": "user", "content": each['prompt']}).content
            each['gen'].append(generated_text)
        
        scores = []
        for each in each['gen']:
            score = re.findall(r'Score: (\d+?)$', each)
            if score:
                scores.append(int(score[0]))
        score = np.mean(scores) if scores else 0
        if score > 8: # quality score
            filtered2_generated_responses.append(each)

In [11]:
res.choices[0].message.content

'Pablo Picasso (1881-1973) was a Spanish artist, widely regarded as one of the most influential and versatile artists of the 20th century. He is best known for his pioneering work in Cubism, a style of art that emphasizes geometric forms and fragmented representations of reality.\n\n**Early Life and Training**\n\nPicasso was born in Málaga, Spain, to a family of artists. His father, José Ruiz Blasco, was a painter and art teacher, and his mother, María Picasso y López, was a homemaker. Picasso\'s early life was marked by a strong interest in art, and he began drawing and painting at a young age. He received formal training at the Barcelona Academy of Fine Arts, where he studied from 1895 to 1897.\n\n**Artistic Career**\n\nPicasso\'s artistic career spanned over seven decades and can be divided into several distinct periods:\n\n1. **Blue Period (1901-1904)**: Characterized by somber, blue-toned paintings of poverty, misery, and social injustice.\n2. **Rose Period (1904-1906)**: Marked b

In [8]:
import openai

client = openai.Client(api_key="sk-rc-52dYCY3xghaaETCPTdzR_g", base_url="https://fmapi.swissai.cscs.ch")
res = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[
        {
            "content": "Who is Pablo Picasso?", 
            "role": "user",
        }
    ],
    # stream=True,
)

# for chunk in res:
#     if len(chunk.choices) > 0 and chunk.choices[0].delta.content:
#         print(chunk.choices[0].delta.content, end="", flush=True)

In [55]:
global filtered_generated_responses
filtered_generated_responses = filter_generated_response()
filtered2_generated_responses = filter2_generated_response()

Filtering back translated responses: 100%|██████████| 8/8 [00:00<00:00, 3828.23it/s]


ValueError: Invalid input type <class 'dict'>. Must be a PromptValue, str, or list of BaseMessages.

In [None]:
filtered2_generated_responses[0]

In [None]:
with open("/share/u/harshraj/CotIF/data-v2/filtered2_generated_responses.json", "w") as f:
    json.dump(filtered2_generated_responses, f, indent=4)

In [None]:
from loguru import logger
from collections import defaultdict
from typing import Dict, List, Callable, Tuple, Any, Optional
from functools import lru_cache

import torch
import vllm
import numpy as np
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from langdetect import detect, DetectorFactory, LangDetectException

  from .autonotebook import tqdm as notebook_tqdm


INFO 04-22 17:48:14 [__init__.py:239] Automatically detected platform cuda.


2025-04-22 17:48:22,927	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


: 

In [None]:
from vllm import LLM, SamplingParams

# Initialize vLLM model with tensor parallelism for multi-GPU
model = LLM(
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    tensor_parallel_size=1,
    gpu_memory_utilization=0.9,
    dtype="float16",
    trust_remote_code=True,
)

# For vLLM, we still need the tokenizer from Hugging Face for chat templates
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")

# Handle custom chat template if needed
to_delete = "{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}"
if hasattr(tokenizer, 'get_chat_template') and to_delete in tokenizer.get_chat_template():
    new_template = tokenizer.get_chat_template().replace(to_delete, "")
    tokenizer = AutoTokenizer.from_pretrained(
        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", 
        chat_template=new_template,
        trust_remote_code=True
    )

INFO 04-22 17:48:41 [config.py:689] This model supports multiple tasks: {'reward', 'score', 'generate', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 04-22 17:48:41 [config.py:1901] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 04-22 17:48:44 [core.py:61] Initializing a V1 LLM engine (v0.8.4) with config: model='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_m

In [None]:
from vllm import SamplingParams

# Prepare prompts from chat templates
prompts = []
for messages in batch_messages:
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    prompts.append(prompt)

# Set generation parameters
sampling_params = SamplingParams(
    temperature=0.7,
    max_tokens=max_new_tokens,
    top_p=1.0
)

# Generate completions in batch
outputs = model.generate(prompts, sampling_params)

# Extract generated text
results = []
for i, output in enumerate(outputs):
    full_response = output.outputs[0].text
    results.append(full_response)

In [1]:
import os
import re
import json
import time
import torch
import random
import signal
import logging
import numpy as np
from typing import List, Dict, Any, Optional, Tuple, Union
from tqdm import tqdm
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    AutoModelForSequenceClassification,
    pipeline
)

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

def timeout_handler(signum, frame):
    """Handler for function execution timeout."""
    raise TimeoutError("Function execution timed out")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
from getpass import getpass

os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [None]:
class AutoIf:
    def __init__(
        self,
        llm_model_name: str = "meta-llama/Llama-3-8B-Instruct",
        nli_model_name: str = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7",
        seed_instructions_path: Optional[str] = "/share/u/harshraj/CotIF/data/seed_instruction.txt",
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
        max_length: int = 1024,
        temperature: float = 0.7,
        use_vllm: bool = False,
        tensor_parallel_size: int = 1,
        gpu_memory_utilization: float = 0.9,
    ):
        """
        Initialize the AutoIf class with HuggingFace models.
        
        Args:
            llm_model_name: The HuggingFace model name for the language model
            nli_model_name: The HuggingFace model name for the NLI model
            device: Device to run the models on (cuda or cpu)
            max_length: Maximum length for text generation
            temperature: Temperature for text generation
            seed_instructions_path: Path to a file containing seed instructions
            use_vllm: Whether to use vLLM for faster inference
            tensor_parallel_size: Number of GPUs to use for tensor parallelism with vLLM
            gpu_memory_utilization: Fraction of GPU memory to use (for vLLM)
        """
        logger.info(f"Initializing AutoIf with LLM: {llm_model_name} and NLI: {nli_model_name}")
        
        # Set up LLM model
        self.device = device
        self.max_length = max_length
        self.temperature = temperature
        self.use_vllm = use_vllm
        self.vllm_model = None
        
        # Initialize tokenizer first (needed for both backends)
        logger.info("Loading LLM tokenizer...")
        self.llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
        
        # Handle custom chat template if needed (especially for DeepSeek models)
        to_delete = "{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}"
        if hasattr(self.llm_tokenizer, 'get_chat_template') and to_delete in self.llm_tokenizer.get_chat_template():
            new_template = self.llm_tokenizer.get_chat_template().replace(to_delete, "")
            self.llm_tokenizer = AutoTokenizer.from_pretrained(
                llm_model_name, 
                chat_template=new_template,
                trust_remote_code=True
            )
        
        # Setup according to backend preference
        if use_vllm:
            try:
                from vllm import LLM, SamplingParams
                
                logger.info(f"Loading LLM model with vLLM backend (tensor_parallel_size={tensor_parallel_size})...")
                # Initialize vLLM model with tensor parallelism for multi-GPU
                self.vllm_model = LLM(
                    model=llm_model_name,
                    tensor_parallel_size=tensor_parallel_size,
                    gpu_memory_utilization=gpu_memory_utilization,
                    dtype="float16",
                    trust_remote_code=True,
                )
                logger.info("vLLM model loaded successfully")
                
                # No need for HF pipeline with vLLM
                self.llm_model = None
                self.llm_pipeline = None
                
            except Exception as e:
                logger.warning(f"vLLM initialization failed with error: {str(e)}. Falling back to HuggingFace implementation.")
                self.use_vllm = False
        
        # If vLLM is not used or failed to initialize, use HuggingFace
        if not self.use_vllm:
            logger.info("Loading LLM model with HuggingFace backend...")
            self.llm_model = AutoModelForCausalLM.from_pretrained(
                llm_model_name,
                torch_dtype=torch.float16 if device == "cuda" else torch.float32,
                device_map=device
            )
            
            self.llm_pipeline = pipeline(
                "text-generation",
                model=self.llm_model,
                tokenizer=self.llm_tokenizer,
                # device=device if device != "cuda" else 0,
            )
        
        # Initialize NLI model
        logger.info("Loading NLI model and tokenizer...")
        self.tokenizer_nli = AutoTokenizer.from_pretrained(nli_model_name)
        self.model_nli = AutoModelForSequenceClassification.from_pretrained(nli_model_name)
        self.model_nli.to(device)
        self.model_nli.eval()
        
        # Load seed instructions
        self.seed_instructions = []
        if seed_instructions_path and os.path.exists(seed_instructions_path):
            with open(seed_instructions_path, 'r', encoding='utf-8') as f:
                self.seed_instructions = [line.strip() for line in f.readlines() if line.strip()]
            logger.info(f"Loaded {len(self.seed_instructions)} seed instructions from {seed_instructions_path}")
        
        # Initialize result trackers
        self.reset_results()
    
    def reset_results(self):
        """Reset all result trackers."""
        self.generated_eval_functions = []
        self.filtered_generated_eval_functions = []
        self.generated_instructions = []
        self.filtered_generated_instructions = []
        self.generated_responses = []
        self.filtered_generated_responses = []
        self.filtered2_generated_responses = []
        self.preference_data = []
    
    def call_llm(
        self, 
        prompt: str, 
        max_new_tokens: int = 512, 
        temperature: Optional[float] = None,
        max_retries: int = 3,
        backoff_factor: float = 2.0
    ) -> str:
        """
        Call the language model with retry logic.
        
        Args:
            prompt: The input prompt
            max_new_tokens: Maximum number of new tokens to generate
            temperature: Temperature for sampling, overrides self.temperature if provided
            max_retries: Maximum number of retries on failure
            backoff_factor: Exponential backoff factor
            
        Returns:
            The generated text
        """
        temp = temperature if temperature is not None else self.temperature
        
        for attempt in range(1, max_retries + 1):
            try:
                # Use vLLM if available
                if self.use_vllm and self.vllm_model is not None:
                    from vllm import SamplingParams
                    
                    # Set generation parameters
                    sampling_params = SamplingParams(
                        temperature=temp,
                        max_tokens=max_new_tokens,
                        top_p=0.95,
                    )
                    
                    # Generate text
                    outputs = self.vllm_model.generate(prompt, sampling_params)
                    
                    # Extract and return generated text
                    generated_text = outputs[0].outputs[0].text
                    
                    # Remove prompt from output if it's included
                    if generated_text.startswith(prompt):
                        generated_text = generated_text[len(prompt):]
                    
                    return generated_text.strip()
                
                # Otherwise use HuggingFace
                else:
                    # Generate text
                    outputs = self.llm_pipeline(
                        prompt,
                        max_new_tokens=max_new_tokens,
                        do_sample=(temp > 0),
                        temperature=temp,
                        num_return_sequences=1,
                        pad_token_id=self.llm_tokenizer.eos_token_id
                    )
                    
                    # Extract and return generated text
                    generated_text = outputs[0]["generated_text"]
                    
                    # Remove prompt from output if it's included
                    if generated_text.startswith(prompt):
                        generated_text = generated_text[len(prompt):]
                    
                    return generated_text.strip()
                    
            except Exception as e:
                logger.error(f"Error in LLM call (attempt {attempt}/{max_retries}): {str(e)}")
                if attempt < max_retries:
                    sleep_time = backoff_factor ** (attempt - 1)
                    logger.info(f"Retrying in {sleep_time:.1f} seconds...")
                    time.sleep(sleep_time)
                else:
                    logger.error("Max retries reached, giving up")
                    raise RuntimeError(f"LLM call failed after {max_retries} attempts") from e
        
    def batch_call_llm(
        self, 
        prompts: List[str], 
        max_new_tokens: int = 512, 
        temperature: Optional[float] = None,
        batch_size: int = 4,
        show_progress: bool = True
    ) -> List[str]:
        """
        Call LLM on a batch of prompts.
        
        Args:
            prompts: List of input prompts
            max_new_tokens: Maximum number of new tokens to generate
            temperature: Temperature for sampling, overrides self.temperature if provided
            batch_size: Batch size for processing
            show_progress: Whether to show progress bar
            
        Returns:
            List of generated texts
        """
        if not prompts:
            return []
        
        results = []
        temp = temperature if temperature is not None else self.temperature
        
        # Use vLLM for batched inference if available
        if self.use_vllm and self.vllm_model is not None:
            try:
                from vllm import SamplingParams
                
                # Set generation parameters
                sampling_params = SamplingParams(
                    temperature=temp,
                    max_tokens=max_new_tokens,
                    top_p=0.95,
                )
                
                # Process in batches to avoid OOM
                iterator = range(0, len(prompts), batch_size)
                if show_progress:
                    iterator = tqdm(iterator, desc="vLLM batch processing", total=(len(prompts) + batch_size - 1) // batch_size)
                
                for i in iterator:
                    batch_prompts = prompts[i:i+batch_size]
                    
                    # Generate text for batch
                    outputs = self.vllm_model.generate(batch_prompts, sampling_params)
                    
                    # Extract generated texts
                    batch_results = []
                    for j, output in enumerate(outputs):
                        generated_text = output.outputs[0].text
                        prompt = batch_prompts[j] if j < len(batch_prompts) else ""
                        
                        # Remove prompt from output if it's included
                        if prompt and generated_text.startswith(prompt):
                            generated_text = generated_text[len(prompt):]
                        
                        batch_results.append(generated_text.strip())
                    
                    results.extend(batch_results)
                    
            except Exception as e:
                logger.error(f"Error in vLLM batch processing: {str(e)}. Falling back to individual processing.")
                # Fall back to individual processing
                for prompt in prompts:
                    try:
                        result = self.call_llm(prompt, max_new_tokens, temperature)
                        results.append(result)
                    except Exception as sub_e:
                        logger.error(f"Error in individual prompt processing: {str(sub_e)}")
                        results.append("")  # Add empty string on error
                
        else:
            # Use HuggingFace pipeline for batched inference
            # Process in smaller batches to avoid OOM
            iterator = range(0, len(prompts), batch_size)
            if show_progress:
                iterator = tqdm(iterator, desc="HF batch processing", total=(len(prompts) + batch_size - 1) // batch_size)
            
            for i in iterator:
                batch_prompts = prompts[i:i+batch_size]
                try:
                    # Generate text for batch
                    outputs = self.llm_pipeline(
                        batch_prompts,
                        max_new_tokens=max_new_tokens,
                        do_sample=(temp > 0),
                        temperature=temp,
                        num_return_sequences=1,
                        pad_token_id=self.llm_tokenizer.eos_token_id
                    )
                    
                    # Extract generated texts
                    batch_results = []
                    for prompt, output in zip(batch_prompts, outputs):
                        generated_text = output[0]["generated_text"]
                        # Remove prompt from output if it's included
                        if generated_text.startswith(prompt):
                            generated_text = generated_text[len(prompt):]
                        batch_results.append(generated_text.strip())
                    
                    results.extend(batch_results)
                    
                except Exception as e:
                    # On error, fall back to individual processing
                    logger.error(f"Error in HF batch processing: {str(e)}. Falling back to individual processing.")
                    for prompt in batch_prompts:
                        try:
                            result = self.call_llm(prompt, max_new_tokens, temperature)
                            results.append(result)
                        except Exception as sub_e:
                            logger.error(f"Error in individual prompt processing: {str(sub_e)}")
                            results.append("")  # Add empty string on error
        
        return results
    
    def compile(
        self, 
        dataset, 
        batch_size: int = 8, 
        k_eval: int = 1, 
        k_instruction: int = 2, 
        k_response: int = 2, 
        k_filter: int = 2,
        max_samples: int = 0,
        show_progress: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Process a dataset with batch processing capability.
        
        Args:
            dataset: The dataset to process
            batch_size: Batch size for processing
            k_eval: Number of evaluation function generations per instruction
            k_instruction: Number of attempts for instruction generation
            k_response: Number of response generations per instruction
            k_filter: Number of filter attempts
            max_samples: Maximum number of samples to process (0 for all)
            show_progress: Whether to show progress bars
            
        Returns:
            List of processed results
        """
        logger.info(f"Starting compile with batch_size={batch_size}, max_samples={max_samples if max_samples > 0 else 'all'}")
        
        # Reset results for this run
        self.reset_results()
        
        # Limit number of samples if specified
        data_size = len(dataset)
        if max_samples > 0 and max_samples < data_size:
            logger.info(f"Limiting processing to {max_samples} samples from {data_size} total")
            processing_dataset = dataset.select(range(args.max_samples))
        else:
            processing_dataset = dataset
        # logger.info(f"processing_dataset: {processing_dataset}")
        
        # Generate seed instructions
        logger.info("Generating seed instructions...")
        self.seed_instructions = self.generate_seed(self.seed_instructions)
        
        # Generate and filter evaluation functions
        logger.info("Generating evaluation functions...")
        self.generated_eval_functions = self.generate_eval_function(
            self.seed_instructions, k=k_eval, batch_size=batch_size, show_progress=show_progress
        )
        
        logger.info("Filtering evaluation functions...")
        self.filtered_generated_eval_functions = self.filter_generated_eval_function(
            self.generated_eval_functions, show_progress=show_progress
        )
        
        # Generate and filter instructions
        logger.info("Generating instructions...")
        self.generated_instructions = self.generate_instruction(
            self.filtered_generated_eval_functions, k=k_instruction, batch_size=batch_size, show_progress=show_progress
        )
        
        logger.info("Filtering instructions...")
        self.filtered_generated_instructions = self.filter_generated_instruction(
            self.generated_instructions, batch_size=batch_size, show_progress=show_progress
        )
        
        # Process each data point in the dataset
        output = []
        iterator = processing_dataset
        if show_progress:
            iterator = tqdm(iterator, desc="Processing dataset", total=len(processing_dataset))
        
        for batch_start in range(0, len(processing_dataset), batch_size):
            batch_end = min(batch_start + batch_size, len(processing_dataset))
            # current_batch = processing_dataset[batch_start:batch_end]
            current_batch = processing_dataset.select(range(batch_start, batch_end))
            
            batch_results = []
            for datum in current_batch:
                try:
                    # Extract query and generated text
                    messages = datum.get("conversations", [])
                    if len(messages) > 1 and messages[0].get("from") == "user" and messages[1].get("from") == "assistant":
                        query, r1_generated_text = messages[0].get("value", ""), messages[1].get("value", "")
                        
                        # Skip if query or response is empty
                        if not query or not r1_generated_text:
                            logger.warning("Skipping datum with empty query or response")
                            continue
                        
                        # Generate and filter responses
                        self.generated_responses = self.generate_response(
                            query, r1_generated_text, self.filtered_generated_instructions, 
                            k=k_response, batch_size=batch_size, show_progress=False
                        )
                        
                        self.filtered_generated_responses = self.filter_generated_response(
                            self.generated_responses, show_progress=False
                        )
                        
                        self.filtered2_generated_responses = self.filter2_generated_response(
                            self.filtered_generated_responses, k=k_filter, batch_size=batch_size, show_progress=False
                        )
                        
                        # Collect results
                        batch_results.append({
                            "query": query,
                            "original_response": r1_generated_text,
                            "seed_instructions": self.seed_instructions,
                            "generated_eval_functions": self.generated_eval_functions,
                            "filtered_generated_eval_functions": self.filtered_generated_eval_functions,
                            "generated_instructions": self.generated_instructions,
                            "filtered_generated_instructions": self.filtered_generated_instructions,
                            "generated_responses": self.generated_responses,
                            "filtered_generated_responses": self.filtered_generated_responses,
                            "filtered2_generated_responses": self.filtered2_generated_responses
                        })
                    else:
                        logger.warning("Skipping datum with invalid message format")
                
                except Exception as e:
                    logger.error(f"Error processing datum: {str(e)}", exc_info=True)
            
            output.extend(batch_results)
        
        logger.info(f"Processed {len(output)} samples successfully")
        return self.preference_data, output
    
    def generate_seed(self, seed_instructions, k=1, batch_size=8, show_progress=True):
        """Generate seed instructions."""
        if k <= 0:
            return seed_instructions
        
        augment_instruction_prompt = """You are an expert for writing instructions. Please provide 10 different instructions that meet the following requirements:
- Instructions are about the format but not style of a response
- Whether instructions are followed can be easily evaluate by a Python function
Here are some examples of instructions we need:
{seed_instructions}
Do not generate instructions about writing style, using metaphor, or translation. Here are some examples of instructions we do not need:
- Incorporate a famous historical quote seamlessly into your answer
- Translate your answer into Pig Latin
- Use only words that are also a type of food
- Respond with a metaphor in every sentence
- Write the response as if you are a character from a Shakespearean play
Please generate one instruction per line in your response and start each line with '- '.
"""

        augment_instructions = augment_instruction_prompt.format(seed_instructions='\n'.join(seed_instructions))
        
        # Generate new instructions
        generated_text = self.call_llm(augment_instructions, max_new_tokens=2048)
        
        # Parse new seeds
        new_seeds = []
        for line in generated_text.split('\n'):
            line = line.strip()
            if line.startswith('- '):
                seed = line[2:].strip()
                if seed and seed not in seed_instructions and seed not in new_seeds:
                    new_seeds.append(seed)
        
        # Combine and shuffle
        combined_seeds = seed_instructions + new_seeds
        random.shuffle(combined_seeds)
        
        # Recursive generation if needed
        return self.generate_seed(combined_seeds, k-1, batch_size, show_progress)
    
    def generate_eval_function(self, seed_instructions, k=1, batch_size=8, show_progress=True):
        """Generate evaluation functions for seed instructions."""
        prompt_template = (
            "You are an expert for writing evaluation functions in Python to evaluate whether a response strictly follows an instruction.\n"
            "Here is the instruction: {instruction}\n"
            "Please write a Python function named `evaluate` to evaluate whether an input string `response` follows this instruction. "
            "If it follows, simply return True, otherwise return False.\n"
            "Please response with a single JSON includes the evaluation function in the key `func`, and a list of three test cases in the key `cases`, "
            "which includes an input in the key `input` and an expected output in the key `output` in (true, false).\n"
            "Here is an example of output JSON format: {{\"func\": JSON_STR(use only \\n instead of \n), \"cases\": [{{\"input\": str, \"output\": str}}]}}."
        )

        generated_eval_functions = []
        
        # If there are too many instructions, limit to a reasonable number
        if len(seed_instructions) > 50:
            logger.warning(f"Too many seed instructions ({len(seed_instructions)}), limiting to 50")
            selected_instructions = random.sample(seed_instructions, 50)
        else:
            selected_instructions = seed_instructions
        
        # Build all prompts first
        all_prompts = []
        for instruction in selected_instructions:
            for _ in range(k):
                all_prompts.append(prompt_template.format(instruction=instruction))
        
        # Process prompts in batches
        logger.info(f"Generating eval functions for {len(selected_instructions)} instructions with k={k}")
        all_responses = self.batch_call_llm(all_prompts, max_new_tokens=4096, batch_size=batch_size, show_progress=show_progress)
        
        # Process responses
        idx = 0
        for instruction in selected_instructions:
            eval_function_entry = {
                "prompt": prompt_template.format(instruction=instruction),
                "instruction": instruction,
                "gpt-answer": []
            }
            
            for _ in range(k):
                if idx < len(all_responses):
                    eval_function_entry["gpt-answer"].append(all_responses[idx])
                    idx += 1
            
            generated_eval_functions.append(eval_function_entry)
        
        return generated_eval_functions
    
    def filter_generated_eval_function(self, generated_eval_functions, show_progress=True):
        """Filter and validate generated evaluation functions."""
        filtered_generated_eval_functions = []
        
        # First, collect all imports and other suspicious patterns
        collect_packages = []
        count = 0
        
        for result in generated_eval_functions:
            for each in result['gpt-answer']:
                try:
                    json_pattern = r'```json(.*?)```'
                    match = re.search(json_pattern, each, re.DOTALL)
                    if match:
                        json_dict = match.group(1).strip().replace("\n", "")
                    else:
                        # Try to find JSON without code block markers
                        json_pattern = r'\{.*?"func".*?\}'
                        match = re.search(json_pattern, each, re.DOTALL)
                        if match:
                            json_dict = match.group(0)
                        else:
                            count += 1
                            continue
                    
                    res_dict = json.loads(json_dict)
                    func = res_dict['func']
                    if '\\n' in func:
                        func = func.replace('\\n', '\n')
                    
                    try:
                        exec(func)
                    except Exception as e:
                        count += 1
                        logger.debug(f"Error executing eval function: {e}")
                        continue
                    
                    for line in func.split('\n'):
                        if 'import' in line or 'download' in line or 'requests' in line:
                            collect_packages.append(line)
                
                except Exception as e:
                    count += 1
                    logger.debug(f"Error processing eval function: {e}")
        
        if collect_packages:
            logger.warning(f"Found potentially suspicious imports: {list(set(collect_packages))}")
        
        # Process each result
        iterator = generated_eval_functions
        if show_progress:
            iterator = tqdm(iterator, desc="Filtering eval functions")
        
        for result in iterator:
            eval_funcs, test_cases = [], []
            
            # Process each answer
            for each in result['gpt-answer']:
                try:
                    # Extract JSON
                    json_pattern = r'```json(.*?)```'
                    match = re.search(json_pattern, each, re.DOTALL)
                    if match:
                        json_dict = match.group(1).strip().replace("\n", "")
                    else:
                        # Try to find JSON without code block markers
                        json_pattern = r'\{.*?"func".*?\}'
                        match = re.search(json_pattern, each, re.DOTALL)
                        if match:
                            json_dict = match.group(0)
                        else:
                            continue
                    
                    # Parse JSON
                    res_dict = json.loads(json_dict)
                    
                    # Clean and validate function
                    func = res_dict['func'].strip()
                    func = '\n'.join([line for line in func.split('\n') 
                                     if 'download' not in line and 'requests' not in line])
                    
                    try:
                        exec(func)
                    except Exception as e:
                        logger.debug(f"Error executing eval function: {e}")
                        continue
                    
                    eval_funcs.append(func)
                    
                    # Extract test cases
                    for each_case in res_dict.get('cases', []):
                        try:
                            input_val = each_case.get('input', '')
                            output_val = each_case.get('output', False)
                            
                            # Convert string 'true'/'false' to bool if needed
                            if isinstance(output_val, str):
                                output_val = output_val.lower() == 'true'
                            
                            test_cases.append((input_val, output_val))
                        except Exception as e:
                            logger.debug(f"Error processing test case: {e}")
                
                except Exception as e:
                    logger.debug(f"Error processing eval function result: {e}")
            
            # Remove duplicates
            eval_funcs = list(set(eval_funcs))
            test_cases = list(map(json.loads, set(map(json.dumps, test_cases))))
            
            # Skip if not enough functions or test cases
            # if len(eval_funcs) < 1 or len(test_cases) < 3:
            #     continue
            
            # Filter test cases
            filtered_test_cases = []
            for test_case in test_cases:
                flag = False
                for func in eval_funcs:
                    local_vars = {}
                    try:
                        exec(func, globals(), local_vars)
                    except Exception as e:
                        logger.debug(f"Error executing eval function: {e}")
                        continue
                    
                    if 'evaluate' not in local_vars:
                        continue
                    
                    eval_func = local_vars['evaluate']
                    try:
                        signal.signal(signal.SIGALRM, timeout_handler)
                        signal.alarm(5)
                        res_val = eval_func(test_case[0])
                    except Exception as e:
                        logger.debug(f"Error executing eval function: {e}")
                        res_val = None
                    finally:
                        signal.alarm(0)
                    
                    if res_val is not None and res_val == test_case[1]:
                        flag = True
                        break
                
                if flag:
                    filtered_test_cases.append(test_case)
            
            # Score functions
            scored_funcs = []
            for func in eval_funcs:
                local_vars = {}
                try:
                    exec(func, globals(), local_vars)
                except Exception as e:
                    logger.debug(f"Error executing eval function: {e}")
                    continue
                
                if 'evaluate' not in local_vars:
                    continue
                
                eval_func = local_vars['evaluate']
                acc = []
                
                for inp, out in filtered_test_cases:
                    try:
                        signal.signal(signal.SIGALRM, timeout_handler)
                        signal.alarm(5)
                        res_val = eval_func(inp)
                    except Exception as e:
                        logger.debug(f"Error executing eval function: {e}")
                        res_val = None
                    finally:
                        signal.alarm(0)
                    
                    if res_val is None or res_val != out:
                        acc.append(0)
                    else:
                        acc.append(1)
                
                acc_mean = np.mean(acc) if acc else 0
                scored_funcs.append((func, acc_mean))
            
            # Filter valid functions
            valid_funcs = [each for each in scored_funcs if each[1] >= 0.5]
            if not valid_funcs:
                continue
            
            # Add to filtered results
            filtered_generated_eval_functions.append({
                "instruction": result['instruction'],
                "eval_func": valid_funcs,
                "cases": filtered_test_cases
            })
        
        logger.info(f"Filtered to {len(filtered_generated_eval_functions)} valid evaluation functions")
        return filtered_generated_eval_functions
    
    def generate_instruction(self, filtered_generated_eval_functions, k=2, batch_size=8, show_progress=True):
        """Generate instructions from evaluation functions."""
        generated_instructions = []
        count = 0
        filter_count = 0
        
        # Build all prompts first
        all_prompts = []
        for line in filtered_generated_eval_functions:
            funcs = line["eval_func"][:3]
            instruction_prompt = f"""You are an expert in converting the Python eval function code into the corresponding instruction text. I will provide the eval function code. Please strictly follow the code to convert it into the corresponding instruction text. Here's an example: 

[["def evaluate(response):\n    return 'e' not in response.lower()", 1.0], ["def evaluate(response):\n    words = response.split()\n    for word in response.split():\n        if 'e' in word.lower():\n            return False\n    return True", 1.0], ["def evaluate(response):\n    return all('e' not in word.lower() for word in response.split())", 1.0]] 

["Answer without using any words that contain the letter 'E'.","Answer with words that do not contain the letter 'E'.","Answer with words that do not contain the letter 'E'."] Please convert the following eval function into instructions stored in a list: 

{funcs}"""
            
            for _ in range(k):
                all_prompts.append(instruction_prompt)
        
        # Process prompts in batches
        logger.info(f"Generating instructions from {len(filtered_generated_eval_functions)} eval functions with k={k}")
        all_responses = self.batch_call_llm(all_prompts, max_new_tokens=1024, batch_size=batch_size, show_progress=show_progress)
        
        # Process responses
        idx = 0
        for line in filtered_generated_eval_functions:
            back_instruction = None
            
            for _ in range(k):
                if idx < len(all_responses):
                    response = all_responses[idx]
                    idx += 1
                    
                    try:
                        # Try to parse the response as JSON
                        # First look for JSON in code blocks
                        json_pattern = r'```(json)?(.*?)```'
                        match = re.search(json_pattern, response, re.DOTALL)
                        if match:
                            json_text = match.group(2).strip()
                        else:
                            # Try to find a list directly
                            list_pattern = r'\[(.*?)\]'
                            match = re.search(list_pattern, response, re.DOTALL)
                            if match:
                                json_text = f"[{match.group(1)}]"
                            else:
                                continue
                        
                        back_instruction = json.loads(json_text)
                        break
                    except Exception:
                        filter_count += 1
                        continue
            
            if back_instruction:
                line_copy = line.copy()
                line_copy["back_instruction"] = back_instruction
                generated_instructions.append(line_copy)
                count += 1
            else:
                logger.debug(f"Failed to generate back instruction for: {line['instruction']}")
        
        logger.info(f"Generated {len(generated_instructions)} instructions, {filter_count} filtered out")
        return generated_instructions
    
    def filter_generated_instruction(self, generated_instructions, batch_size=8, show_progress=True):
        """Filter generated instructions using NLI model."""
        filtered_generated_instructions = []
        count = 0
        filter_count = 0
        
        iterator = generated_instructions
        if show_progress:
            iterator = tqdm(iterator, desc="Filtering instructions")
        
        for line in iterator:
            try:
                back_instructions = line["back_instruction"]
                ori_ins = line["instruction"]
                
                # Process each back instruction with NLI model
                nli_scores = []
                for back_ins in back_instructions[:3]:
                    premise = ori_ins
                    hypothesis = back_ins
                    
                    inputs = self.tokenizer_nli(premise, hypothesis, truncation=True, return_tensors="pt")
                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
                    
                    with torch.no_grad():
                        output = self.model_nli(**inputs)
                    prediction = torch.softmax(output.logits[0], -1).tolist()
                    label_names = ["entailment", "neutral", "contradiction"]
                    max_label = label_names[prediction.index(max(prediction))]
                    nli_scores.append(max_label)
                
                line["nli_scores"] = nli_scores
                
                # Filter out contradictions
                if "contradiction" in nli_scores:
                    filter_count += 1
                    continue
                else:
                    filtered_generated_instructions.append(line)
                    count += 1
            
            except Exception as e:
                logger.error(f"Error filtering instruction: {str(e)}")
                filter_count += 1
        
        logger.info(f"Filtered to {len(filtered_generated_instructions)} instructions, {filter_count} filtered out")
        return filtered_generated_instructions
    
    def generate_response(self, query, r1_generated_text, filtered_generated_instructions, k=2, batch_size=8, show_progress=True):
        """Generate responses following instructions."""
        generated_responses = []
        
        # Build all prompts
        all_prompts = []
        for instruction in filtered_generated_instructions:
            ins_text = instruction['instruction']
            for _ in range(k):
                prompt = (
                    f"{r1_generated_text}\n"
                    f"Re-write the above text following: {ins_text}\n\n"
                    f"Note: Use the same words and sentences but re-arrange them in a way that strictly follows the instruction.\n"
                )
                all_prompts.append(prompt)
        
        # Process prompts in batches
        if show_progress:
            logger.info(f"Generating responses for {len(filtered_generated_instructions)} instructions with k={k}")
        all_generated_texts = self.batch_call_llm(all_prompts, max_new_tokens=1024, batch_size=batch_size, show_progress=show_progress)
        
        # Process responses
        idx = 0
        for instruction in filtered_generated_instructions:
            prompt = (
                f"Please answer the query strictly following the instruction.\n"
                f"[instruction] {instruction['instruction']}\n"
                f"[Query] {query}"
            )
            
            responses = []
            for _ in range(k):
                if idx < len(all_generated_texts):
                    responses.append(all_generated_texts[idx])
                    idx += 1
            
            generated_responses.append({
                "instruction": instruction['instruction'],
                "prompt": prompt,
                "gpt-answer": responses,
                "eval_func": instruction["eval_func"],
            })
        
        if show_progress:
            logger.info(f"Generated {len(generated_responses)} response sets")
        return generated_responses
    
    def filter_generated_response(self, generated_responses, show_progress=True):
        """Filter generated responses using evaluation functions."""
        filtered_samples = []
        
        iterator = generated_responses
        if show_progress:
            iterator = tqdm(iterator, desc="Filtering responses")
        
        for result in iterator:
            # Extract evaluation functions
            eval_funcs = []
            for func, score in result['eval_func']:
                local_vars = {}
                try:
                    exec(func, globals(), local_vars)
                except Exception as e:
                    logger.debug(f"Error executing eval function: {e}")
                    continue
                
                if 'evaluate' in local_vars:
                    eval_funcs.append(local_vars['evaluate'])
            
            # Skip if no valid evaluation functions
            if not eval_funcs:
                continue
            
            # Filter responses
            filter_responses = []
            for response in result['gpt-answer']:
                acc = []
                for eval_func in eval_funcs:
                    try:
                        signal.signal(signal.SIGALRM, timeout_handler)
                        signal.alarm(5)
                        res = eval_func(response)
                    except Exception as e:
                        logger.debug(f"Error executing eval function: {e}")
                        res = None
                    finally:
                        signal.alarm(0)
                    
                    if res is not None:
                        try:
                            acc.append(int(res))
                        except Exception:
                            continue
                
                acc_mean = np.mean(acc) if acc else 0
                if acc_mean > 0:
                    filter_responses.append(response)
                    preference_item = {
                        "messages": [
                            {"role": "user", "content": result["prompt"]},
                            {"role": "assistant", "content": item.get('response', '')}
                        ],
                        "is_positive": 1
                    }
                else:
                    preference_item = {
                        "messages": [
                            {"role": "user", "content": result["prompt"]},
                            {"role": "assistant", "content": item.get('response', '')}
                        ],
                        "is_positive": 0
                    }
                    
                self.preference_data.append(preference_item)
            
            # Extract query and add filtered responses
            for each in filter_responses:
                try:
                    query_match = re.findall(r'\[Query\](.*)$', result['prompt'], re.DOTALL)
                    query = query_match[0].strip() if query_match else ""
                    
                    filtered_samples.append({
                        'instruction': result['instruction'],
                        'query': query,
                        'response': each,
                        'prompt': result['prompt'],
                        'eval_func': result['eval_func'],
                    })
                except IndexError:
                    logger.debug(f"Prompt extraction error: {result['prompt']}")
        
        # Remove duplicates
        unique_samples = []
        seen = set()
        for item in filtered_samples:
            item_json = json.dumps(item, sort_keys=True)
            if item_json not in seen:
                seen.add(item_json)
                unique_samples.append(item)
        
        if show_progress:
            logger.info(f"Filtered to {len(unique_samples)} unique responses")
        return unique_samples
    
def filter2_generated_response(self, filtered_generated_responses, k=2, batch_size=8, show_progress=True):
    """Further filter responses based on quality scoring."""
    filtered2_generated_responses = []
    
    if not filtered_generated_responses:
        logger.warning("No responses to filter in filter2_generated_response")
        return filtered2_generated_responses
    
    prompt_template = (
        "You are an expert that is good at judging whether a response is following the instruction and query.\n"
        "[Instruction] {instruction}\n"
        "[Query] {query}\n"
        "[Response] {response}\n"
        "Please notice that the response may not be helpful as it needs to strictly follow the requirements in the Instruction.\n"
        "You need to judge whether the response answers the query. Please first provide a detailed analysis and then give a score ranking from 0 to 10 at the last line.\n"
        "Scoring 0 means the response is totally unrelated to the query, while scoring 10 means the response is helpful and highly related to the query.\n"
        "Please only provide a score in the format `Score: {{score}}` without any other contents at the last line."
    )
    
    all_prompts = []
    for idx, each in enumerate(filtered_generated_responses):
        for k_idx in range(k):
            prompt = prompt_template.format(
                instruction=each.get('instruction', ''),
                query=each.get('query', ''),
                response=each.get('response', '')
            )
            all_prompts.append((idx, prompt))
    
    if show_progress:
        logger.info(f"Quality scoring {len(filtered_generated_responses)} responses with k={k}")
    
    batch_results = {}
    for i in range(0, len(all_prompts), batch_size):
        batch_items = all_prompts[i:i+batch_size]
        batch_prompts = [item[1] for item in batch_items]
        
        try:
            batch_responses = self.batch_call_llm(
                batch_prompts, max_new_tokens=1024, batch_size=batch_size, show_progress=False
            )
            
            for j, response in enumerate(batch_responses):
                if i + j < len(all_prompts):
                    idx = all_prompts[i + j][0]
                    if idx not in batch_results:
                        batch_results[idx] = []
                    batch_results[idx].append(response)
        
        except Exception as e:
            logger.error(f"Error in batch processing: {str(e)}")
            
    for idx, responses in batch_results.items():
        if idx < len(filtered_generated_responses):
            item = filtered_generated_responses[idx]
            item_copy = item.copy()
            item_copy['gen'] = responses
            
            scores = []
            for gen in responses:
                score_match = re.search(r'Score:\s*(\d+)', gen)
                if score_match:
                    try:
                        score = int(score_match.group(1))
                        scores.append(score)
                    except ValueError:
                        continue
            
            score = np.mean(scores) if scores else 0
            if score > 5: 
                filtered2_generated_responses.append(item_copy)
                preference_item = {
                    "messages": [
                        {"role": "user", "content": result["prompt"]},
                        {"role": "assistant", "content": item.get('response', '')}
                    ],
                    "is_positive": 1
                }
            else:
                preference_item = {
                    "messages": [
                        {"role": "user", "content": result["prompt"]},
                        {"role": "assistant", "content": item.get('response', '')}
                    ],
                    "is_positive": 0
                }
            self.preference_data.append(preference_item)
            
    if show_progress:
        logger.info(f"Final quality filter: {len(filtered2_generated_responses)} responses")
    
    return filtered2_generated_responses