In [236]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

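##### This split gives every (task, model) group a test set of at least 50 unique instances and never places the same instance (identified by input + options) in both the training and test sets of a group. Because the split uses a fixed random_state, models that were evaluated on the same instances in the same order receive the same train/test partition.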
def split_data(df, min_test_size=50, test_fraction=0.3, random_state=42):
    """
    Split ``df`` into train and test frames, one split per (task, model) pair.

    Every row gets an instance key ``id = input + options``. Whole ids are
    assigned to either the train or the test side, so within a (task, model)
    group no id appears on both sides. No stratification on the label is
    performed.

    For each group the number of test ids is
    ``max(min_test_size, ceil(test_fraction * n_unique_ids))``: at least
    ``min_test_size`` ids, and at least ``test_fraction`` of the group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "task", "model", "input" and "options".
        NOTE: an "id" column is added to this frame in place.
    min_test_size : int, default 50
        Minimum number of unique ids in each group's test set.
    test_fraction : float, default 0.3
        Minimum share of a group's unique ids that goes to the test set.
    random_state : int, default 42
        Seed passed to ``train_test_split``. With a fixed seed, groups that
        hold the same ids in the same order are split identically.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Row-wise concatenation of the per-group train / test rows, with a
        fresh integer index. Two empty frames are returned for empty input.

    Raises
    ------
    ValueError
        If a group has too few unique ids to provide the required test set
        while keeping at least one id for training.
    """
    # Instance key; rows that share input + options are kept on the same side.
    df["id"] = df["input"] + df["options"]
    train_dfs = []
    test_dfs = []

    # Iterate over the distinct (task, model) pairs in order of first appearance.
    for task, model in df[['task', 'model']].drop_duplicates().values:
        task_model_df = df[(df['task'] == task) & (df['model'] == model)]

        unique_ids = task_model_df['id'].unique()
        n_ids = len(unique_ids)

        # Use an absolute test count rather than a float fraction: it cannot
        # drift by one through float rounding, and it makes the minimum exact.
        n_test = max(min_test_size, int(np.ceil(test_fraction * n_ids)))

        if n_test >= n_ids:
            # A float test_size >= 1.0 would otherwise fail inside
            # train_test_split with a message that does not name the group.
            raise ValueError(
                f"split_data: group (task={task!r}, model={model!r}) has only "
                f"{n_ids} unique ids; more than {n_test} are needed to build "
                f"the test set and keep a non-empty training set."
            )

        # An integer test_size is the absolute number of test samples, so no
        # random top-up of the test set is needed afterwards.
        train_ids, test_ids = train_test_split(
            unique_ids, test_size=n_test, random_state=random_state
        )

        train_dfs.append(task_model_df[task_model_df['id'].isin(train_ids)])
        test_dfs.append(task_model_df[task_model_df['id'].isin(test_ids)])

    if not train_dfs:
        # pd.concat rejects an empty list; return empty frames with df's columns.
        return df.iloc[0:0].copy(), df.iloc[0:0].copy()

    train_df = pd.concat(train_dfs, ignore_index=True)
    test_df = pd.concat(test_dfs, ignore_index=True)

    return train_df, test_df

### Using the LLaMA-2 models as the subject systems.

In [237]:
# Load the two result tables. Presumably "_id" is the in-distribution set and
# "_ood" the out-of-distribution set (inferred from the file names — TODO confirm).
df = pd.read_csv("./SS_data_phase1/SS_data_id.csv")
df_ood = pd.read_csv("./SS_data_phase1/SS_data_ood.csv")
# Optional filter to a single subject model (currently disabled):
# df = df[df["model"]=="llama-2-13b"]
# Only `df` is split into train/test here; `df_ood` is used as loaded.
# NOTE: split_data also adds an "id" column to `df` in place.
df_train, df_test = split_data(df)




 abstract_algebra 
 Question: Find the characteristic of the ring Z_3 x Z_3.
A. 0
B. 3
C. 12
D. 30
Answer:{'A': '0', 'B': '3', 'C': '12', 'D': '30'}



 astronomy 
 Question: The nebular theory of the formation of the solar system successfully predicts all but one of the following. Which one does the theory not predict?
A. Planets orbit around the Sun in nearly circular orbits in a flattened disk.
B. the equal number of terrestrial and jovian planets
C. the craters on the Moon
D. asteroids Kuiper-belt comets and the Oort cloud
Answer:{'A': 'Planets orbit around the Sun in nearly circular orbits in a flattened disk.', 'B': 'the equal number of terrestrial and jovian planets', 'C': 'the craters on the Moon', 'D': 'asteroids Kuiper-belt comets and the Oort cloud'}



 business_ethics 
 Question: _______ locate morality beyond the sphere of rationality in an emotional 'moral impulse' towards others.
A. Virtue ethics
B. Ethical egoism
C. Postmodern ethics
D. Ethics of duty
Answer:{'A': '




 logical_fallacies 
 Question: The appeal to joy fallacy involves
A. arguing that someone should do something based only on the assertion that it will make him or her feel happier
B. causing confusion during refutation because of a lack of ability to engage in refutation
C. arguing for a conclusion based only on feeling sorry for someone when that feeling is irrelevant to the conclusion
D. arguing someone should do something only because other people will like him or her for doing it
Answer:{'A': 'arguing that someone should do something based only on the assertion that it will make him or her feel happier', 'B': 'causing confusion during refutation because of a lack of ability to engage in refutation', 'C': 'arguing for a conclusion based only on feeling sorry for someone when that feeling is irrelevant to the conclusion', 'D': 'arguing someone should do something only because other people will like him or her for doing it'}



 machine_learning 
 Question: To achieve an 0/1 loss e




 college_medicine 
 Question: During DNA replication, mistakes are coded into the leading strand about once every 100,000/1 million copies. This DNA is subject to proofreading by several mechanisms. If a mistake is noted and the incorrect base is removed shortly following the time RNA primer is removed, this would most likely be the work of which repair mechanism?
A. DNA polymerase I
B. DNA polymerase III
C. Mismatch repair mechanism
D. Endonuclease repair
Answer:{'A': 'DNA polymerase I', 'B': 'DNA polymerase III', 'C': 'Mismatch repair mechanism', 'D': 'Endonuclease repair'}



 college_physics 
 Question: The best type of laser with which to do spectroscopy over a range of visible wavelengths is
A. a dye laser
B. a helium-neon laser
C. an excimer laser
D. a ruby laser
Answer:{'A': 'a dye laser', 'B': 'a helium-neon laser', 'C': 'an excimer laser', 'D': 'a ruby laser'}



 computer_security 
 Question: A session symmetric key between two parties is used
A. Only once
B. Twice
C. Mul




 jurisprudence 
 Question: The ________ School believes that promoting market efficiency should be the central goal of legal decision-making.
A. Law and Economics
B. Critical Legal Studies
C. Historical
D. Natural Law
Answer:{'A': 'Law and Economics', 'B': 'Critical Legal Studies', 'C': 'Historical', 'D': 'Natural Law'}



 logical_fallacies 
 Question: The appeal to joy fallacy involves
A. arguing that someone should do something based only on the assertion that it will make him or her feel happier
B. causing confusion during refutation because of a lack of ability to engage in refutation
C. arguing for a conclusion based only on feeling sorry for someone when that feeling is irrelevant to the conclusion
D. arguing someone should do something only because other people will like him or her for doing it
Answer:{'A': 'arguing that someone should do something based only on the assertion that it will make him or her feel happier', 'B': 'causing confusion during refutation because of a l




 formal_logic 
 Question:  Construct a complete truth table for the following argument. Then, using the truth table, determine whether the argument is valid or invalid. If the argument is invalid, choose an option which presents a counterexample. (There may be other counterexamples as well.)
J ≡ ~K
J ∨ L
~K / L
A. Valid
B. Invalid. Counterexample when J and K are true and L is false
C. Invalid. Counterexample when J is true and K and L are false
D. Invalid. Counterexample when K is true and J and L are false
Answer:{'A': 'Valid', 'B': 'Invalid. Counterexample when J and K are true and L is false', 'C': 'Invalid. Counterexample when J is true and K and L are false', 'D': 'Invalid. Counterexample when K is true and J and L are false'}



 high_school_biology 
 Question: When we say that an individual organism has a greater fitness than another individual, we specifically mean that the organism
A. lives longer than others of its species.
B. competes for resources more successfully than




 philosophy 
 Question: When using visual methods in a research project what should you take into consideration alongside the legal guidelines?
A. What constitutes a public space
B. Protecting individuals from potential harm
C. Any risk of invasion of privacy
D. All of the above.
Answer:{'A': 'What constitutes a public space', 'B': 'Protecting individuals from potential harm', 'C': 'Any risk of invasion of privacy', 'D': 'All of the above.'}



 prehistory 
 Question: In periods of great difficulty and stress, it appears that the Minoans __________ in order to appease the gods.
A. sacrificed adults and occasionally children
B. burnt their own houses to the ground
C. fasted for long periods
D. abstained from sexual relations
Answer:{'A': 'sacrificed adults and occasionally children', 'B': 'burnt their own houses to the ground', 'C': 'fasted for long periods', 'D': 'abstained from sexual relations'}



 professional_law 
 Question: A plaintiff sued an industrial facility in her neighb




 high_school_geography 
 Question: The theory that every society develops economically according to a five-stage pattern of growth is attributed to
A. Rostow.
B. Ravenstein.
C. von Thünen.
D. Burgess.
Answer:{'A': 'Rostow.', 'B': 'Ravenstein.', 'C': 'von Thünen.', 'D': 'Burgess.'}



 high_school_government_and_politics 
 Question: Which of the following is not a special position held by a member of Congress?
A. President pro tempore
B. Speaker of the House
C. Congressional Management Foundation Chair
D. Minority Whip
Answer:{'A': 'President pro tempore', 'B': 'Speaker of the House', 'C': 'Congressional Management Foundation Chair', 'D': 'Minority Whip'}



 high_school_macroeconomics 
 Question: If Real GDP = $200 billion and the price index = 200 Nominal GDP is
A. $4 billion
B. $400 billion
C. $200 billion
D. $2 billion
Answer:{'A': '$4 billion', 'B': '$400 billion', 'C': '$200 billion', 'D': '$2 billion'}



 high_school_mathematics 
 Question: The positive square root of 200 is 




 public_relations 
 Question: Which of these qualities is NOT listed as something to consider when choosing an opinion leader for a public relations campaigns?
A. Reputation
B. Impact
C. Familiarity with subject or issue
D. Ability to reach people
Answer:{'A': 'Reputation', 'B': 'Impact', 'C': 'Familiarity with subject or issue', 'D': 'Ability to reach people'}



 security_studies 
 Question: How is a process of securitization completed?
A. A process of securitization is when an issue not included in the political sphere becomes politicized and is managed within the political system. A securitized issue is part of public policy requiring governmental action; decision-making, resource allocation and even communal governance.
B. A securitizing actor can argue for an issue to be securitised when it is outside of the political domain. An existential threat can be made a matter for state attention even when it is not included within public debate. Securitization refers to the recognitio

In [238]:
import ast  # ast.literal_eval is used below; the notebook's import cell does not import it

# Subject-system metadata written into every prompt.
# NOTE(review): the original training loop used "LLaMA", while the test/OOD loops
# assigned "LLaMA-2" to a variable that never reached the prompt (their
# system_prompt line was commented out), so every prompt actually produced said
# "LLaMA". A single value is used here so training and evaluation prompts stay
# consistent — confirm which family name is intended.
MODEL_FAMILY = "LLaMA"
MODEL_YEAR = 2023


def build_assessor_prompt(row, include_label, model_family=MODEL_FAMILY, year=MODEL_YEAR):
    """
    Build the assessor prompt for one result row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "model", "system_param_num", "input", "options" (the
        string form of a dict literal), "output" (a key of that dict) and,
        when ``include_label`` is true, "correct" (1 or 0).
    include_label : bool
        If true, append the gold label ("Yes"/"No") after the evaluation
        question (training prompts). If false, the prompt ends right after
        the question (evaluation prompts).
    model_family, year :
        Subject-system metadata inserted into the instruction text.

    Returns
    -------
    str
        The assembled prompt.
    """
    model = row["model"]
    param_size = row["system_param_num"]

    # The instruction is built from THIS row's model and parameter count, so a
    # prompt never carries metadata left over from a different row.
    system_prompt = f"### Instruction\nYour task is to evaluate whether the answer of a language model characterised by <ID = {model}, parameter number = {param_size}, model family = {model_family}, year = {year}> on a question-answering task is correct or incorrect, given a prompt."

    # Keep only the text after the last blank line of the input, i.e. drop
    # whatever precedes the final "\n\n"-separated segment.
    text_instance_per_shot = row["input"].split("\n\n")[-1]

    options = ast.literal_eval(row["options"])
    chosen = row["output"]

    prompt = (system_prompt + "\n### Prompt\n" + text_instance_per_shot + "\n### Answer\n" +
              str(chosen) + '. ' + options[chosen] +
              "\n### Evaluation: is the answer correct?.\n")

    if include_label:
        prompt += {1: 'Yes', 0: 'No'}[row["correct"]]
    return prompt


def build_assessor_prompts(frame, include_label):
    """Return the list of assessor prompts for every row of ``frame``, in row order."""
    return [build_assessor_prompt(row, include_label) for _, row in frame.iterrows()]


# Training prompts include the gold label; test and OOD prompts stop at the question.
df_train["assessor_prompt"] = build_assessor_prompts(df_train, include_label=True)
df_test["assessor_prompt"] = build_assessor_prompts(df_test, include_label=False)
df_ood["assessor_prompt"] = build_assessor_prompts(df_ood, include_label=False)


In [248]:
# df_train.to_csv("assessor_train_data_id.csv", index = False)
# df_test.to_csv("assessor_test_data_id.csv", index = False)
# df_ood.to_csv("assessor_test_data_ood.csv", index = False)