In [1]:
import os
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

from scipy.stats import mode
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

from collections import Counter

In [2]:
RES_DIR = "/root/sample_gen/data/gen_data/single_files/mmlu"
data_name = RES_DIR.strip().split('/')[-1] 

res, llms = [], []
for fp in tqdm(sorted(os.listdir(RES_DIR))):
    if not fp.endswith(".json"):
        continue
    fpath = os.path.join(RES_DIR,fp)
    llm = fp.split("_")[1]
    llms.append(llm)
    print(f" Size of the {llm} generated file is: {len(pd.read_json(fpath))}")
    for i, d  in enumerate(json.load(open(fpath,'r'))):
        if "query_response" in d:
            if data_name == 'mmlu':
                res.append({
                    "prompt": d["zprompt"],
                    "gold_answer": d["answer"],
                    "split": d["split"],
                    "subject": d["subject"],
                    "llm": llm,
                    "response": d["query_response"]
                })
            if data_name == 'gsm8k':
                res.append({
                        "prompt": d["zprompt"],
                        "gold_answer": d["answer"],
                        "split": d["split"],
                        "llm": llm,
                        "response": d["query_response"]
                    })
                
res_df = pd.DataFrame(res)
res_df.drop_duplicates(subset=['prompt', 'llm'], inplace=True)
print("Data Name and Length :", data_name, len(res_df))
print("Extracted LLMs:", llms)
print("Each LLM Sample (After removing duplicates):", len(res_df)/len(llms))
res_df.head()

  0%|          | 0/8 [00:00<?, ?it/s]

 Size of the falcon-7b generated file is: 15858


 25%|██▌       | 2/8 [00:02<00:06,  1.04s/it]

 Size of the gemma-7b generated file is: 15858


 38%|███▊      | 3/8 [00:04<00:07,  1.57s/it]

 Size of the llama2-7b-lm generated file is: 15858


 50%|█████     | 4/8 [00:06<00:07,  1.85s/it]

 Size of the metamath-7b generated file is: 15858


 62%|██████▎   | 5/8 [00:09<00:06,  2.02s/it]

 Size of the mistral-7b-lm generated file is: 15858


 75%|███████▌  | 6/8 [00:11<00:04,  2.12s/it]

 Size of the olmo-7b generated file is: 15858


 88%|████████▊ | 7/8 [00:13<00:02,  2.15s/it]

 Size of the opt-6.7b generated file is: 15858


100%|██████████| 8/8 [00:15<00:00,  1.97s/it]


Data Name and Length : mmlu 110712
Extracted LLMs: ['falcon-7b', 'gemma-7b', 'llama2-7b-lm', 'metamath-7b', 'mistral-7b-lm', 'olmo-7b', 'opt-6.7b']
Each LLM Sample (After removing duplicates): 15816.0


Unnamed: 0,prompt,gold_answer,split,subject,llm,response
0,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,falcon-7b,[Let's think step by step. We refer to Wikipe...
1,<START_QUESTION> Question: What are the frame...,D,dev,security_studies,falcon-7b,[Let's think step by step. We refer to Wikipe...
2,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,falcon-7b,[Let's think step by step. We refer to Wikipe...
3,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,falcon-7b,[Let's think step by step. We refer to Wikipe...
4,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,falcon-7b,[Let's think step by step. We refer to Wikipe...


In [3]:
def extract_auto_answer(text, data_name):
    # For MMLU
    # Answer Extraction Policy-1 
    if data_name == 'mmlu':
        # candidates = re.search(r'The answer is \((A|B|C|D)\).',text)
        # if candidates!=None:
        #     return candidates[0].split('(')[1].split(')')[0].strip()
        # else: 
        #     return "INVALID"

        #Answer Extraction Policy-1: first neumric number from end 
        candidates = re.findall(r'\((A|B|C|D)\)', text)
        if len(candidates) != 0:
            return candidates[-1].strip()
        else: 
            return "INVALID"
    
    
    # for GSM8K
    # Answer Extraction Policy-1 
    if data_name == 'gsm8k':
        # candidates = re.findall(r'The answer is (\d+)\.', text)
        # if len(candidates) != 0:
        #     return float(candidates[-1].strip())
        # else: 
        #     return "INVALID"
        
       #Answer Extraction Policy-1: first neumric number from end 
        candidates = re.findall(r'(\d+)[^\d]*$', text)
        if len(candidates) != 0:
            return float(candidates[-1].strip())
        else: 
            return "INVALID"
    
res_df["auto_answer"] = [ [extract_auto_answer(ar, data_name) for ar in auto_res] for auto_res in tqdm(res_df["response"])]
res_df["auto_answer"].head()

100%|██████████| 110712/110712 [00:01<00:00, 72193.56it/s]


0    [B, B, B, B, B, B, B, B, B, B]
1    [A, A, A, A, A, A, A, A, B, A]
2    [C, C, C, C, C, C, C, C, C, C]
3    [A, C, C, C, B, B, B, B, C, B]
4    [D, B, B, D, C, B, C, D, C, D]
Name: auto_answer, dtype: object

In [4]:
# VIABILITY
raw_viability = res_df.groupby('llm')['auto_answer'].apply(lambda x: sum(map(lambda lst: lst.count('INVALID'), x)))
viability =  (1 - raw_viability/(len(res_df)*10))*100
print(raw_viability)
print(viability)

llm
falcon-7b        25252
gemma-7b          2654
llama2-7b-lm      3434
metamath-7b       9704
mistral-7b-lm      912
olmo-7b          23140
opt-6.7b         28585
Name: auto_answer, dtype: int64
llm
falcon-7b        97.719127
gemma-7b         99.760279
llama2-7b-lm     99.689826
metamath-7b      99.123492
mistral-7b-lm    99.917624
olmo-7b          97.909892
opt-6.7b         97.418076
Name: auto_answer, dtype: float64


In [5]:
# Percentage of the questions for which all the answers are INVALID.
all_invalid = res_df.groupby('llm')['auto_answer'].apply(lambda x: sum(1 for lst in x if all(str(item).strip() == 'INVALID' for item in lst)))
print(all_invalid/(len(res_df))*100)
print(all_invalid)

llm
falcon-7b        2.058494
gemma-7b         0.000903
llama2-7b-lm     0.000903
metamath-7b      0.042452
mistral-7b-lm    0.000000
olmo-7b          1.919394
opt-6.7b         1.960944
Name: auto_answer, dtype: float64
llm
falcon-7b        2279
gemma-7b            1
llama2-7b-lm        1
metamath-7b        47
mistral-7b-lm       0
olmo-7b          2125
opt-6.7b         2171
Name: auto_answer, dtype: int64


In [6]:
# VIABILITY
# only retaining llms with less than 10% INVALID generations
viable_llms = list(viability.index[viability > 0.8])  
print(viable_llms)
#viable_llms = ['gemma-7b', 'metamath-7b', 'mistral-7b-lm']
#print(viable_llms)
res_df = res_df[res_df["llm"].isin(viable_llms)].reset_index(drop=True)

['falcon-7b', 'gemma-7b', 'llama2-7b-lm', 'metamath-7b', 'mistral-7b-lm', 'olmo-7b', 'opt-6.7b']


In [7]:
# matches 
res_df['matches'] = res_df.apply(lambda row: row['auto_answer'].count(row['gold_answer']), axis=1)
res_df['matches'].head()

0    10
1     0
2    10
3     5
4     3
Name: matches, dtype: int64

In [8]:
# success rate
res_df['sc'] = res_df['matches'].apply(lambda x: 1 if x >= 8 else 0)
res_df['sc'].head()

0    1
1    0
2    1
3    0
4    0
Name: sc, dtype: int64

In [9]:
# Majority Answer
def find_majority(lst):
    if not lst:
        return None
    return max(set(lst), key=lst.count)

# Apply the find_majority function to each row to create the 'majority_answer' column
res_df['majority_answer'] = res_df['auto_answer'].apply(find_majority)
res_df['majority_answer'].head()

0    B
1    A
2    C
3    B
4    D
Name: majority_answer, dtype: object

In [10]:
# Majority Answer Success rate
res_df['majority_sc'] = res_df.apply(lambda row: 1 if str(row['majority_answer']).strip() == str(row['gold_answer']).strip() else 0, axis=1)
res_df['majority_sc'].head()

0    1
1    0
2    1
3    1
4    0
Name: majority_sc, dtype: int64

In [11]:
print(res_df.shape)
res_df.head()

(110712, 11)


Unnamed: 0,prompt,gold_answer,split,subject,llm,response,auto_answer,matches,sc,majority_answer,majority_sc
0,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,falcon-7b,[Let's think step by step. We refer to Wikipe...,"[B, B, B, B, B, B, B, B, B, B]",10,1,B,1
1,<START_QUESTION> Question: What are the frame...,D,dev,security_studies,falcon-7b,[Let's think step by step. We refer to Wikipe...,"[A, A, A, A, A, A, A, A, B, A]",0,0,A,0
2,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,falcon-7b,[Let's think step by step. We refer to Wikipe...,"[C, C, C, C, C, C, C, C, C, C]",10,1,C,1
3,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,falcon-7b,[Let's think step by step. We refer to Wikipe...,"[A, C, C, C, B, B, B, B, C, B]",5,0,B,1
4,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,falcon-7b,[Let's think step by step. We refer to Wikipe...,"[D, B, B, D, C, B, C, D, C, D]",3,0,D,0


In [12]:
print("*"*50)
sc = res_df.groupby('llm')['sc'].sum()/(len(res_df)/len(viable_llms))*100
print("Success Rate:", sc)
print("*"*50)
majority_sc = res_df.groupby('llm')['majority_sc'].sum()/(len(res_df)/len(viable_llms))*100
print("Majority Success Rate:", majority_sc)
print("*"*50)

**************************************************
Success Rate: llm
falcon-7b         1.353060
gemma-7b         40.768842
llama2-7b-lm     20.529843
metamath-7b      26.991654
mistral-7b-lm    39.643399
olmo-7b           3.275164
opt-6.7b          2.959029
Name: sc, dtype: float64
**************************************************
Majority Success Rate: llm
falcon-7b        24.974709
gemma-7b         66.603440
llama2-7b-lm     48.071573
metamath-7b      43.114568
mistral-7b-lm    62.500000
olmo-7b          28.306778
opt-6.7b         22.888214
Name: majority_sc, dtype: float64
**************************************************


In [13]:
# Oracel Computation 
res_temp_df = res_df.copy()
columns_to_drop = ['response']
resd_df = res_temp_df.drop(columns=columns_to_drop)
resd_df.head()

Unnamed: 0,prompt,gold_answer,split,subject,llm,auto_answer,matches,sc,majority_answer,majority_sc
0,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,falcon-7b,"[B, B, B, B, B, B, B, B, B, B]",10,1,B,1
1,<START_QUESTION> Question: What are the frame...,D,dev,security_studies,falcon-7b,"[A, A, A, A, A, A, A, A, B, A]",0,0,A,0
2,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,falcon-7b,"[C, C, C, C, C, C, C, C, C, C]",10,1,C,1
3,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,falcon-7b,"[A, C, C, C, B, B, B, B, C, B]",5,0,B,1
4,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,falcon-7b,"[D, B, B, D, C, B, C, D, C, D]",3,0,D,0


In [14]:
from functools import reduce
grouped = resd_df.groupby('llm')

sub_dataframes =[]
for key, group in grouped:
    new_column_names = {'sc': 'sc_'+str(key), 'majority_sc': 'majority_sc_'+str(key), 'auto_answer': 'auto_answer_'+str(key), 'majority_answer': 'majority_answer_'+str(key), 'matches': 'matches_'+str(key)}
    group.rename(columns=new_column_names, inplace=True)
    group_df = group.drop(columns=["llm"])
    sub_dataframes.append(group_df)

assert len(sub_dataframes) == len(viable_llms)
sub_dataframes[0].head()

if data_name == "gsm8k":
    merged_df = reduce(lambda left, right: pd.merge(left, right, on=['prompt', 'gold_answer', 'split'], how='inner'), sub_dataframes)
elif data_name == "mmlu":
    merged_df = reduce(lambda left, right: pd.merge(left, right, on=['prompt','gold_answer', 'split', 'subject'], how='inner'), sub_dataframes)
assert len(merged_df) == len(res_df)/len(llms)
merged_df.head()

Unnamed: 0,prompt,gold_answer,split,subject,auto_answer_falcon-7b,matches_falcon-7b,sc_falcon-7b,majority_answer_falcon-7b,majority_sc_falcon-7b,auto_answer_gemma-7b,...,auto_answer_olmo-7b,matches_olmo-7b,sc_olmo-7b,majority_answer_olmo-7b,majority_sc_olmo-7b,auto_answer_opt-6.7b,matches_opt-6.7b,sc_opt-6.7b,majority_answer_opt-6.7b,majority_sc_opt-6.7b
0,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,"[B, B, B, B, B, B, B, B, B, B]",10,1,B,1,"[B, B, B, B, B, B, B, B, B, B]",...,"[D, C, B, D, B, D, D, D, B, B]",4,0,D,0,"[B, B, B, B, B, B, B, B, B, B]",10,1,B,1
1,<START_QUESTION> Question: What are the frame...,D,dev,security_studies,"[A, A, A, A, A, A, A, A, B, A]",0,0,A,0,"[D, D, D, D, D, D, D, D, D, D]",...,"[D, D, D, B, D, D, D, D, D, A]",8,1,D,1,"[D, D, D, D, D, D, C, D, D, D]",9,1,D,1
2,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,"[C, C, C, C, C, C, C, C, C, C]",10,1,C,1,"[C, C, C, C, C, C, C, C, C, C]",...,"[C, D, C, D, C, C, C, D, C, C]",7,0,C,1,"[C, C, INVALID, C, C, B, C, C, B, C]",7,0,C,1
3,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,"[A, C, C, C, B, B, B, B, C, B]",5,0,B,1,"[B, B, B, B, B, B, B, B, B, B]",...,"[B, B, B, B, B, B, B, B, B, B]",10,1,B,1,"[B, B, B, B, B, C, B, B, B, B]",9,1,B,1
4,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,"[D, B, B, D, C, B, C, D, C, D]",3,0,D,0,"[C, C, C, C, C, C, C, C, C, C]",...,"[D, A, INVALID, B, C, D, C, C, C, B]",4,0,C,1,"[C, C, C, D, INVALID, C, D, C, INVALID, B]",5,0,C,1


In [15]:
merged_df['sc_all'] = merged_df.apply(lambda row: row[row.index.str.startswith('sc')].tolist(), axis=1)
merged_df['majority_sc_all'] = merged_df.apply(lambda row: row[row.index.str.startswith('majority_sc')].tolist(), axis=1)
merged_df.head()

Unnamed: 0,prompt,gold_answer,split,subject,auto_answer_falcon-7b,matches_falcon-7b,sc_falcon-7b,majority_answer_falcon-7b,majority_sc_falcon-7b,auto_answer_gemma-7b,...,sc_olmo-7b,majority_answer_olmo-7b,majority_sc_olmo-7b,auto_answer_opt-6.7b,matches_opt-6.7b,sc_opt-6.7b,majority_answer_opt-6.7b,majority_sc_opt-6.7b,sc_all,majority_sc_all
0,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,"[B, B, B, B, B, B, B, B, B, B]",10,1,B,1,"[B, B, B, B, B, B, B, B, B, B]",...,0,D,0,"[B, B, B, B, B, B, B, B, B, B]",10,1,B,1,"[1, 1, 1, 1, 1, 0, 1]","[1, 1, 1, 1, 1, 0, 1]"
1,<START_QUESTION> Question: What are the frame...,D,dev,security_studies,"[A, A, A, A, A, A, A, A, B, A]",0,0,A,0,"[D, D, D, D, D, D, D, D, D, D]",...,1,D,1,"[D, D, D, D, D, D, C, D, D, D]",9,1,D,1,"[0, 1, 1, 1, 1, 1, 1]","[0, 1, 1, 1, 1, 1, 1]"
2,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,"[C, C, C, C, C, C, C, C, C, C]",10,1,C,1,"[C, C, C, C, C, C, C, C, C, C]",...,0,C,1,"[C, C, INVALID, C, C, B, C, C, B, C]",7,0,C,1,"[1, 1, 1, 1, 1, 0, 0]","[1, 1, 1, 1, 1, 1, 1]"
3,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,"[A, C, C, C, B, B, B, B, C, B]",5,0,B,1,"[B, B, B, B, B, B, B, B, B, B]",...,1,B,1,"[B, B, B, B, B, C, B, B, B, B]",9,1,B,1,"[0, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1]"
4,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,"[D, B, B, D, C, B, C, D, C, D]",3,0,D,0,"[C, C, C, C, C, C, C, C, C, C]",...,0,C,1,"[C, C, C, D, INVALID, C, D, C, INVALID, B]",5,0,C,1,"[0, 1, 1, 1, 1, 0, 0]","[0, 1, 1, 1, 1, 1, 1]"


In [16]:
id2lm = { i: mod  for i, mod in enumerate(viable_llms)}
lm2id = { mod: i  for i, mod in enumerate(viable_llms)}
print(id2lm)
print(lm2id)
scdec =  [ [[ id2lm[i] for i, val in enumerate(item) if int(val) == 1]]  for item in merged_df["sc_all"].tolist()]
mjscdec =  [ [[ id2lm[i] for i, val in enumerate(item) if int(val) == 1]]  for item in merged_df["majority_sc_all"].tolist()]
merged_df["sc_all_model"] = pd.DataFrame(scdec)
merged_df["majority_sc_all_model"] = pd.DataFrame(mjscdec)
print(scdec[:2])

{0: 'falcon-7b', 1: 'gemma-7b', 2: 'llama2-7b-lm', 3: 'metamath-7b', 4: 'mistral-7b-lm', 5: 'olmo-7b', 6: 'opt-6.7b'}
{'falcon-7b': 0, 'gemma-7b': 1, 'llama2-7b-lm': 2, 'metamath-7b': 3, 'mistral-7b-lm': 4, 'olmo-7b': 5, 'opt-6.7b': 6}
[[['falcon-7b', 'gemma-7b', 'llama2-7b-lm', 'metamath-7b', 'mistral-7b-lm', 'opt-6.7b']], [['gemma-7b', 'llama2-7b-lm', 'metamath-7b', 'mistral-7b-lm', 'olmo-7b', 'opt-6.7b']]]


In [17]:
pt = merged_df["prompt"].tolist()
clean_pt =  [item.strip().split('<START_QUESTION>')[-1].strip()[10:].split("<END_QUESTION>")[0].strip() for item in pt]
assert len(pt) == len(clean_pt)
merged_df["question"] = pd.DataFrame(clean_pt)
merged_df.head()

Unnamed: 0,prompt,gold_answer,split,subject,auto_answer_falcon-7b,matches_falcon-7b,sc_falcon-7b,majority_answer_falcon-7b,majority_sc_falcon-7b,auto_answer_gemma-7b,...,auto_answer_opt-6.7b,matches_opt-6.7b,sc_opt-6.7b,majority_answer_opt-6.7b,majority_sc_opt-6.7b,sc_all,majority_sc_all,sc_all_model,majority_sc_all_model,question
0,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,"[B, B, B, B, B, B, B, B, B, B]",10,1,B,1,"[B, B, B, B, B, B, B, B, B, B]",...,"[B, B, B, B, B, B, B, B, B, B]",10,1,B,1,"[1, 1, 1, 1, 1, 0, 1]","[1, 1, 1, 1, 1, 0, 1]","[falcon-7b, gemma-7b, llama2-7b-lm, metamath-7...","[falcon-7b, gemma-7b, llama2-7b-lm, metamath-7...",What distinguishes coercive diplomacy from mil...
1,<START_QUESTION> Question: What are the frame...,D,dev,security_studies,"[A, A, A, A, A, A, A, A, B, A]",0,0,A,0,"[D, D, D, D, D, D, D, D, D, D]",...,"[D, D, D, D, D, D, C, D, D, D]",9,1,D,1,"[0, 1, 1, 1, 1, 1, 1]","[0, 1, 1, 1, 1, 1, 1]","[gemma-7b, llama2-7b-lm, metamath-7b, mistral-...","[gemma-7b, llama2-7b-lm, metamath-7b, mistral-...",Which of the following is the best lens throug...
2,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,"[C, C, C, C, C, C, C, C, C, C]",10,1,C,1,"[C, C, C, C, C, C, C, C, C, C]",...,"[C, C, INVALID, C, C, B, C, C, B, C]",7,0,C,1,"[1, 1, 1, 1, 1, 0, 0]","[1, 1, 1, 1, 1, 1, 1]","[falcon-7b, gemma-7b, llama2-7b-lm, metamath-7...","[falcon-7b, gemma-7b, llama2-7b-lm, metamath-7...","In order to become securitized, a threat must ..."
3,<START_QUESTION> Question: What are the frame...,B,dev,security_studies,"[A, C, C, C, B, B, B, B, C, B]",5,0,B,1,"[B, B, B, B, B, B, B, B, B, B]",...,"[B, B, B, B, B, C, B, B, B, B]",9,1,B,1,"[0, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1]","[gemma-7b, llama2-7b-lm, metamath-7b, mistral-...","[falcon-7b, gemma-7b, llama2-7b-lm, metamath-7...",How can we best describe the relationship betw...
4,<START_QUESTION> Question: What are the frame...,C,dev,security_studies,"[D, B, B, D, C, B, C, D, C, D]",3,0,D,0,"[C, C, C, C, C, C, C, C, C, C]",...,"[C, C, C, D, INVALID, C, D, C, INVALID, B]",5,0,C,1,"[0, 1, 1, 1, 1, 0, 0]","[0, 1, 1, 1, 1, 1, 1]","[gemma-7b, llama2-7b-lm, metamath-7b, mistral-...","[gemma-7b, llama2-7b-lm, metamath-7b, mistral-...",What are the frameworks of analysis within whi...


In [18]:
merged_df.to_json(f"/root/llm_classifier/data/raw_unified/{data_name}_nlm_for_classifier_v2.json", orient='records', lines=True, index=False)
merged_df.to_csv(f"/root/llm_classifier/data/raw_unified/{data_name}_nlm_for_classifier_v2.csv", encoding='utf-8', index=False)

In [19]:
from collections import Counter

def lab_dist(all_in):
    temp = [ [ llms[i] for i, val in enumerate(item) if val !=0 ] for item in all_in]
    flattened_list = [item for sublist in temp for item in sublist]

    # Count the occurrences of each element
    element_counts = Counter(flattened_list)

    # Display the counts
    for element, count in element_counts.items():
        percentage = (count / len(all_in)) * 100
        print(f"{element}: {count} times ({percentage:.2f}%)")

print("*********** SC *************")
lab_dist(merged_df['sc_all'].tolist())
print("")
print("*********** Prediction Label Distribution ****************")
lab_dist(merged_df['majority_sc_all'].tolist())

*********** SC *************
falcon-7b: 214 times (1.35%)
gemma-7b: 6448 times (40.77%)
llama2-7b-lm: 3247 times (20.53%)
metamath-7b: 4269 times (26.99%)
mistral-7b-lm: 6270 times (39.64%)
opt-6.7b: 468 times (2.96%)
olmo-7b: 518 times (3.28%)

*********** Prediction Label Distribution ****************
falcon-7b: 3950 times (24.97%)
gemma-7b: 10534 times (66.60%)
llama2-7b-lm: 7603 times (48.07%)
metamath-7b: 6819 times (43.11%)
mistral-7b-lm: 9885 times (62.50%)
opt-6.7b: 3620 times (22.89%)
olmo-7b: 4477 times (28.31%)


In [20]:
print("********** Running Orecle Model *********")
# Oracle Model: Always selct best model
def oreacle(all_lab):
    acc_count = 0
    temp_count = 0
    llm_count = len(viable_llms)*[0]
    for lab in all_lab:
        for i, val in enumerate(lab):
            if val == 1:
                acc_count += 1
                llm_count[i] += 1
                break
            if i ==len(viable_llms)-1:
                temp_count += 1 
   
    assert temp_count + sum(llm_count) == len(all_lab)
    return acc_count, llm_count
        

acc, llm_count_list = oreacle(merged_df['sc_all'].tolist())
facc = acc/(len(merged_df['sc_all'].tolist()))
print("Max Accuracy (%) for SC : ", facc*100)

acc, llm_count_list = oreacle(merged_df['majority_sc_all'].tolist())
facc = acc/(len(merged_df['majority_sc_all'].tolist()))
print("Max Accuracy (%) for Majority_SC : ", facc*100)

********** Running Orecle Model *********
Max Accuracy (%) for SC :  56.221547799696516
Max Accuracy (%) for Majority_SC :  89.19448659585231
