# **Analysis**

## **0. Configure environment**
### **0.1 Install all dependencies**

In [14]:
# %%capture
# %pip install pandas==2.0.1
# %pip install seaborn==0.12.2
# %pip install sentence-transformers==2.2.2

### **0.2 Import all the libraries needed for this notebook**

In [15]:
# TO AID WITH REPRODUCIBILITY...
# ENVIRONMENT ============================= #
# Python                             3.11.3 #
#                                           #
# LIBRARIES ------------------------------- #
# pandas                              2.0.1 #
# seaborn                            0.12.2 #
# sentence-transformers               2.2.2 #
# ========================================= #

# python libraries ------------------------ #
import re
import os
import random

import sys
from io import StringIO

import threading

try:
    import thread
except ImportError:
    import _thread as thread

# pandas ---------------------------------- #
import pandas

# seaborn --------------------------------- #
import seaborn

# sentence_transformers ------------------- #
from sentence_transformers import SentenceTransformer, util

import numpy as np
from sklearn.cluster import KMeans
from matplotlib import pyplot

### **0.3 Specify all needed constants**
This allows the notebook to be configured in a single spot rather than having to jump around the code base.

In [16]:
# file paths ------------------------------ #
GROUND_TRUTH_FILE_PATH = 'in/ground.jsonl'

RESPONSES_FILE_PATH = f'out/1-base.jsonl'
# RESPONSES_FILE_PATH = f'out/1-baseline-temp-0.jsonl'
# RESPONSES_FILE_PATH = f'out/1-baseline-top-p-0.1.jsonl'

GRADED_FILE_PATH = f'out/2-graded.jsonl'
EMBEDDING_FILE_PATH = f'out/3-embeddings.jsonl'

GROUND_INDEX = 'task_id'
RESPONSES_INDEX = 'question_id'

CHOICE_COLUMN = 'choices'
ANSWER_COLUMN = 'answer'

RESPONSES_COLUMNS_TO_KEEP = ['question_id', 'choices', 'usage']
GROUND_COLUMNS_TO_KEEP = ['task_id', 'entry_point', 'test']

SAMPLE_COLUMN = 'sample_[index]'

N_JOBS = 10

ALL_MIN_LM_16 = "all_min_lm_16"
ALL_MPNET_BASE_V2 = "all-mpnet-base-v2"

In [17]:
def sample_column(index):
    return f'sample_{index}'

def correct_column(index):
    return f'{sample_column(index)}__is_correct'

def embedding_column(index, transformer_name):
    return f'{sample_column(index)}__{transformer_name}'

## **1. Load data**
### **1.1 Load data from files**

In [18]:
ground = pandas.read_json(GROUND_TRUTH_FILE_PATH, lines=True)
ground = ground[GROUND_COLUMNS_TO_KEEP]
ground.head(1)

Unnamed: 0,task_id,entry_point,test
0,HumanEval/0,has_close_elements,"\n\nMETADATA = {\n 'author': 'jt',\n 'da..."


In [19]:
responses = pandas.read_json(RESPONSES_FILE_PATH, lines=True)
responses = responses[RESPONSES_COLUMNS_TO_KEEP]
responses = responses.join(other=ground.set_index(GROUND_INDEX), on=RESPONSES_INDEX)
responses.head(1)

Unnamed: 0,question_id,choices,usage,entry_point,test
0,HumanEval/3,"[{'message': {'role': 'assistant', 'content': ...","{'prompt_tokens': 155, 'completion_tokens': 25...",below_zero,"\n\nMETADATA = {\n 'author': 'jt',\n 'da..."


### **1.2 Extract choices into individual columns**

In [20]:
def extract_choices(row):
    choices = row[CHOICE_COLUMN]
    for index in range(len(choices)): 
        row[sample_column(index)] = choices[index]['message']['content']
    return row

responses = responses.apply(lambda row : extract_choices(row), axis=1)
responses.head(1)

Unnamed: 0,question_id,choices,usage,entry_point,test,sample_0,sample_1,sample_2,sample_3,sample_4,...,sample_40,sample_41,sample_42,sample_43,sample_44,sample_45,sample_46,sample_47,sample_48,sample_49
0,HumanEval/3,"[{'message': {'role': 'assistant', 'content': ...","{'prompt_tokens': 155, 'completion_tokens': 25...",below_zero,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",'''python\ndef below_zero(operations: List[int...,'''python\nfrom typing import List\n\ndef belo...,'''python\nfrom typing import List\n\ndef belo...,'''python\ndef below_zero(operations: List[int...,'''python\ndef below_zero(operations):\n ba...,...,'''python\ndef below_zero(operations: List[int...,'''python\nfrom typing import List\n\n\ndef be...,'''python\ndef below_zero(operations: List[int...,'''python\nfrom typing import List\n\n\ndef be...,'''python\nfrom typing import List\n\ndef belo...,'''python\nfrom typing import List\n\ndef belo...,'''python\ndef below_zero(operations: List[int...,'''python\nfrom typing import List\n\ndef belo...,'''python\ndef below_zero(operations: List[int...,'''python\ndef below_zero(operations: List[int...


### **1.3 Compute API costs**

In [21]:
total_tokens = responses['usage'].apply(lambda row : row['total_tokens']).sum()
total_cost = total_tokens / 1000 * 0.002
print("TOTAL TOKENS:", total_tokens)
print("TOTAL COST  :", total_cost)

## **2. Grading**
### **2.1 Retrieve cached results (if they exist)**

In [22]:
if not os.path.isfile(GRADED_FILE_PATH): responses.to_json(GRADED_FILE_PATH, orient='records', lines=True)

### **2.2 Evaluate ChatGPT's response**
In our prompts to ChatGPT, we specified for ChatGPT to end its response with the phrase:  
'The answer is...', we look for this string in the final answer and check for the words yes or no after the sentence.

In [23]:
def quit_function(fn_name):
    # print to stderr, unbuffered in Python 2.
    sys.stderr.flush() # Python 3 stderr is likely buffered.
    thread.interrupt_main() # raises KeyboardInterrupt

def remove_tag(text : str):
    return text[9:-3]
# ------------------------------------------- #
#   A function that times out functions 
#   after a certain period of time
# ------------------------------------------- #
def exit_after(s):
    '''
    use as decorator to exit process if 
    function takes longer than s seconds
    '''
    def outer(fn):
        def inner(*args, **kwargs):
            timer = threading.Timer(s, quit_function, args=[fn.__name__])
            timer.start()
            try:
                result = fn(*args, **kwargs)
            finally:
                timer.cancel()
            return result
        return inner
    return outer

# ------------------------------------------- #
#   Solves a system of equations given in
#   a list of str
# ------------------------------------------- #
@exit_after(5)
def evaluate(text : str):
    try:
        old_stdout = sys.stdout
        sys.stdout = mystdout = StringIO()
        exec(text)
        sys.stdout = old_stdout
        message = mystdout.getvalue()
        return message
    except Exception as e: return str(e)
    
def eval_wrap(text : str):
    try: return evaluate(text)
    except KeyboardInterrupt: return "TIMEOUT"

def is_choice_correct(solution : str): 
    return 1 if len(eval_wrap(solution)) == 0 else 0

def grade_choices(row : pandas.Series):
    choices = row[CHOICE_COLUMN]
    for index in range(len(choices)): 
        if correct_column(index) in set(row.keys()): continue

        solution = remove_tag(row[sample_column(index)])
        solution = f"{solution}\n{row['test']}\ncheck({row['entry_point']})"

        row[correct_column(index)] = is_choice_correct(solution)
    return row

graded = pandas.read_json(GRADED_FILE_PATH, lines=True)
graded = graded.combine_first(responses)
graded = graded.apply(lambda row : grade_choices(row), axis=1)
graded.head(1)

Unnamed: 0,choices,entry_point,question_id,sample_0,sample_0__is_correct,sample_1,sample_10,sample_10__is_correct,sample_11,sample_11__is_correct,...,sample_6,sample_6__is_correct,sample_7,sample_7__is_correct,sample_8,sample_8__is_correct,sample_9,sample_9__is_correct,test,usage
0,"[{'message': {'role': 'assistant', 'content': ...",below_zero,HumanEval/3,'''python\ndef below_zero(operations: List[int...,name 'List' is not defined,'''python\nfrom typing import List\n\ndef belo...,'''python\nfrom typing import List\n\n\ndef be...,,'''python\ndef below_zero(operations: List[int...,name 'List' is not defined,...,'''python\nfrom typing import List\n\ndef belo...,,'''python\ndef below_zero(operations: List[int...,name 'List' is not defined,'''python\nfrom typing import List\n\ndef belo...,,'''python\nfrom typing import List\n\ndef belo...,,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...","{'prompt_tokens': 155, 'completion_tokens': 25..."


### **2.3 Cache results**

In [24]:
graded.to_json(GRADED_FILE_PATH, orient='records', lines=True)

## **3. Data exploration**

In [25]:
def get_choice_correct_count(row):
    count = 0
    for index in range(N_JOBS): 
        count += row[correct_column(index)]
    return count

counts = graded.apply(lambda row : get_choice_correct_count(row), axis=1)
stats = counts.value_counts().sort_index()
barplot = seaborn.barplot(x=stats.index, y=stats.values)
barplot.set(title="Number of correct samples distribution")
barplot.set(xlabel="Number of correct samples")
barplot.set(ylabel="Number of correct samples")
# for i in barplot.containers:
#     barplot.bar_label(i,)

TypeError: unsupported operand type(s) for +=: 'int' and 'str'

In [None]:
# graded['counts'] = counts
# graded.to_json('a.jsonl', lines=True, orient='records')

In [None]:
(pandas.Series(stats.index) * stats.values).sum() / N_JOBS

91.0

## **4. Embedding**
### **4.1 Retrieve cache results (if they exist)**

In [None]:
if not os.path.isfile(EMBEDDING_FILE_PATH): graded.to_json(EMBEDDING_FILE_PATH, orient='records', lines=True)

### **4.2 Initialize all the transformers that will be used**

In [None]:
all_mini_lm_l6_v2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
def sentence_transformer_encode_choices(row : pandas.Series, transformer : SentenceTransformer, transformer_name):
    choices = row[CHOICE_COLUMN]
    for index in range(len(choices)): 
        if embedding_column(index, transformer_name) in set(row.keys()): continue
        row[embedding_column(index, transformer_name)] = transformer.encode(row[sample_column(index)])
    return row

embedding = pandas.read_json(EMBEDDING_FILE_PATH, lines=True)
embedding = embedding.combine_first(graded)

embedding = embedding.apply(lambda row : sentence_transformer_encode_choices(row, all_mini_lm_l6_v2, ALL_MIN_LM_16), axis=1)
embedding.head(1)

Unnamed: 0,question_id,choices,usage,entry_point,test,sample_0,sample_1,sample_2,sample_3,sample_4,...,sample_40__all_min_lm_16,sample_41__all_min_lm_16,sample_42__all_min_lm_16,sample_43__all_min_lm_16,sample_44__all_min_lm_16,sample_45__all_min_lm_16,sample_46__all_min_lm_16,sample_47__all_min_lm_16,sample_48__all_min_lm_16,sample_49__all_min_lm_16
0,HumanEval/3,"[{'message': {'role': 'assistant', 'content': ...","{'prompt_tokens': 155, 'completion_tokens': 25...",below_zero,"\n\nMETADATA = {\n 'author': 'jt',\n 'da...",'''python\nfrom typing import List\n\ndef belo...,'''python\ndef below_zero(operations: List[int...,'''python\nfrom typing import List\n\ndef belo...,'''python\nfrom typing import List\n\ndef belo...,'''python\nfrom typing import List\n\n\ndef be...,...,"[-0.020492185, 0.105527945, -0.037848745, 0.01...","[-0.017801007, 0.09964598, -0.036717366, 0.003...","[-0.017801007, 0.09964598, -0.036717366, 0.003...","[-0.023401706, 0.085186094, -0.01715884, 0.017...","[-0.027135618, 0.0906136, -0.016690323, 0.0282...","[0.0031148298, 0.08860269, 0.023330076, 0.0116...","[-0.023401706, 0.085186094, -0.01715884, 0.017...","[-0.027135618, 0.0906136, -0.016690323, 0.0282...","[-0.017801007, 0.09964598, -0.036717366, 0.003...","[-0.026205802, 0.08687013, -0.045525987, -0.00..."


### **4.3 Cache embeddings**

In [None]:
embedding.to_json(EMBEDDING_FILE_PATH, orient='records', lines=True)

## **5. Sample selection**
### **5.1 Clustering**

In [None]:
all_mini_lm_16_scores = []
average_scores = []

for n_jobs in range(1, N_JOBS + 1):
    def select_by_clustering(row, transformer_name):
        embeddings = []
        for index in range(n_jobs): embeddings.append(row[embedding_column(index, transformer_name)])

        kmeans = KMeans(n_clusters=1, n_init=10)
        kmeans.fit_predict(embeddings)

        centroid = kmeans.cluster_centers_[0]

        min_distance = float('inf')
        min_sample = correct_column(0)

        for index, el in enumerate(embeddings):
            distance = np.linalg.norm(centroid - el)
            if distance < min_distance: 
                min_distance = distance
                min_sample = correct_column(index)
        return row[min_sample]
    
    def get_choice_correct_count(row):
        total = 0
        for index in range(n_jobs): total += row[correct_column(index)]
        return total
     
    clustering = embedding.apply(lambda row : select_by_clustering(row, ALL_MIN_LM_16), axis=1)
    all_mini_lm_16_scores.append(clustering.sum())
    
    average_performance = embedding.apply(lambda row : get_choice_correct_count(row), axis=1).sum() / n_jobs
    average_scores.append(average_performance)

In [None]:
disp = pandas.DataFrame()
disp['number of samples'] = pandas.Series(range(1, N_JOBS + 1))
disp['average'] = pandas.Series(average_scores)
disp[ALL_MIN_LM_16] = pandas.Series(all_mini_lm_16_scores)

seaborn.lineplot(data=disp, x='number of samples', y=ALL_MIN_LM_16, label=f'{ALL_MIN_LM_16}_clustering')
seaborn.lineplot(data=disp, x='number of samples', y='average', label='average')
pyplot.show()

NameError: name 'average_scores' is not defined