In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random

directory = "../data/"

collection = []
for filename in os.listdir(directory):
    if filename.endswith("_results.jsonl"):
#        print(filename)
        df = pd.read_json(directory + filename, lines=True)
        df['model'] = filename.replace(".jsonl_results.jsonl", "").replace("samples_", "")
        collection.append(df)

df = pd.concat(collection)
# Ensure the 'passed' column is boolean for correct processing.
df['passed'] = df['passed'].astype(bool)
# shorten task name
df['task_id'] = [str(t).replace("../test_cases/", "").replace(".ipynb","") for t in df['task_id']]

def print_no_comments(filt, show=True):
    """
    Splits a string into lines, and only prints the lines that do not start with a python comment or backticks.
    """
    all_lines = []
    for line in filt.split("\n"):
        if not line.strip(" ").startswith("#") and not line.strip(" ").startswith("`") and not line.strip(" ") == '':
            if show==True:
                print(line)
            all_lines.append(line)
    string_rep = '\n'.join(all_lines)
    return(string_rep)

def filter_df(df, task, model=None):
    """
    Filters a dataframe according to the task_id supplied, and if there's a model supplied, will also filter by model.
    """
    if model == None:
        df_filt = df[df["task_id"] == task]
    elif any(df["model"].str.contains(model)):
        df_filt = df[(df["task_id"] == task) & (df["model"] == model)]
    else: 
        df_filt = "error" 
    return(df_filt)

def assess_performance(df, task, model=None, show=False, comments=False):
    """
    Assesses performance of the task+model, printing out which instances passed and failed, and the LLM's python code which was responsible.
    """
    df_filt = filter_df(df, task, model)
    df_task = filter_df(df, task)
    
    print(f"Testing task:\t\t\t {task}")
    print(f"Testing model:\t\t\t {model}")
    print(f"Number of times passed:\t\t {sum(df_filt['passed'])} / {len(df_filt['passed'])}")
    print(f"Model score:\t\t\t {round(sum(df_filt['passed']) / len(df_filt['passed'])*100,1)}%")
    print(f"Expected score:\t\t\t {round(sum(df_task['passed']) / len(df_task['passed'])*100,1)}%")
    
    print("<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>")
    for enum, (filt, pf) in enumerate(zip(df_filt["full_response"], df_filt["passed"])):
        if (pf == False):
            if comments == True:
                print(f"Passed trial {str(enum)}? \x1b[1;31m{str(pf)}\x1b[0m \n {filt} \n")
            else:
                print_no_comments(f"Passed trial {str(enum)}? \x1b[1;31m{str(pf)}\x1b[0m \n {filt} \n", show=show)
        else:
            if comments == True:
                print(f"Passed trial {str(enum)}? \x1b[1;32m{str(pf)}\x1b[0m \n {filt} \n")
            else:
                print_no_comments(f"Passed trial {str(enum)}? \x1b[1;32m{str(pf)}\x1b[0m \n {filt} \n", show=show)        
        if show==True:
            print("")
    return(df_filt)

def get_LLM_function(completion):
    """
    Attempt to parse each completion, importing necessarily libraries, and return an executable function.

    Note: If you try to run the function, it will run. This means that you'll be running LLM code locally. So beware!
    """
    curr_fn = str(print_no_comments(completion,show=False))
    namespace = {}
    exec(curr_fn, namespace, namespace)
    func_name = [name for name in namespace if callable(namespace[name])][0]
    func = namespace[func_name]
    return(func)

random_task = random.choice(df["task_id"].unique())
random_model = random.choice(df["model"].unique())

# df_filt = assess_performance(df, 
#                          task=random_task, 
#                          model=random_model)
print("Model choices:\n", df["model"].unique(),"\n")

df_filt = assess_performance(df, 
                             task="crop_quarter_image", 
                             model="llama3-8b-instruct-fp16",
                             show=True,
                             comments=True)

Model choices:
 ['gemini-1.5-flash-001' 'llama3-70b-instruct-q4_0' 'gpt-4-1106-preview'
 'codegemma-7b-instruct-fp16' 'codellama-70b-instruct-q4_0'
 'claude-3-5-sonnet-20240620' 'gpt-4o-2024-05-13' 'claude-3-opus-20240229'
 'gpt-4-turbo-2024-04-09' 'mixtral-8x7b-instruct-v0.1-q5_0'
 'command-r-plus-104b-q4_0' 'gemini-pro' 'gpt-3.5-turbo-1106'
 'mixtral-8x22b-instruct-v0.1-q4_0' 'llama3-8b-instruct-fp16' 'reference'
 'llama3-70b-instruct-q8_0' 'codellama' 'phi3-3.8b-mini-instruct-4k-fp16'] 

Testing task:			 crop_quarter_image
Testing model:			 llama3-8b-instruct-fp16
Number of times passed:		 2 / 10
Model score:			 20.0%
Expected score:			 40.0%
<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
Passed trial 0? [1;31mFalse[0m 
 Here is the complete code:

```Python
# Plan:
# 1. Check if input is an image
# 2. If it's an image, get its dimensions and calculate half of each dimension
# 3. Use Pillow library to crop the image
# 4. Return the cropped image

import PIL.Image

In [3]:
#print(df_filt["completion"].iloc[14])
len(df)

10830

In [4]:
# df_filt = filter_df(df, task="sum_images") # Testing a specific filter

df_filt = df # No filter

undefined_python_enums = []
model_count = Counter()

for enum, resp in enumerate(df_filt["completion"]):
    try:
        func= get_LLM_function(df_filt["completion"].iloc[enum])
#        print(func)
#        print(func(coins(), coins())) # Beware! Do not uncomment this line unless you're comfortable running arbitrary LLM code! 
    except Exception as error:
        # Error evaluation
        # print("Model="+df_filt["model"].iloc[enum])
        # print("Task="+df_filt["task_id"].iloc[enum])
        # print("\tAn error occurred:", error)
        if str(error) == "name 'Python' is not defined":
            # 'Python not defined' error evaluation
            undefined_python_enums.append(enum)
            model_count[df_filt["model"].iloc[enum]] += 1
#    print('\n')
#    if enum == 5:
#         break


2024-07-12 14:58:55.007225: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-12 14:58:55.007253: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-12 14:58:55.008074: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-12 14:58:55.012945: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


    ID   Name   Age    City
0  101   John  35.0  London
1  102  Sarah  40.0     NaN
2  103    NaN   NaN   Paris
     A    B
A  1.0  1.0
B  1.0  1.0


[ WARN:0@3.184] global loadsave.cpp:248 findDecoder imread_('your_image_file.png'): can't open/read file: check file path/integrity
[ WARN:0@3.231] global loadsave.cpp:248 findDecoder imread_('sunflower.jpg'): can't open/read file: check file path/integrity
[ WARN:0@3.233] global loadsave.cpp:248 findDecoder imread_('path/to/image'): can't open/read file: check file path/integrity
[ WARN:0@3.235] global loadsave.cpp:248 findDecoder imread_('binary_image.jpg'): can't open/read file: check file path/integrity
[ WARN:0@3.238] global loadsave.cpp:248 findDecoder imread_('input.png'): can't open/read file: check file path/integrity
[ WARN:0@3.244] global loadsave.cpp:248 findDecoder imread_('input_image.png'): can't open/read file: check file path/integrity


oeWoWoldeeH
[0 0]


[ WARN:0@3.644] global loadsave.cpp:248 findDecoder imread_('path/to/input_image1'): can't open/read file: check file path/integrity
[ WARN:0@3.645] global loadsave.cpp:248 findDecoder imread_('path/to/input_image2'): can't open/read file: check file path/integrity
[ WARN:0@3.649] global loadsave.cpp:248 findDecoder imread_('path/to/image.png'): can't open/read file: check file path/integrity
[ WARN:0@3.654] global loadsave.cpp:248 findDecoder imread_('image1.png'): can't open/read file: check file path/integrity
[ WARN:0@3.654] global loadsave.cpp:248 findDecoder imread_('image2.png'): can't open/read file: check file path/integrity
[ WARN:0@3.663] global loadsave.cpp:248 findDecoder imread_('path/to/image'): can't open/read file: check file path/integrity


[4 5 6]
[4 5 6]
53
0.0
(34.0, 9.16515138991168)
[]
   column1  column2  mean                  diff
0        1        2   2.5  [[1], [1], [1], [1]]
1        2        3   3.5                   NaN
2        3        4   NaN                   NaN
3        4        5   NaN                   NaN
[0 3 1]
None
   column1  column2  mean  diff
0        1        4   7.0     3
1        2        5   7.0     3
2        3        6   7.0     3
nan


[ WARN:0@4.403] global loadsave.cpp:248 findDecoder imread_('image.jpg'): can't open/read file: check file path/integrity
[ WARN:0@4.419] global loadsave.cpp:248 findDecoder imread_('image.jpg'): can't open/read file: check file path/integrity
[ WARN:0@4.421] global loadsave.cpp:248 findDecoder imread_('image.jpg'): can't open/read file: check file path/integrity
[ WARN:0@4.431] global loadsave.cpp:248 findDecoder imread_('image.jpg'): can't open/read file: check file path/integrity
[ WARN:0@4.448] global loadsave.cpp:248 findDecoder imread_('example.jpg'): can't open/read file: check file path/integrity
[ WARN:0@4.451] global loadsave.cpp:248 findDecoder imread_('image.jpg'): can't open/read file: check file path/integrity
[ WARN:0@4.471] global loadsave.cpp:248 findDecoder imread_('image.jpg'): can't open/read file: check file path/integrity


In [5]:
# Showing failing models
model_count

Counter({'llama3-8b-instruct-fp16': 244,
         'llama3-70b-instruct-q8_0': 92,
         'llama3-70b-instruct-q4_0': 64})

In [6]:
# Note that the error message indicates the distance between 'Python' and the first callable? I think?
np.where(df.iloc[undefined_python_enums]["result"] =="failed: name 'Python' is not defined")

(array([209]),)

In [7]:
print(df.iloc[undefined_python_enums]["completion"].iloc[209])


# Step 1: Import necessary libraries for image processing
# Step 2: Define a function that takes an input image and a mask image, 
#         then applies the mask to the input image and returns the result.

Python
from PIL import Image


def mask_image(image, mask):
    # Step 1: Open both images as PIL Images
    img = Image.open(image)
    msk = Image.open(mask)

    # Step 2: Ensure the images are of the same size
    if not img.size == msk.size:
        raise ValueError("Image and mask must be the same size")

    # Step 3: Apply the mask to the image
    output_image = img.copy()
    for x in range(img.width):
        for y in range(img.height):
            pix = output_image.getpixel((x,y))
            if msk.getpixel((x,y)) > 128:
                output_image.putpixel((x,y),(0,0,0)) # Replace with any default color
            else:
                output_image.putpixel((x,y),pix) # Keep the original pixel

    return output_image




In [8]:
# Showing invalid syntax results, 
Counter(df.iloc[undefined_python_enums]["result"])

Counter({'failed: invalid syntax (<string>, line 4)': 252,
         'failed: invalid syntax (<string>, line 5)': 105,
         'failed: invalid syntax (<string>, line 6)': 21,
         'failed: invalid syntax (<string>, line 10)': 8,
         'failed: invalid syntax (<string>, line 7)': 7,
         'failed: invalid syntax (<string>, line 8)': 4,
         'failed: invalid syntax (<string>, line 9)': 2,
         "failed: name 'Python' is not defined": 1})

In [9]:
Counter(df.iloc[undefined_python_enums]["passed"])

Counter({False: 400})

In [10]:
# errs = []
# for enum in undefined_python_enums:
#     completion = df.iloc[enum]["completion"]
#     lines = print_no_comments(completion, show=False).split("def")[0].split("\n")
#     errs.append(len(lines))
#     print(lines)
# print(Counter(errs))

In [12]:
#completion.split("def")[0].split("\n")

In [13]:
# Code for inspecting the benchmarks themselves. Incomplete for now.

# import nbformat as nbf
# case_dir = "../test_cases"
# cell_count = Counter()
# line_count = Counter()

# for k in os.listdir(case_dir):
#     if k.endswith("ipynb"):
#         ntbk = nbf.read(os.path.join(case_dir, k), nbf.NO_CONVERT)
#         k = k.replace(".ipynb","")
#         fn = ntbk["cells"][0]["source"].split("\n")[0].split('def ')[1]
#         source = ntbk["cells"][1]["source"]
#         #print("".join(source.split('"""')[-1]))
#         print(fn)
#         print(source)

In [2]:
# Pretty printing with colors!
# print('\x1b[1;31m'+'Hello world'+'\x1b[0m')

[1;31mHello world[0m
