# Testing GPT-4o for prior exposure with MLE-Bench datasets

#### Chan et al. (2024), MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering, https://arxiv.org/abs/2410.07095

In [2]:
from IPython.display import display, HTML

import numpy as np
import os

import tabmemcheck

# display test results as HTML in jupyter notebooks, instead of printing to stdout
tabmemcheck.config.display = "html" 

kaggle_titanic_file = '/home/sebastian/Documents/GitHub/memorization-knowledge-learning/datasets/tabular/titanic-train.csv'

csv_files = []
for file in os.listdir('/home/sebastian/Downloads/MLE-Bench CSV Files'):
    if file.endswith('.csv'):
        csv_files.append(os.path.join('/home/sebastian/Downloads/MLE-Bench CSV Files', file))

In [3]:
# set to True to see the prompts for each test
tabmemcheck.config.print_prompts = False 

## Primer: GPT-4o has memorized Kaggle Titanic

In [2]:
tabmemcheck.header_test(kaggle_titanic_file, 'gpt-4o-2024-08-06', rng=np.random.default_rng(0), return_result=False)

In [12]:
rows, responses = tabmemcheck.row_completion_test(kaggle_titanic_file,
                                                  'gpt-4o-2024-08-06',
                                                   num_queries=10, 
                                                   rng=np.random.default_rng(0))

[1mDataset: [0mtitanic-train.csv
[1mInfo: [0mAll the rows in the dataset are unique.
[0;32m7[0m[0;32m7[0m[0;32m3[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;31m3[0m[0;32m,[0m[0;32m"[0m[0;32mM[0m[0;32ma[0m[0;32mc[0m[0;32mk[0m[0;32m,[0m[0;32m [0m[0;32mM[0m[0;32mr[0m[0;32ms[0m[0;32m.[0m[0;32m [0m[0;32m([0m[0;32mM[0m[0;32ma[0m[0;32mr[0m[0;32my[0m[0;32m)[0m[0;32m"[0m[0;32m,[0m[0;32mf[0m[0;32me[0m[0;32mm[0m[0;32ma[0m[0;32ml[0m[0;32me[0m[0;32m,[0m[0;32m5[0m[0;32m7[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;35mS[0m[0;35m.[0m[0;35mO[0m[0;35m.[0m[0;35m/[0m[0;35mP[0m[0;31m1[0m[0;31m4[0m[0;31m3[0m[0;31m1[0m[0;31m1[0m[0;32m,[0m[0;31m7[0m[0;31m.[0m[0;31m7[0m[0;32m5[0m[0;32m,[0m[0;35mE[0m[0;35m7[0m[0;35m7[0m[0;32m,[0m[0;31mQ[0m
[0;32m3[0m[0;32m0[0m[0;32m0[0m[0;32m,[0m[0;32m1[0m[0;32m,[0m[0;32m1[0m[0;32m,[0m[0;32m"[0m[0;32mB[0m[0;32ma[0m[0;32mx[

In [13]:
# customize this for a better display
display(HTML('<br>'.join([tabmemcheck.utils.levenshtein_html(row, response) for row, response in zip(rows, responses)]) +  '<br>' +  tabmemcheck.utils.levenshtein_html_legend()))

## Has GPT-4o memorized MLE-Bench datasets verbatim?

### Header Test

In [None]:
rng = np.random.default_rng(0)

for file in csv_files:
    tabmemcheck.header_test(file, 'gpt-4o-2024-08-06', rng=rng)
    print(' ')

### Row Completion Test

In [None]:
rng = np.random.default_rng(0)

for file in csv_files:
    tabmemcheck.row_completion_test(file, 'gpt-4o-2024-08-06', num_queries=25, rng=rng)
    print(' ')

In [18]:
rows, responses = tabmemcheck.row_completion_test(csv_files[4],
                                                  'gpt-4o-2024-08-06',
                                                   num_queries=5, 
                                                   rng=np.random.default_rng(0))

[1mDataset: [0mTabular Playground Series Dec 2021.csv
[1mInfo: [0mAll the rows in the dataset are unique.
[0;32m2[0m[0;32m1[0m[0;32m7[0m[0;32m6[0m[0;32m5[0m[0;32m,[0m[0;32m2[0m[0;32m9[0m[0;31m2[0m[0;32m4[0m[0;32m,[0m[0;31m2[0m[0;32m7[0m[0;31m6[0m[0;31m,[0m[0;31m8[0m[0;32m,[0m[0;32m1[0m[0;31m3[0m[0;31m4[0m[0;32m,[0m[0;35m1[0m[0;31m3[0m[0;31m8[0m[0;32m,[0m[0;31m2[0m[0;31m3[0m[0;31m4[0m[0;31m5[0m[0;32m,[0m[0;32m2[0m[0;32m2[0m[0;31m6[0m[0;32m,[0m[0;32m2[0m[0;31m3[0m[0;31m4[0m[0;32m,[0m[0;32m1[0m[0;32m3[0m[0;31m9[0m[0;32m,[0m[0;31m1[0m[0;31m9[0m[0;31m8[0m[0;32m7[0m[0;32m,[0m[0;32m1[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[0;32m,[0m[0;32m0[0m[

In [19]:
display(HTML('<br>'.join([tabmemcheck.utils.levenshtein_html(row, response) for row, response in zip(rows, responses)]) +  '<br>' +  tabmemcheck.utils.levenshtein_html_legend()))

## Feature Names Test: Does GPT-4o know the names of the features in the dataset?

In [5]:
for file in csv_files:
    tabmemcheck.feature_names_test(file, 'gpt-4o-2024-08-06')

## Sampling

In [8]:
tabmemcheck.sample(csv_files[17], 'gpt-4o-2024-08-06', temperature=0.7, num_queries=5)

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,12,2670,49.31,79,Male,Ex-smoker
1,ID00419637202311204720264,5,2928,77.5,73,Male,Former smoker
2,ID00421637202311550012437,4,2346,75.23,69,Male,Ex-smoker
3,ID00419637202311204720264,5,2573,67.91,65,Male,Ex-smoker
4,ID00419637202311204720264,4,2670,54.0,73,Male,Ex-smoker


## Feature Values Test: Can GPT-4o respond with realistic feature values?

In [4]:
for file in csv_files:
    tabmemcheck.feature_values_test(file, 'gpt-4o-2024-08-06')
    print(' ')

[1mFeature Values Test
Dataset: [0mKaggel new-york-city-taxi-fare-prediction train.csv
                                                key fare_amount          pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count
[1mModel Sample[0m    2012-02-23 11:42:00.0000002        12.5      2012-02-23 11:42:00         -73.9874         40.7324          -73.9879          40.7583               1
[1mDataset Match[0m  2014-12-23 17:58:00.00000043        22.5  2014-12-23 17:58:00 UTC        -73.98704        40.76623         -73.99391         40.72313               1
 
[1mFeature Values Test
Dataset: [0mKaggle spooky author identification train.csv
                            id                                               text           author
[1mModel Sample[0m   id26305  "This process, however, afforded me no means o...  Edgar Allan Poe
[1mDataset Match[0m  id24308  No one, however, had been using the telescope ...              HPL
 
[1mFeat

## Dataset Name Test: Does GPT-4o know the name of the dataset from the first rows of the CSV file?

In [4]:
# first rows of the csv file with feature names
for file in csv_files:
    tabmemcheck.dataset_name_test(file, 'gpt-4o-2024-08-06')
    print(' ')

[1mFile: [0mKaggel new-york-city-taxi-fare-prediction train.csv
[1mGenerated Dataset Name: [0mnyc-taxi-fare
 
[1mFile: [0mKaggle spooky author identification train.csv
[1mGenerated Dataset Name: [0mspooky-authors
 
[1mFile: [0mKaggle champs-scalar-coupling train.csv
[1mGenerated Dataset Name: [0mchamps-scalar-coupling
 
[1mFile: [0mKaggle jigsaw-toxic-comment-classification-challenge train.csv
[1mGenerated Dataset Name: [0mjigsaw-toxic-comment-classification
 
[1mFile: [0mKaggle plant-pathology-2020-fgvc7 train.csv
[1mGenerated Dataset Name: [0mplant-pathology-2020-fgvc7
 
[1mFile: [0mKaggle google-quest-challenge train.csv
[1mGenerated Dataset Name: [0mgoogle-quest-challenge
 
[1mFile: [0mKaggle Tabular Playground Series May 2022.csv
[1mGenerated Dataset Name: [0mkaggle-tabular-playground-series-aug-2021
 
[1mFile: [0mKaggle iMet Collection 2020 - FGVC7 train.csv
[1mGenerated Dataset Name: [0mkaggle-avito-demand-prediction
 
[1mFile: [0mKaggle PetFind

In [3]:
# first rows of the csv file without the feature names
for file in csv_files:
    tabmemcheck.dataset_name_test(file, 'gpt-4o-2024-08-06', header=False)
    print(' ')

Info: Found a CSV file with more than 100000 rows. Note that tabmemcheck is configured to use only the first 100000 rows. Set tabmemcheck.config.csv_max_rows to change this behavior.
[1mFile: [0mKaggel new-york-city-taxi-fare-prediction train.csv
[1mGenerated Dataset Name: [0mnyc-taxi-trips
 
[1mFile: [0mKaggle spooky author identification train.csv
[1mGenerated Dataset Name: [0mspooky-author-identification
 
[1mFile: [0mKaggle champs-scalar-coupling train.csv
[1mGenerated Dataset Name: [0mqm9
 
[1mFile: [0mKaggle jigsaw-toxic-comment-classification-challenge train.csv
[1mGenerated Dataset Name: [0mwikipedia-detox
 
[1mFile: [0mKaggle plant-pathology-2020-fgvc7 train.csv
[1mGenerated Dataset Name: [0motto-group-product-classification-challenge
 
[1mFile: [0mKaggle google-quest-challenge train.csv
[1mGenerated Dataset Name: [0mstack-overflow-questions
 
[1mFile: [0mKaggle Tabular Playground Series May 2022.csv
[1mGenerated Dataset Name: [0mhiggs
 
[1mFile: 

In [4]:
# a single random row from the csv file
for file in csv_files:
    tabmemcheck.dataset_name_test(file, 'gpt-4o-2024-08-06', header=False, num_rows=1, random_rows=True, rng=np.random.default_rng(0))
    print(' ')

[1mFile: [0mspooky author identification train.csv
[1mGenerated Dataset Name: [0mkaggle-spooky-author-identification
 
[1mFile: [0mRANZCR CLiP - Catheter and Line Position Challenge train.csv
[1mGenerated Dataset Name: [0mrsna-pneumonia-detection-challenge
 
Info: Found a CSV file with more than 100000 rows. Note that tabmemcheck is configured to use only the first 100000 rows. Set tabmemcheck.config.csv_max_rows to change this behavior.
[1mFile: [0mnew-york-city-taxi-fare-prediction train.csv
[1mGenerated Dataset Name: [0mnyc-taxi-trips
 
[1mFile: [0mplant-pathology-2020-fgvc7 train.csv
[1mGenerated Dataset Name: [0mtitanic
 
[1mFile: [0mTabular Playground Series Dec 2021.csv
[1mGenerated Dataset Name: [0mkdd-cup-1999
 
[1mFile: [0mTabular Playground Series May 2022.csv
[1mGenerated Dataset Name: [0mcovertype
 
[1mFile: [0miWildCam 2019 - FGVC6 train.csv
[1mGenerated Dataset Name: [0miwildcam-2020
 
[1mFile: [0mgoogle-quest-challenge train.csv
[1mGenerat