# Testing GPT-4o for prior exposure with tabular MLE-bench datasets

### MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering: https://arxiv.org/abs/2410.07095

In [1]:
from IPython.display import display, HTML

import tabmemcheck

import numpy as np

# the absolute path to all csv files in 'home/sebastian/Downloads/MLE-Bench CSV Files'
import os

csv_files = []
for file in os.listdir('/home/sebastian/Downloads/MLE-Bench CSV Files'):
    if file.endswith('.csv'):
        csv_files.append(os.path.join('/home/sebastian/Downloads/MLE-Bench CSV Files', file))
        #print(file)

# remove file at index 9
csv_files.pop(9)

'/home/sebastian/Downloads/MLE-Bench CSV Files/Kaggel billion-word-imputation train_v2.csv'

In [4]:
tabmemcheck.config.print_prompts = False
tabmemcheck.config.print_responses = False

## Motivating Example: GPT-4o has memorized the initial row of Kaggle Titanic

## Header Test: Has GPT-4o memorized the initial row of MLE-Bench datasets?

In [2]:
np.random.seed(0) 

for file in csv_files[:3]:
    tabmemcheck.header_test(file, 'gpt-4o-2024-08-06')
    print(' ')

[1mDataset: [0mKaggel new-york-city-taxi-fare-prediction train.csv[1m
Header Test: [0m[0;30mkey,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2009-06-15 17:26:21.0000001,4.5,[0;32m2[0m[0;32m0[0m[0;32m0[0m[0;32m9[0m[0;32m-[0m[0;32m0[0m[0;32m6[0m[0;32m-[0m[0;32m1[0m[0;32m5[0m[0;32m [0m[0;32m1[0m[0;32m7[0m[0;32m:[0m[0;32m2[0m[0;32m6[0m[0;32m:[0m[0;32m2[0m[0;32m1[0m[0;32m [0m[0;32mU[0m[0;32mT[0m[0;32mC[0m[0;32m,[0m[0;32m-[0m[0;32m7[0m[0;32m3[0m[0;32m.[0m[0;32m8[0m[0;32m4[0m[0;32m4[0m[0;32m3[0m[0;32m1[0m[0;32m1[0m[0;32m,[0m[0;32m4[0m[0;32m0[0m[0;32m.[0m[0;32m7[0m[0;32m2[0m[0;32m1[0m[0;32m3[0m[0;32m1[0m[0;32m9[0m[0;32m,[0m[0;32m-[0m[0;32m7[0m[0;32m3[0m[0;32m.[0m[0;32m8[0m[0;32m4[0m[0;32m1[0m[0;32m6[0m[0;32m1[0m[0;32m,[0m[0;32m4[0m[0;32m0[0m[0;32m.[0m[0;32m7[0m[0;32m1[0m[0;32m2[0m[0;32m2[0m[0;32m7[0

## Row Completion Test: Has GPT-4o memorized random rows of the dataset?

In [2]:
np.random.seed(0)

for file in csv_files[:3]:
    tabmemcheck.row_completion_test(file, 'gpt-4o-2024-08-06', num_queries=3)
    print(' ')

[1mDataset: [0mKaggel new-york-city-taxi-fare-prediction train.csv
[1mInfo: [0mAll the rows in the dataset are unique.
[0;32m2[0m[0;32m0[0m[0;32m1[0m[0;31m0[0m[0;32m-[0m[0;31m0[0m[0;31m6[0m[0;32m-[0m[0;31m1[0m[0;31m0[0m[0;32m [0m[0;32m1[0m[0;31m8[0m[0;32m:[0m[0;31m4[0m[0;31m5[0m[0;32m:[0m[0;31m0[0m[0;31m0[0m[0;32m.[0m[0;32m0[0m[0;32m0[0m[0;32m0[0m[0;32m0[0m[0;32m0[0m[0;32m0[0m[0;31m1[0m[0;32m,[0m[0;31m5[0m[0;32m.[0m[0;31m3[0m[0;32m,[0m[0;32m2[0m[0;32m0[0m[0;32m1[0m[0;31m0[0m[0;32m-[0m[0;31m0[0m[0;31m6[0m[0;32m-[0m[0;31m1[0m[0;31m0[0m[0;32m [0m[0;32m1[0m[0;31m8[0m[0;32m:[0m[0;31m4[0m[0;31m5[0m[0;32m:[0m[0;31m0[0m[0;31m0[0m[0;32m [0m[0;32mU[0m[0;32mT[0m[0;32mC[0m[0;32m,[0m[0;32m-[0m[0;32m7[0m[0;32m3[0m[0;32m.[0m[0;32m9[0m[0;31m8[0m[0;31m7[0m[0;31m6[0m[0;31m5[0m[0;31m4[0m[0;32m,[0m[0;32m4[0m[0;32m0[0m[0;32m.[0m[0;32m7[0m[0;31m4[0m[0;31m8[0m

## Feature Names Test: Does GPT-4o know the names of the features in the dataset?

In [3]:
for file in csv_files:
    tabmemcheck.feature_names_test(file, 'gpt-4o-2024-08-06')
    print(' ')

[1mDataset: [0mKaggel new-york-city-taxi-fare-prediction train.csv[1m
Feature Names: [0mkey, fare_amount, pickup_datetime, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count[1m
Feature Names Test: [0m[0;32mp[0m[0;32mi[0m[0;32mc[0m[0;32mk[0m[0;32mu[0m[0;32mp[0m[0;32m_[0m[0;32md[0m[0;32ma[0m[0;32mt[0m[0;32me[0m[0;32mt[0m[0;32mi[0m[0;32mm[0m[0;32me[0m[0;32m,[0m[0;32m [0m[0;32mp[0m[0;32mi[0m[0;32mc[0m[0;32mk[0m[0;32mu[0m[0;32mp[0m[0;32m_[0m[0;32ml[0m[0;32mo[0m[0;32mn[0m[0;32mg[0m[0;32mi[0m[0;32mt[0m[0;32mu[0m[0;32md[0m[0;32me[0m[0;32m,[0m[0;32m [0m[0;32mp[0m[0;32mi[0m[0;32mc[0m[0;32mk[0m[0;32mu[0m[0;32mp[0m[0;32m_[0m[0;32ml[0m[0;32ma[0m[0;32mt[0m[0;32mi[0m[0;32mt[0m[0;32mu[0m[0;32md[0m[0;32me[0m[0;32m,[0m[0;32m [0m[0;32md[0m[0;32mr[0m[0;32mo[0m[0;32mp[0m[0;32mo[0m[0;32mf[0m[0;32mf[0m[0;32m_[0m[0;32ml[0m[0;32mo[0m[0;32mn[0m

## Feature Values Test: Can GPT-4o respond with realistic feature values?

In [4]:
for file in csv_files:
    tabmemcheck.feature_values_test(file, 'gpt-4o-2024-08-06')
    print(' ')

[1mFeature Values Test
Dataset: [0mKaggel new-york-city-taxi-fare-prediction train.csv
                                                key fare_amount          pickup_datetime pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count
[1mModel Sample[0m    2012-02-23 11:42:00.0000002        12.5      2012-02-23 11:42:00         -73.9874         40.7324          -73.9879          40.7583               1
[1mDataset Match[0m  2014-12-23 17:58:00.00000043        22.5  2014-12-23 17:58:00 UTC        -73.98704        40.76623         -73.99391         40.72313               1
 
[1mFeature Values Test
Dataset: [0mKaggle spooky author identification train.csv
                            id                                               text           author
[1mModel Sample[0m   id26305  "This process, however, afforded me no means o...  Edgar Allan Poe
[1mDataset Match[0m  id24308  No one, however, had been using the telescope ...              HPL
 
[1mFeat

In [6]:
tabmemcheck.config.print_responses = True

In [7]:
csv_files[9]

'/home/sebastian/Downloads/MLE-Bench CSV Files/Kaggel nomad2018-predict-transparent-conductors train.csv'

In [8]:
for file in csv_files:
    tabmemcheck.sample(file, 'gpt-4o-2024-08-06', temperature=0, num_queries=1)
    #print('\n')

Info: Found a CSV file with more than 100000 rows. Note that tabmemcheck is configured to read only the first 100000 rows. Set tabmemcheck.config.csv_max_rows to change this behavior.
[1mResponse: [0m[0;35mkey = 2012-02-15 17:26:21.0000002, fare_amount = 12.5, pickup_datetime = 2012-02-15 17:26:21, pickup_longitude = -73.993896, pickup_latitude = 40.750562, dropoff_longitude = -73.974998, dropoff_latitude = 40.750256, passenger_count = 2[0m
[1mResponse: [0m[0;35mid = id26305, text = "The moon shone down on the path, casting eerie shadows that danced with the wind.", author = Edgar Allan Poe[0m
[1mResponse: [0m[0;35mid = 12345, molecule_name = dsgdb9nsd_000123, atom_index_0 = 1, atom_index_1 = 6, type = 1JHC, scalar_coupling_constant = 84.567[0m
[1mResponse: [0m[0;35mid = 12345, comment_text = "I can't believe you would say something like that. It's completely unacceptable!", toxic = 1, severe_toxic = 0, obscene = 0, threat = 0, insult = 1, identity_hate = 0[0m
[1mRespo

In [10]:
df = tabmemcheck.utils.load_csv_df(csv_files[9])

In [14]:
df.iloc

<pandas.core.indexing._iLocIndexer at 0x796455cefb10>

In [34]:
tabmemcheck.sample(datasets[3], 'gpt-4o-2024-08-06', temperature=0, num_queries=1)

[1mResponse: [0m[0;35mPatient = ID00419637202311204720264, Weeks = 4, FVC = 2670, Percent = 79.8, Age = 68, Sex = Male, SmokingStatus = Ex-smoker[0m


Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,4,2670,79.8,68,Male,Ex-smoker


In [36]:
tabmemcheck.sample(datasets[3], 'gpt-4o-2024-08-06', temperature=1, num_queries=10)

[1mResponse: [0m[0;35mPatient = ID00422637202311677017371, Weeks = 20, FVC = 2670, Percent = 64.3, Age = 69, Sex = Male, SmokingStatus = Ex-smoker[0m
[1mResponse: [0m[0;35mPatient = ID00427637202342189391666, Weeks = 3, FVC = 2774, Percent = 55.2, Age = 79, Sex = Male, SmokingStatus = Ex-smoker[0m
[1mResponse: [0m[0;35mPatient = ID00419637202311204720264, Weeks = 4, FVC = 2348, Percent = 74.3401, Age = 69, Sex = Male, SmokingStatus = Ex-smoker[0m
[1mResponse: [0m[0;35mPatient = ID00422637202312137826377, Weeks = 1, FVC = 2120, Percent = 72.0, Age = 69, Sex = Male, SmokingStatus = Former smoker[0m
[1mResponse: [0m[0;35mPatient = ID00007637202177411956430, Weeks = 4, FVC = 1900, Percent = 67.13, Age = 71, Sex = Male, SmokingStatus = Ex-smoker[0m
[1mResponse: [0m[0;35mPatient = ID00388637202201836583918, Weeks = -4, FVC = 2600, Percent = 55.8, Age = 68, Sex = Male, SmokingStatus = Ex-smoker[0m
[1mResponse: [0m[0;35mPatient = ID00052637202186188008618, Weeks = 5,

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00422637202311677017371,20,2670,64.3,69,Male,Ex-smoker
1,ID00427637202342189391666,3,2774,55.2,79,Male,Ex-smoker
2,ID00419637202311204720264,4,2348,74.3401,69,Male,Ex-smoker
3,ID00422637202312137826377,1,2120,72.0,69,Male,Former smoker
4,ID00007637202177411956430,4,1900,67.13,71,Male,Ex-smoker
5,ID00388637202201836583918,-4,2600,55.8,68,Male,Ex-smoker
6,ID00052637202186188008618,5,2700,88.54,52,Male,Ex-smoker
7,ID0011063720224447687914530803,-12,2710,64.0,79,Male,Ex-smoker
8,ID00419637202311204720264,10,2620,51.882616,70,Male,Ex-smoker
9,ID00421637202311550012437,5,1523,56.0,63,Male,Never smoked
