# This notebook shows how to test GPT-3.5 and GPT-4 for memorization of tabular datasets

In [2]:
from IPython.display import display, HTML

import tabmemcheck

## Run all the different tests at once
#### For the Iris dataset, all tests indicate contamination

In [3]:
# Run every memorization test in one call: first argument is the dataset CSV,
# second is the identifier of the model under test.
tabmemcheck.run_all_tests(
    'iris.csv',
    'gpt-4o-2024-08-06',
)

[1mInfo: [0mRemoved the few-shot dataset iris.csv because it is similar to the dataset being tested.
[1mDataset: [0miris
[1mModel: [0mgpt-4o-2024-08-06
[1mFew-Shot: [0madult-train, openml-diabetes, uci-wine, california-housing
[1mDataset: [0miris.csv
[1mFeature Names Test: [0m[0;30msepal_length, [0;32ms[0m[0;32me[0m[0;32mp[0m[0;32ma[0m[0;32ml[0m[0;32m_[0m[0;32mw[0m[0;32mi[0m[0;32md[0m[0;32mt[0m[0;32mh[0m[0;32m,[0m[0;32m [0m[0;32mp[0m[0;32me[0m[0;32mt[0m[0;32ma[0m[0;32ml[0m[0;32m_[0m[0;32ml[0m[0;32me[0m[0;32mn[0m[0;32mg[0m[0;32mt[0m[0;32mh[0m[0;32m,[0m[0;32m [0m[0;32mp[0m[0;32me[0m[0;32mt[0m[0;32ma[0m[0;32ml[0m[0;32m_[0m[0;32mw[0m[0;32mi[0m[0;32md[0m[0;32mt[0m[0;32mh[0m[0;32m,[0m[0;32m [0m[0;32ms[0m[0;32mp[0m[0;32me[0m[0;32mc[0m[0;32mi[0m[0;32me[0m[0;32ms[0m[0m[1m
Legend:  [0mPrompt [0;32mCorrect [0;31mIncorrect [0m[0;35mMissing[0m
[1mFile: [0miris.csv
[1mFeature Values 

## The header test asks the LLM to complete the initial rows of a csv file
#### It provides strong evidence of memorization of the UCI Wine dataset

In [2]:
# Header test on the UCI Wine CSV. The three returned values are kept so that
# a later cell can render the prompt and diff the expected completion against
# the model's response.
header_prompt, header_completion, response = tabmemcheck.header_test(
    'uci-wine.csv',
    'gpt-3.5-turbo-0613',
    completion_length=350,
)

[1mHeader Test: [0m[0;30mtarget,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280_od315_of_diluted_wines,proline
1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050
1,13.16,2.36,2.67,18.6,101,2.8,3.24,.[0;32m3[0m[0;31m9[0m[0;32m,[0m[0;32m2[0m[0;32m.[0m[0;32m8[0m[0;32m1[0m[0;32m,[0m[0;31m2[0m[0;31m.[0m[0;31m2[0m[0;31m9[0m[0;31m,[0m[0;32m5[0m[0;32m.[0m[0;32m6[0m[0;32m8[0m[0;32m,[0m[0;32m1[0m[0;32m.[0m[0;32m0[0m[0;32m3[0m[0;32m,[0m[0;32m3[0m[0;32m.[0m[0;32m1[0m[0;32m7[0m[0;32m,[0m[0;32m1[0m[0;32m1[0m[0;32m8[0m[0;32m5[0m[0;32m
[0m[0;32m1[0m[0;32m,[0m[0;32m1[0m[0;32m4[0m[0;32m.[0m[0;32m3[0m[0;32m7[0m[0;32m,[0m[0;32m1[0m[0;32m.[0m[0;32m9[0m[0;32m5[0m[0;32m,[0m[0;32m2[0m[0;32m.[0m[0;32m5[0m[0;32m,[0m[0;32m1[0m[0;32m6[0m[0;3

#### We can visualize the Levenshtein string distance between the actual header and the model completion
##### https://en.wikipedia.org/wiki/Levenshtein_distance

In [8]:
# Show the prompt (newlines converted to HTML line breaks) followed by the
# Levenshtein visualization of the expected completion vs. the model response.
prompt_html = header_prompt.replace('\n', '<br>')
diff_html = tabmemcheck.utils.levenshtein_html(header_completion, response)
display(HTML(prompt_html + diff_html))

## The row completion test asks the LLM to complete random rows of a csv file
#### It provides strong evidence of memorization of the Iris dataset

In [2]:
# Row completion test on Iris with 25 queries. The call returns a pair that is
# unpacked into `rows` and `responses`, which a later cell compares pairwise.
rows, responses = tabmemcheck.row_completion_test(
    'iris.csv',
    'gpt-4-0125-preview',
    num_queries=25,
)

[1mInfo: [0m1.99% of the rows in this dataset are duplicates.
[0;32m5[0m[0;32m,[0m[0;32m3[0m[0;32m.[0m[0;32m5[0m[0;32m,[0m[0;32m1[0m[0;32m.[0m[0;32m3[0m[0;32m,[0m[0;32m0[0m[0;32m.[0m[0;32m3[0m[0;32m,[0m[0;32mI[0m[0;32mr[0m[0;32mi[0m[0;32ms[0m[0;32m-[0m[0;32ms[0m[0;32me[0m[0;32mt[0m[0;32mo[0m[0;32ms[0m[0;32ma[0m
[0;32m5[0m[0;32m.[0m[0;31m9[0m[0;32m,[0m[0;31m3[0m[0;32m.[0m[0;31m2[0m[0;32m,[0m[0;32m4[0m[0;31m.[0m[0;31m8[0m[0;32m,[0m[0;32m1[0m[0;32m.[0m[0;31m8[0m[0;32m,[0m[0;32mI[0m[0;32mr[0m[0;32mi[0m[0;32ms[0m[0;32m-[0m[0;32mv[0m[0;32me[0m[0;32mr[0m[0;32ms[0m[0;32mi[0m[0;32mc[0m[0;32mo[0m[0;32ml[0m[0;32mo[0m[0;32mr[0m
[0;32m6[0m[0;32m.[0m[0;32m9[0m[0;32m,[0m[0;32m3[0m[0;32m.[0m[0;32m2[0m[0;32m,[0m[0;32m5[0m[0;32m.[0m[0;32m7[0m[0;32m,[0m[0;32m2[0m[0;32m.[0m[0;32m3[0m[0;32m,[0m[0;32mI[0m[0;32mr[0m[0;32mi[0m[0;32ms[0m[0;32m-[0m[0;32mv[

In [4]:
# One Levenshtein visualization per (row, response) pair, separated by HTML
# line breaks. `map` with two iterables pairs them positionally and stops at
# the shorter one, exactly like `zip`.
row_diffs = map(tabmemcheck.utils.levenshtein_html, rows, responses)
display(HTML('<br>'.join(row_diffs)))

## The feature completion test asks the LLM to complete the value of a specific feature in a csv file
#### It provides strong evidence of memorization of the Kaggle Titanic dataset

In [6]:
# Kaggle Titanic training CSV. The path is relative (like the other dataset
# arguments in this notebook) instead of pointing into one user's home
# directory, so the cell runs on any machine: place `titanic-train.csv` in the
# working directory or edit this one variable.
titanic_csv = 'titanic-train.csv'

# Feature completion test for the "Name" column with 25 queries. The returned
# pair is unpacked into the feature values and the model responses, which the
# next cell compares pairwise.
feature_values, responses = tabmemcheck.feature_completion_test(
    titanic_csv,
    'gpt-3.5-turbo-0125',
    feature_name='Name',
    num_queries=25,
)

[1mFeature Completion Test ("Name"): [0m[0;30m25/25 exact matches.[0m


In [7]:
# Build one Levenshtein visualization per (feature value, response) pair, then
# show them stacked with HTML line breaks between entries.
name_diffs = [
    tabmemcheck.utils.levenshtein_html(true_value, model_value)
    for true_value, model_value in zip(feature_values, responses)
]
display(HTML('<br>'.join(name_diffs)))

## The first token test asks the LLM to complete the value of the first token in the next row of a csv file

#### It provides no evidence of memorization of the Adult Income dataset

In [3]:
# First token test on the Adult Income training CSV, using 100 queries.
tabmemcheck.first_token_test(
    'adult-train.csv',
    'gpt-3.5-turbo-0125',
    num_queries=100,
)

[1mFirst Token Test: [0m[0;30m37/100 exact matches.
[0m[1mFirst Token Test Baseline (Matches of most common first token): [0m50/100.


## You can see all prompts that are being sent to the model, as well as the raw responses

In [2]:
# Turn on printing of the prompts sent to the model. This assigns an attribute
# on the package's config object, so it stays enabled for later cells in the
# same kernel until it is set back.
tabmemcheck.config.print_prompts = True

# Kaggle Titanic training CSV. The path is relative instead of pointing into
# one user's home directory, so the cell runs on any machine: place
# `titanic-train.csv` in the working directory or edit this one variable.
titanic_csv = 'titanic-train.csv'

# A single query keeps the printed conversation short. The call is left as the
# last expression so that its return value is displayed as the cell output.
tabmemcheck.feature_completion_test(
    titanic_csv,
    'gpt-3.5-turbo-0125',
    feature_name='Name',
    num_queries=1,
)

[1mSystem: [0m[0;34mYou are a helpful autocomplete bot for tabular datasets. You will be prompted with a portion of a sample from a machine learning dataset. Your task is to complete the observation from the dataset. The observations are presented to you in the format "Feature Name = Feature Value".[0m
[1mUser: [0m[0;34mPassengerId = 757, Survived = 0, Pclass = 3, Sex = male, Age = 28.0, SibSp = 0, Parch = 0, Ticket = 350042, Fare = 7.7958, Cabin = nan, Embarked = S[0m
[1mAssistant: [0m[0;34mName = Carlsson, Mr. August Sigfrid[0m
[1mUser: [0m[0;34mPassengerId = 19, Survived = 0, Pclass = 3, Sex = female, Age = 31.0, SibSp = 1, Parch = 0, Ticket = 345763, Fare = 18.0, Cabin = nan, Embarked = S[0m
[1mAssistant: [0m[0;34mName = Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)[0m
[1mUser: [0m[0;34mPassengerId = 730, Survived = 0, Pclass = 3, Sex = female, Age = 25.0, SibSp = 1, Parch = 0, Ticket = STON/O2. 3101271, Fare = 7.925, Cabin = nan, Embarked = S[0m


(['Carr, Miss. Helen "Ellen"'], ['Carr, Miss. Helen "Ellen"'])