# We use gpt-4-turbo to perturb the passenger names in the Titanic and Spaceship Titanic datasets

In [1]:
import tabmemcheck
from tabmemcheck import send_chat_completion

from sklearn.model_selection import train_test_split

#### titanic

In [2]:
# Read the Titanic training CSV through tabmemcheck and keep the values of its
# `Name` column, converted with `.tolist()`; the later cells iterate over `names`.
names = tabmemcheck.datasets.load_dataset('../tabular/titanic-train.csv').Name.tolist()

In [4]:
# Ask the LLM for a fictional replacement for every Titanic passenger name and
# collect the results in `mapping` (original name -> replacement).
llm = tabmemcheck.openai_setup('gpt-4-turbo-preview')

tabmemcheck.set_logging_task('titanic-name-replacement')
# Fixed typo: this was spelled `tempertaure`, which assigns a differently named
# attribute. NOTE(review): presumably `temperature` is the setting the library
# reads when sending requests -- confirm against tabmemcheck's config.
tabmemcheck.config.temperature = 0.7

# The instruction is the same for every name, so define the text once.
system_prompt = "You are a helpful assistant. The user provides you with a name, and you replace it with a different, fictional name in the same format that indicates the same nationality, gender and social class."

mapping = {}
for name in names:
    # Each request is an independent two-message conversation.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": name},
    ]
    response = send_chat_completion(llm, messages)
    mapping[name] = response
    # Echo each pair so progress is visible in the cell output.
    print(f"'{name}': '{response}'")


'Braund, Mr. Owen Harris': 'Hawkins, Mr. Edward James'
'Cumings, Mrs. John Bradley (Florence Briggs Thayer)': 'Harrington, Mrs. Charles Edward (Elizabeth Harper Ford)'
'Heikkinen, Miss. Laina': 'Korhonen, Miss. Sari'
'Futrelle, Mrs. Jacques Heath (Lily May Peel)': 'Montclair, Mrs. Henri Lucien (Rose Elise Dubois)'
'Allen, Mr. William Henry': 'Bennett, Mr. Charles Edward'
'Moran, Mr. James': 'Bennett, Mr. William'
'McCarthy, Mr. Timothy J': 'O'Reilly, Mr. Patrick F.'
'Palsson, Master. Gosta Leonard': 'Svensson, Master. Erik Gustav'
'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)': 'Smith, Mrs. Henry T (Charlotte Wilhelmina Lund)'
'Nasser, Mrs. Nicholas (Adele Achem)': 'Almasri, Mrs. Karim (Layla Farid)'
'Sandstrom, Miss. Marguerite Rut': 'Lindberg, Miss. Elisabeth Ingrid'
'Bonnell, Miss. Elizabeth': 'Crawford, Miss. Margaret'
'Saundercock, Mr. William Henry': 'Bramblewood, Mr. Edward Charles'
'Andersson, Mr. Anders Johan': 'Svensson, Mr. Sven Karl'
'Vestrom, Miss. Hulda Amanda Adolfi

In [8]:
# Normalize whitespace in the responses: `str.split()` with no argument splits
# on any run of whitespace (newlines included), so a single split/join pass both
# removes new lines and collapses duplicate spaces.
for name, response in mapping.items():
    # Only values are reassigned; the key set is unchanged, so this is safe
    # while iterating.
    mapping[name] = ' '.join(response.split())

# save mapping dict to yaml
import yaml
with open('titanic_name_mapping.yaml', 'w') as file:
    # `yaml.dump` writes to the given stream and returns None, so there is
    # nothing worth binding to a variable.
    yaml.dump(mapping, file)

#### spaceship titanic

In [17]:
# Load every value of the `Name` column from the Spaceship Titanic training CSV.
# Fixed: the list is now bound to `all_names`, the name the statements below
# actually use (it was previously bound to `names`, leaving `all_names` undefined).
all_names = tabmemcheck.datasets.load_dataset('../tabular/spaceship-titanic-train.csv').Name.tolist()

# we only perturb the names for the 1000 observations that we test the LLM with, and the first 10 names
names, _ = train_test_split(
    all_names, test_size=0.2, random_state=42
)
# Keep the first 1000 names of the shuffled training portion ...
names = names[:1000]
# ... and additionally the first 10 names in original file order.
names.extend(all_names[:10])

In [2]:
# Load every value of the `Name` column from the Spaceship Titanic training CSV.
# NOTE(review): this rebinds `names`, so when the cells are run top to bottom the
# subsample built in the previous cell is replaced by the full column -- confirm
# that this is intended.
names = tabmemcheck.datasets.load_dataset('../tabular/spaceship-titanic-train.csv').Name.tolist()

In [3]:
# Ask the LLM for a fictional replacement for every Spaceship Titanic passenger
# name and collect the results in `mapping` (original name -> replacement).
llm = tabmemcheck.openai_setup('gpt-4-turbo-preview')

tabmemcheck.set_logging_task('spaceship-titanic-name-replacement')
# Fixed typo: this was spelled `tempertaure`, which assigns a differently named
# attribute. NOTE(review): presumably `temperature` is the setting the library
# reads when sending requests -- confirm against tabmemcheck's config.
tabmemcheck.config.temperature = 0.7

# The instruction is the same for every name, so define the text once.
system_prompt = "You are a helpful assistant. The user provides you with a name, and you replace it with a different, fictional name in the same format that indicates the same nationality, gender and social class."

mapping = {}
for name in names:
    # Each request is an independent two-message conversation.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": name},
    ]
    try:
        response = send_chat_completion(llm, messages)
    except Exception as e:
        # Best effort: report the failing name (the recorded output shows a
        # `nan` entry ending up here) and continue with the remaining names.
        print(f"Error with name: {name}.")
        print(e)
    else:
        mapping[name] = response
        # Echo each pair so progress is visible in the cell output.
        print(f"'{name}': '{response}'")


'Maham Ofracculy': 'Sadia Khatun'
'Juanna Vines': 'Mariana Flores'
'Altark Susent': 'Borivik Grentov'
'Solam Susent': 'Lorik Vansen'
'Willy Santantines': 'Billy Montalvo'
'Sandie Hinetthews': 'Lara Finleyson'
'Billex Jacostaffey': 'Willen Jamichester'
'Candra Jacostaffey': 'Lorena Marishire'
'Andona Beston': 'Lorena Weston'
'Erraiam Flatic': 'Lorien Saric'
'Altardr Flatic': 'Borislav Petrovic'
'Wezena Flatic': 'Lorena Zoric'
'Berers Barne': 'Bjorn Borgesson'
'Reney Baketton': 'Lorey Harington'
'Elle Bertsontry': 'Ella Berthamshire'
'Justie Pooles': 'Jasper Hales'
'Flats Eccle': 'Bricks Dunley'
'Carry Hughriend': 'Marry Loufriend'
'Alus Upead': 'Linas Vėjas'
'Lyde Brighttt': 'Ella Shinebright'
'Philda Brighttt': 'Elsie Lightwood'
'Almary Brantuarez': 'Elena Castilione'
'Glendy Brantuarez': 'Lorena Mendivaz'
'Mollen Mcfaddennon': 'Brenna O'Donnellan'
'Breney Jacostanley': 'Bramley Jaceston'
'Mael Brantuarez': 'Lorik Duvallier'
'Terta Mcfaddennon': 'Lorna O'Donnelly'
Error with name: nan.

In [4]:
# Normalize whitespace in the responses: `str.split()` with no argument splits
# on any run of whitespace (newlines included), so a single split/join pass both
# removes new lines and collapses duplicate spaces.
for name, response in mapping.items():
    # Only values are reassigned; the key set is unchanged, so this is safe
    # while iterating.
    mapping[name] = ' '.join(response.split())

# save mapping dict to yaml
import yaml
with open('spaceship_titanic_name_mapping.yaml', 'w') as file:
    # `yaml.dump` writes to the given stream and returns None, so there is
    # nothing worth binding to a variable.
    yaml.dump(mapping, file)