# Dataset creation

In [None]:
from names_dataset import NameDataset

nd = NameDataset()

In [40]:
sum(nd.search("Angel")["first_name"]["country"].values())

1.0

In [1]:
import pandas as pd

df_es = pd.read_csv("./name_dataset/ES.csv", names = ["first_name", "last_name", "gender", "alpha2"])

In [2]:
len(df_es)

10891211

In [3]:
df_es.dropna(subset = ["first_name", "last_name"], inplace=True)
len(df_es)

10858177

In [None]:
from unidecode import unidecode

df_es["first_name_unidecoded"] = df_es.first_name.apply(unidecode)
df_es["last_name_unidecoded"] = df_es.last_name.apply(unidecode)

In [66]:
import nest_asyncio
import asyncio
import time
from openai import AsyncOpenAI

# Patch asyncio for Jupyter
nest_asyncio.apply()

# Initialize the async OpenAI client
client = AsyncOpenAI()

async def get_completion(i):
    start_time = time.time()
    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": f"Hello, this is request {i}."}]
    )
    elapsed_time = time.time() - start_time
    return f"Request {i}: {response.choices[0].message.content} (Time taken: {elapsed_time:.2f} sec)"

async def main():
    start = time.time()
    tasks = [get_completion(i) for i in range(4)]
    results = await asyncio.gather(*tasks)
    total_time = time.time() - start
    
    for result in results:
        print(result)
    
    print(f"\nTotal elapsed time: {total_time:.2f} sec")

# Run the async function in Jupyter
await main()

Request 0: Hello! How can I assist you with your request today? (Time taken: 1.52 sec)
Request 1: Hello! How can I assist you with your request? (Time taken: 1.13 sec)
Request 2: Hello! How can I assist you today with your request? (Time taken: 1.15 sec)
Request 3: Hello! How can I assist you today with your request? (Time taken: 1.30 sec)

Total elapsed time: 1.52 sec


In [61]:
import nest_asyncio
nest_asyncio.apply()

In [91]:
import asyncio
from openai import AsyncOpenAI

client = AsyncOpenAI()

async def fetch_canonical_name(prompt: str) -> str:
    """Calls GPT to return the canonical version from a list of name variations."""
    start_time = time.time()
    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are an expert name corrector. Respond only with the right name from the options provided"},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=10,
    )
    elapsed_time = time.time() - start_time
    canonical_name = response.choices[0].message.content.strip()
    print(f"{canonical_name} ({elapsed_time:.2f} sec)")
    return canonical_name

def get_name_prompts(df, name_column, unidecoded_column, threshold_pctg = 0.001, group_threshold_pctg = 0.01):
    """
    Extracts names and prepares prompts.
    
    For each unidecoded name that appears more than a threshold (1% of total rows),
    it collects variations from the specified name_column using a sub-threshold (1% of group count).
    Only if there are multiple relevant variations, it returns a prompt.
    
    Returns a list of tuples: (unidecoded name, prompt)
    """
    threshold = threshold_pctg * len(df)
    counts = df[unidecoded_column].value_counts()
    selected_names = counts[counts > threshold].index

    prompts = []
    # Group once to avoid repeated heavy extraction
    groups = df.groupby(unidecoded_column)[name_column]
    for uname in selected_names:
        group = groups.get_group(uname)
        var_counts = group.value_counts()
        sub_threshold = group_threshold_pctg * group.count()
        variations = var_counts[var_counts >= sub_threshold].index.tolist()
        if len(variations) > 1:
            prompt = f"Given the following variations: {', '.join(variations)}, what is the correct accepted version in Spanish?"
            prompts.append((uname, prompt))
    return prompts

async def run_api_calls(prompts):
    """
    Runs GPT calls concurrently for each prompt.
    
    Expects a list of tuples (unidecoded name, prompt) and returns a mapping:
    {unidecoded name: canonical name}
    """
    start = time.time()
    tasks = [fetch_canonical_name(prompt) for _, prompt in prompts]
    responses = await asyncio.gather(*tasks)
    total_time = time.time() - start
    print(f"\nTotal elapsed time: {total_time:.2f} sec")
    return {uname: canonical for (uname, _), canonical in zip(prompts, responses)}

# Process first names
prompts_first = get_name_prompts(df_es, "first_name", "first_name_unidecoded")
name_map_first = await run_api_calls(prompts_first)
df_es['first_name_corrected'] = df_es['first_name_unidecoded'].map(name_map_first).fillna(df_es['first_name'])

# Process surnames similarly
prompts_surname = get_name_prompts(df_es, "last_name", "last_name_unidecoded", 0.0001, 0.01)
name_map_surname = await run_api_calls(prompts_surname)
df_es['last_name_corrected'] = df_es['last_name_unidecoded'].map(name_map_surname).fillna(df_es['last_name'])

Julia (0.43 sec)
Antonia (0.45 sec)
María (0.48 sec)
Inés (0.42 sec)
Miguel Ángel (0.48 sec)
Héctor (0.44 sec)
Mónica (0.48 sec)
Begoña (0.45 sec)
Silvia (0.49 sec)
Álex (0.50 sec)
Lidia (0.47 sec)
Belén (0.48 sec)
Iván (0.52 sec)
Ángela (0.49 sec)
Raúl (0.52 sec)
Víctor (0.52 sec)
José Manuel (0.52 sec)
Rocío (0.52 sec)
Lucía (0.51 sec)
Gloria (0.49 sec)
César (0.49 sec)
Ana María (0.51 sec)
José (0.56 sec)
Ramón (0.53 sec)
Joaquín (0.53 sec)
Juan José (0.53 sec)
Rosa María (0.52 sec)
Miriam (0.53 sec)
Sonia (0.56 sec)
Núria (0.57 sec)
Félix (0.54 sec)
Julián (0.55 sec)
Toñi (0.58 sec)
Óscar (0.61 sec)
Claudia (0.59 sec)
Verónica (0.63 sec)
Adrián (0.66 sec)
María Jesús (0.64 sec)
Tomás (0.64 sec)
José Luis (0.70 sec)
Jesús (0.74 sec)
Ángel (0.74 sec)
Ángeles (0.71 sec)
Andrés (0.75 sec)
Agustín (0.74 sec)
Natalia (0.78 sec)
José María (0.78 sec)
Fátima (0.78 sec)
María José (0.91 sec)
Álvaro (0.96 sec)
José Antonio (1.04 sec)
Rubén (6.11 sec)

Total elapsed time: 6.13 sec
Fernández (

In [92]:
((df_es.first_name != df_es.first_name_corrected )| (df_es.last_name != df_es.last_name_corrected)).sum()

np.int64(2171341)

### Cleaning names which are not common

In [102]:
names = df_es.first_name_corrected.value_counts()


In [115]:
total_count = names.sum()

# Calculate the percentage of data left for each threshold from 1 to 10
percentages = {}
for i in range(1, 20):
    filtered_sum = names[names >= i].sum()
    percentages[i] = (filtered_sum / total_count) * 100

# Convert to DataFrame for better visualization
percentages_df = pd.DataFrame(list(percentages.items()), columns=["Threshold", "Percentage Left"])

# Display the DataFrame
print(percentages_df)

    Threshold  Percentage Left
0           1       100.000000
1           2        95.599749
2           3        94.527820
3           4        93.910783
4           5        93.457926
5           6        93.090415
6           7        92.772461
7           8        92.500150
8           9        92.257973
9          10        92.036582
10         11        91.838391
11         12        91.657964
12         13        91.484565
13         14        91.328084
14         15        91.184966
15         16        91.043229
16         17        90.903832
17         18        90.772475
18         19        90.654113


In [136]:
# We filter only by names with more than 11 appearances
valid_names = names[names >= 11].index
print(valid_names)

Index(['José', 'María', 'Antonio', 'Ana', 'Juan', 'Carmen', 'Manuel', 'David',
       'Carlos', 'Javier',
       ...
       'Holga', 'Ramone', 'Horus', 'Faru', 'Elena Beatriz', 'Wale',
       'Juana Del Carmen', 'Luisa Pilar', 'Adriam', 'Pepe Toni'],
      dtype='object', name='first_name_corrected', length=27266)


In [133]:
df_es = df_es[df_es.first_name_corrected.isin(valid_names)]

In [137]:
# We filter out rows in which surname and name are exactly the same
df_es = df_es[df_es.first_name != df_es.last_name]

In [138]:
# We get the DataFrame with the corrected names and the other desired columns
df_es = df_es[['first_name_corrected', 'last_name_corrected', 'gender', 'alpha2']].copy()

# Rename the corrected columns to the standard names
df_es.rename(columns={
    'first_name_corrected': 'first_name',
    'last_name_corrected': 'last_name'
}, inplace=True)

# Optionally, inspect the new DataFrame
print(df_es.columns)

Index(['first_name', 'last_name', 'gender', 'alpha2'], dtype='object')


## Gender assignation
We assign gender to the names without by computing the most common gender for each name and filling the values

In [154]:
# Count occurrences of each (first_name, gender) pair
df_counts = df_es.groupby(["first_name", "gender"]).size().unstack(fill_value=0)

# Ensure the columns are treated as numeric
df_counts = df_counts.astype(int)


# Compute the dominant ratio (larger count divided by the smaller count)
df_counts["dominant_ratio"] = df_counts.max(axis=1) / df_counts.min(axis=1).replace(0, 1)  # Avoid division by zero

# Compute the dominant gender
df_counts["dominant_gender"] = df_counts[["M", "F"]].idxmax(axis=1)

# Reset index for a cleaner DataFrame
df_counts = df_counts.reset_index()

# Sort by ratio in descending order
df_counts = df_counts.sort_values(by="dominant_ratio", ascending=False)

# Display the result
df_counts

gender,first_name,F,M,dominant_ratio,dominant_gender
10493,Iratxe,1228,1,1228.0,F
7446,Estibaliz,1227,1,1227.0,F
433,Adrià,2,1689,844.5,M
15418,Maialen,843,1,843.0,F
11884,Jorge Luis,0,811,811.0,M
...,...,...,...,...,...
3485,Blue,3,3,1.0,M
3434,Blair,5,5,1.0,M
23587,Snoopy,6,6,1.0,M
25078,Valentine,14,14,1.0,M


In [157]:
dominant_gender_names = df_counts[df_counts.dominant_ratio > 5].set_index("first_name").dominant_gender.to_dict()

In [159]:
df_es["gender"] = df_es["gender"].fillna(df_es["first_name"].map(dominant_gender_names))

In [167]:
# There are some rows without gender but we can drop them safely
df_es.gender.isna().sum()

df_es.dropna(inplace = True)

In [168]:
# We have now cleaned the dataset. We remain with 90% of the data.
df_es.shape[0]/10891211

0.9089856031620359

In [169]:
df_es.to_parquet("./df_es_clean.parquet")

## Creating Training dataset

The goal here is to build a model which can 
- reconstruct names from misspelled forms (oscar -> Óscar)
- separate first_name from last_name (oscar sanchez -> Óscar, Sánchez
- estimate the gender given name (oscar sanchez) -> M

The input of the model is a string, the output is a JSON of the form

```
input: "oscar sanchez"
output:
{
    "first_name" : "Óscar",
    "last_name" : "Sánchez",
    "gender" : "M"
}
```

In [174]:
print(df_es.sample(4))

        first_name        last_name gender alpha2
2267206   Gabriela         Mircheva      F     ES
5196677      María            Otero      F     ES
8069579   Fernando  Rodríguez Tapia      M     ES
5027431     Carlos          Roberto      M     ES


In [1]:
import pandas as pd
import random
import json
from unidecode import unidecode

# Transformation functions.
def transform_correct(text):
    return text

def transform_remove_accents_and_lower(text):
    return unidecode(text).lower()

def transform_insert_extra_spaces(text, extra_space_range=(1, 3)):
    words = text.split()
    return " ".join(word + " " * random.randint(*extra_space_range) for word in words).strip()

def transform_random_uppercase(text, uppercase_prob=0.1):
    noisy_chars = []
    for char in text:
        if char.isalpha() and random.random() < uppercase_prob:
            noisy_chars.append(char.upper())
        else:
            noisy_chars.append(char)
    return "".join(noisy_chars)

def transform_duplicate_pair(text):
    # Duplicate the full name, e.g. "Jaime Pérez" -> "Jaime Pérez Jaime Pérez"
    return f"{text} {text}"

def transform_no_space(text):
    # Remove the space between first and last names.
    return "".join(text.split())

def transform_first_name_only(sample):
    # Return only the first name.
    return sample['first_name']
    
def transform_last_name_only(sample):
    # Return only the last name.
    return sample['last_name']

def transform_duplicates_end_character(sample):
    # maria -> mariaa ; Juan Luis -> Juan Luiss
    return sample['first_name'] + sample['first_name'][-1]
    

def generate_noisy_inputs(sample, transform_probs):
    full_name = f"{sample['first_name']} {sample['last_name']}"
    outputs = []
    # Each effect is applied independently.
    DEFAULT_PROB = 0.01
    if random.random() < transform_probs.get('correct', DEFAULT_PROB):
        outputs.append(('correct', transform_correct(full_name)))
    if random.random() < transform_probs.get('remove_accents_and_lower', DEFAULT_PROB):
        outputs.append(('remove_accents_and_lower', transform_remove_accents_and_lower(full_name)))
    if random.random() < transform_probs.get('insert_extra_spaces', DEFAULT_PROB):
        outputs.append(('insert_extra_spaces', transform_insert_extra_spaces(full_name)))
    if random.random() < transform_probs.get('random_uppercase', DEFAULT_PROB):
        outputs.append(('random_uppercase', transform_random_uppercase(full_name)))
    if random.random() < transform_probs.get('duplicate_pair', DEFAULT_PROB):
        outputs.append(('duplicate_pair', transform_duplicate_pair(full_name)))
    if random.random() < transform_probs.get('no_space', DEFAULT_PROB):
        outputs.append(('no_space', transform_no_space(full_name)))
    if random.random() < transform_probs.get('first_name_only', DEFAULT_PROB):
        outputs.append(('first_name_only', transform_first_name_only(sample)))
    if random.random() < transform_probs.get('last_name_only', DEFAULT_PROB):
        outputs.append(('last_name_only', transform_last_name_only(sample)))
    if random.random() < transform_probs.get('transform_duplicates_end_character',DEFAULT_PROB):
        outputs.append(('transform_duplicates_end_character', transform_last_name_only(sample)))    
    return outputs

def create_training_examples(row, transform_probs):
    noisy_versions = generate_noisy_inputs(row, transform_probs)
    examples = []
    for trans_name, noisy_input in noisy_versions:
        # Adjust target based on the transformation.
        if trans_name == 'first_name_only':
            target = {"first_name": row["first_name"], "last_name": None, "gender": row["gender"]}
        elif trans_name == 'last_name_only':
            target = {"first_name": None, "last_name": row["last_name"], "gender": row["gender"]}
        else:
            target = {"first_name": row["first_name"], "last_name": row["last_name"], "gender": row["gender"]}
        target_json = json.dumps(target, ensure_ascii=False)
        examples.append({"input": noisy_input, "output": target_json, "transformation": trans_name})
    return examples


df_es = pd.read_parquet("./df_es_clean.parquet")

# Define transformation probabilities.
transform_probs = {
    "correct": 1.0,                   
    "remove_accents_and_lower": 0.7,    
    "insert_extra_spaces": 0.3,         
    "random_uppercase": 0.1,            
    "duplicate_pair": 0.1,              
    "no_space": 0.3,
    "first_name_only": 0.2,
    "last_name_only": 0.2, 
    "transform_duplicates_end_character" : 0.05
}

# Create training examples for all samples.
all_examples = []
for _, row in df_es.iterrows():
    all_examples.extend(create_training_examples(row, transform_probs))

# Print results.
for example in all_examples[:10]:
    print("Transformation:", example["transformation"])
    print("Input:", example["input"])
    print("Output:", example["output"])
    print("-" * 50)

Transformation: correct
Input: Alberto Cea
Output: {"first_name": "Alberto", "last_name": "Cea", "gender": "M"}
--------------------------------------------------
Transformation: remove_accents_and_lower
Input: alberto cea
Output: {"first_name": "Alberto", "last_name": "Cea", "gender": "M"}
--------------------------------------------------
Transformation: insert_extra_spaces
Input: Alberto   Cea
Output: {"first_name": "Alberto", "last_name": "Cea", "gender": "M"}
--------------------------------------------------
Transformation: last_name_only
Input: Cea
Output: {"first_name": null, "last_name": "Cea", "gender": "M"}
--------------------------------------------------
Transformation: correct
Input: Adrián Gómez
Output: {"first_name": "Adrián", "last_name": "Gómez", "gender": "M"}
--------------------------------------------------
Transformation: random_uppercase
Input: AdriáN GÓmez
Output: {"first_name": "Adrián", "last_name": "Gómez", "gender": "M"}
-----------------------------------

In [4]:
len(all_examples)


29204113

In [5]:
augmented_df_es = pd.DataFrame(all_examples)


In [191]:
augmented_df_es.sample(19)

Unnamed: 0,input,output,transformation
8283789,Raza Ali,"{""first_name"": ""Raza"", ""last_name"": ""Ali"", ""ge...",correct
11554589,carlos alvarez cabo,"{""first_name"": ""Carlos"", ""last_name"": ""Álvarez...",remove_accents_and_lower
17228826,AnDreA COrraLes Bujan,"{""first_name"": ""Andrea"", ""last_name"": ""Corrale...",random_uppercase
11846174,mohamad zahir,"{""first_name"": ""Mohamad"", ""last_name"": ""Zahir""...",remove_accents_and_lower
8752156,Manuel Palenzuela Gonzalez,"{""first_name"": ""Manuel"", ""last_name"": ""Palenzu...",correct
22835797,Teresa,"{""first_name"": ""Teresa"", ""last_name"": null, ""g...",first_name_only
15540819,Francisco Segura,"{""first_name"": ""Francisco"", ""last_name"": ""Segu...",insert_extra_spaces
4916549,Marisol Hornillos Lopez,"{""first_name"": ""Marisol"", ""last_name"": ""Hornil...",correct
17152947,Sara Soriano,"{""first_name"": ""Sara"", ""last_name"": ""Soriano"",...",correct
25248470,Ángel Álvarez,"{""first_name"": ""Ángel"", ""last_name"": ""Álvarez""...",correct


In [190]:
augmented_df_es.to_parquet("df_es_clean_augmented.parquet")

#### Convert to {first}|{last}|{gender} format

In [4]:
import pandas as pd
augmented_df_es = pd.read_parquet("df_es_clean_augmented.parquet")

In [5]:
def format_output(x):
    data = json.loads(x)
    # Replace None with an empty string
    first = data.get("first_name") or ""
    last = data.get("last_name") or ""
    gender = data.get("gender") or ""
    return f"{first}|{last}|{gender}"

df = augmented_df_es 

df['formatted_output'] = df['output'].apply(format_output)
df["output"] = df["formatted_output"]

df = df.drop(columns=["transformation", "formatted_output"])

In [7]:
df.shape

(26726287, 2)

In [8]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Split the dataframe (df has columns "input" and "output")
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Combine into a DatasetDict
datasets = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

In [9]:
datasets

DatasetDict({
    train: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 21648292
    })
    validation: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 2405366
    })
    test: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 2672629
    })
})

In [10]:
import os
from huggingface_hub import login
login(token = os.environ.get("HUGGINGFACE_AUTH_TOKEN"))

In [11]:
datasets.push_to_hub("juanluisrto/names-es")

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7217 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/7217 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/7217 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2406 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2673 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/juanluisrto/names-es/commit/d6d3a28dc1fd7ccc88d4072d4ef17dae97a823a4', commit_message='Upload dataset', commit_description='', oid='d6d3a28dc1fd7ccc88d4072d4ef17dae97a823a4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/juanluisrto/names-es', endpoint='https://huggingface.co', repo_type='dataset', repo_id='juanluisrto/names-es'), pr_revision=None, pr_num=None)