# Generation and Insertion of Direct Identifiers into PUMS Dataset

For each individual in the PUMS dataset, we will generate the following direct identifiers: 
1. Name, consisting of first and last name. 
2. Social Security Number (SSN)
3. Credit card number 
4. Telephone number
5. Full address
6. (Email tbd)


In [1]:
import pickle
from tqdm import tqdm
import pandas as pd
import random
import numpy as np
import requests

# Fix the seed of both random number generators so the generated identifiers
# are reproducible: the SSN, credit card and phone generators below draw from
# the standard-library `random` module, the name generator from `np.random`.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Registers `progress_apply` on pandas objects (an `apply` with a progress bar).
tqdm.pandas()

## (0) Set Parameters and load the source data

In [None]:
INPUT_DATASET = "../data/100_profiles.csv" # This is the dataset we read from
OUTPUT_DATASET = "../data/100_profiles.csv" # This is the dataset we write to (can be same as INPUT_DATASET if we want to overwrite)
# NOTE(review): this mapping is not referenced by any cell visible in this notebook,
# and the credit card column is written below as 'credit card number', not "CREDCARD" - confirm whether it is still needed.
IDENTIFIER_FUNCTIONS = {
    "CREDCARD": "generate_card", # Key = Feature name (i.e. dataset column header), Value = Function for generating feature
    "SSN": "generate_SSN"
}

# other resources we will need when generating the names
# The first name is sampled from the actual distribution of baby names, conditioned on both year of birth and gender. 
# Source: https://www.ssa.gov/oact/babynames/limits.html
# The last name is sampled from the actual distribution for last names more frequent than 1000 occurrences from the US Census 2010.
# This is not dependent on gender, nor on year of birth.
# Source: https://www.census.gov/topics/population/genealogy/data.html
# NOTE(review): both values below look like placeholders (each is just its own variable name);
# they are passed to pd.read_csv, so set them to the real CSV paths before running.
# The first-name CSV is read with columns "first_name", "gender" and one "freq_<year>" column per year;
# the last-name CSV is read with columns "last_name" and "last_name_frequency".
FIRST_NAMES_FILE = "FIRST_NAMES_FILE"
LAST_NAMES_FILE = "LAST_NAMES_FILE"

## resources needed for the address generation
# NOTE(review): this is an absolute path while the datasets above use "../data/..." - confirm which is intended.
PUMA_TO_ZCTA_FILE = "/data/puma_to_zcta.pickle"
# Intentionally empty here; supply the key locally and never commit a real key.
GOOGLE_MAPS_API_KEY = ""

In [4]:
# Load the source profiles. na_values=" ?" makes the literal string " ?" parse as NaN
# (in addition to pandas' default missing-value markers).
df = pd.read_csv(INPUT_DATASET, na_values=" ?")
# Sanity check: number of rows loaded, then a preview of the first rows.
print(len(df))
df.head()

100


Unnamed: 0,OCCP,RAC2P,ST,CIT,ESR,SCHL,MAR,SEX,PUMA,PUMA_FULL,DOB,DOB-Day,DOB-Month,DOB-Year,PUMA_og,identifiers
0,1,1,2,1,3,7,2,2,2504,84,31143,5,August,2015,202504,"RAC2P,SEX,DOB,OCCP,MAR"
1,1,1,1,1,3,6,2,2,7900,394,31382,31,March,2016,107900,"OCCP,ESR,ST,CIT,SEX"
2,7,5,1,3,1,22,1,2,5420,1414,13542,28,May,1967,105420,"SCHL,SEX,ST,ESR,MAR"
3,1,1,1,1,3,6,2,2,2900,1158,31880,11,August,2017,102900,"CIT,ESR,RAC2P,DOB,ST"
4,1,1,3,1,2,17,2,2,3504,1812,27470,15,July,2005,303504,"SCHL,RAC2P,OCCP,ESR,CIT"


## (1) Generate full names

In [5]:
# Load the name tables once; get_full_name() reads these module-level objects on every call.
FIRST_NAME_DF = pd.read_csv(FIRST_NAMES_FILE)
LAST_NAME_DF = pd.read_csv(LAST_NAMES_FILE)

# Convert the columns that are used on every call to NumPy arrays up front.
_FIRST_NAMES = FIRST_NAME_DF["first_name"].to_numpy()
_FIRST_GENDERS = FIRST_NAME_DF["gender"].to_numpy()

_LAST_NAMES = LAST_NAME_DF["last_name"].to_numpy()
# Used directly as the probability vector for sampling last names
# (presumably already normalised to sum to 1 - TODO confirm against the file).
_LAST_P = LAST_NAME_DF["last_name_frequency"].to_numpy(dtype=float)

def get_full_name(gender, yob, min_year=1880, max_year=2024):
    '''
    Generate a random full name ("First Last") for a given gender and year of birth.

    Input:
        gender: 'M' or 'F', compared against the "gender" column of the first-name table
        yob: year of birth (anything int() accepts, e.g. 1995 or 1995.0)
        min_year, max_year: yob is clamped into [min_year, max_year] before the
            matching "freq_<year>" column is looked up

    Returns:
        The first and last name joined by a single space.

    Raises:
        ValueError: if yob cannot be converted to an integer, or if no first name
            has a positive frequency for this gender in the selected year.
        KeyError: if the first-name table has no "freq_<year>" column for the year.

    The first name is sampled from the actual distribution of baby names, conditioned on both year of birth and gender.
    Source: https://www.ssa.gov/oact/babynames/limits.html

    The last name is sampled from the actual distribution for last names more frequent than 1000 occurrences from the US Census 2010.
    This is not dependent on gender, nor on year of birth.
    Source: https://www.census.gov/topics/population/genealogy/data.html
    '''

    # Cast first so a float year (e.g. 1995.0) still maps to the "freq_1995" column,
    # then clamp to the supported range.
    yob = min(max(int(yob), min_year), max_year)

    # sample first name
    freq = FIRST_NAME_DF[f"freq_{yob}"].to_numpy(dtype=float)
    mask = (_FIRST_GENDERS == gender) & (freq > 0)
    if not mask.any():
        raise ValueError(f"No first names available for gender {gender!r} and year {yob}.")
    # np.random.choice requires probabilities that sum to 1; renormalise over the
    # selected subset so this works whether or not the column is already normalised per gender.
    p = freq[mask]
    first_name = np.random.choice(_FIRST_NAMES[mask], p=p / p.sum())

    # sample last name (renormalised for the same reason)
    last_name = np.random.choice(_LAST_NAMES, p=_LAST_P / _LAST_P.sum())

    return first_name + ' ' + last_name

# let's test this first
# Note: get_full_name takes a year of birth, not an age; the ages in the labels
# below correspond to these birth years as of 2025.
print("For a male of age 30: ", get_full_name('M', 1995))
print("For a female of age 45: ", get_full_name('F', 1980))

For a male of age 30:  Joshua Tyne
For a female of age 45:  Lauren Matson


In [6]:
# Takes approx 5 mins to run

def row_to_full_name(row):
    """
    Best-effort full-name generation for one dataset row.

    Input:
        row: mapping / pandas Series with the keys 'SEX' (1 = male, 2 = female)
             and 'DOB-Year' (year of birth)

    Returns:
        The generated "First Last" name, or None when the row's sex or year of
        birth is missing or unusable, or when no name can be sampled for it.
    """
    # Parse the inputs inside the guard as well: a NaN/None SEX or DOB-Year makes
    # int() raise, and that should yield None like every other per-row failure
    # instead of aborting the whole apply.
    try:
        gender = {1: 'M', 2: 'F'}.get(int(row['SEX']))
        yob = int(row['DOB-Year'])
    except (TypeError, ValueError):
        return None

    # Unknown sex code: there is no name distribution to sample from.
    if gender is None:
        return None

    try:
        return get_full_name(gender, yob)
    except (KeyError, ValueError):
        # KeyError: no frequency column for this year.
        # ValueError: nothing to sample from / invalid probabilities.
        return None

# Row-wise apply with a progress bar; rows for which row_to_full_name fails get None.
df['name'] = df.progress_apply(row_to_full_name, axis=1)

100%|██████████| 100/100 [00:00<00:00, 254.55it/s]


In [7]:
# Preview the first generated names.
df["name"].head()

0         Lilly Morales
1        Pearl Striplin
2         Pamela Manion
3    Elizabeth Buhrmann
4          Cadence Cole
Name: name, dtype: object

## (2) SSN

In [8]:
def checkSSNvalid(SSN):
    """
    Check that an SSN does not consist of one single repeated digit.

    Only the digit characters are compared, so separators are ignored and
    "111-11-1111" is rejected just like "111111111".

    Input:
        SSN: the SSN as a string, with or without dashes

    Returns:
        False if all digits are identical or the string contains no digit at all,
        True otherwise.
    """
    # Compare digits only: including the dashes would make every formatted SSN
    # look "not all the same" and the check would never reject anything.
    digits = [c for c in SSN if c.isdigit()]
    if not digits:
        return False
    return len(set(digits)) > 1

def generate_SSN():
    """
    Return a random SSN-formatted string "AAA-GG-SSSS".

    SSNs are comprised of 3 parts:
        Area Number   (AAA):  001-899, never 666 (000 and 900-999 are outside the sampled range)
        Group Number  (GG):   01-99, never 00
        Serial Number (SSSS): 0001-9999, never 0000

    Candidates that checkSSNvalid rejects are discarded and a new one is drawn.
    """
    while True:
        # Area number: redraw for as long as we hit the excluded value 666.
        area = random.randint(1, 899)
        while area == 666:
            area = random.randint(1, 899)

        group = random.randint(1, 99)
        serial = random.randint(1, 9999)

        # Zero-pad each part to its fixed width.
        candidate = f"{area:03d}-{group:02d}-{serial:04d}"
        if checkSSNvalid(candidate):
            return candidate

# test this out
print("Generated SSN: ", generate_SSN())

Generated SSN:  655-15-0410


In [9]:
# Draw an independent SSN for every row; the row contents are not used.
df["SSN"] = df.progress_apply(lambda _row: generate_SSN(), axis=1)

100%|██████████| 100/100 [00:00<00:00, 40626.73it/s]


## (3) Credit card number

In [10]:
def luhn_checksum(card_number: str) -> int:
    """Return the Luhn sum of ``card_number`` modulo 10 (0 means the number passes the Luhn check)."""
    total = 0
    # Walk the digits from right to left; every second one is doubled and the
    # decimal digits of the doubled value are added instead of the digit itself.
    for position, char in enumerate(reversed(str(card_number))):
        digit = int(char)
        if position % 2:
            doubled = digit * 2
            total += doubled // 10 + doubled % 10
        else:
            total += digit
    return total % 10

def generate_card_number(prefix: str, length: int) -> str:
    """
    Build a card number that starts with ``prefix`` and passes the Luhn check.

    Random digits are appended to ``prefix`` until the number is ``length - 1``
    digits long; the final digit is the one that makes the Luhn checksum 0.
    """
    filler_count = max(0, length - 1 - len(prefix))
    partial = prefix + "".join(
        str(random.randint(0, 9)) for _ in range(filler_count)
    )

    # Try the ten possible last digits and keep the one that validates.
    check_digit = next(
        str(d) for d in range(10) if luhn_checksum(partial + str(d)) == 0
    )
    return partial + check_digit


def generate_card(issuer=None) -> str:
    """
    Generate a dummy, Luhn-valid card number for a card issuer.

    Input:
        issuer: one of "visa", "mastercard", "amex", "discover", "diners", "jcb"
                (case-insensitive). When None (the default) an issuer is picked
                uniformly at random.

    Returns:
        The card number as a string of digits; prefix and length (14, 15 or 16)
        depend on the issuer.

    Raises:
        ValueError: if ``issuer`` is not one of the supported issuers.
    """
    if issuer is None:
        issuer = random.choice(
            ["visa", "mastercard", "amex", "discover", "diners", "jcb"]
        )

    # (prefix, total length) per issuer. The random prefixes are drawn for every
    # issuer on each call, so a call consumes the same number of random draws
    # here regardless of which issuer is used.
    issuers = {
        "visa": ("4", 16),
        "mastercard": (str(random.choice(range(51, 56))), 16),
        "amex": (str(random.choice(["34", "37"])), 15),
        "discover": ("6011", 16),
        "diners": (
            str(
                random.choice(
                    ["300", "301", "302", "303", "304", "305", "36", "38"]
                )
            ),
            14,
        ),
        "jcb": ("35", 16),
    }

    key = issuer.lower()
    if key not in issuers:
        raise ValueError(
            "Unknown issuer. Choose from: " + ", ".join(issuers.keys())
        )

    prefix, length = issuers[key]
    return generate_card_number(prefix, length)

# test this out
print("Generated Credit Card: ", generate_card())

Generated Credit Card:  4326773602606478


In [11]:
# One random card number per row; the row itself is ignored.
df["credit card number"] = df.progress_apply(lambda _row: generate_card(), axis=1)

100%|██████████| 100/100 [00:00<00:00, 6928.38it/s]


## (4) Phone number

In [None]:
# A full US phone number has 10 digits, usually written as: (NPA) NXX-XXXX. Where:
# NPA = Area code (3 digits)
# NXX = Central office / exchange code (3 digits)
# XXXX = Line number (4 digits)
# Example: (415) 555-1234

# Let's create a function that generates a random yet realistic US phone number, conditioned on the US state.

# Area codes are tied to a location within a US state, but they indicate where a number was first issued, not where someone lives now.
# For something realistic, we curate a set of "likely" area codes per state (not exhaustive).
# based on: https://en.wikipedia.org/wiki/List_of_North_American_Numbering_Plan_area_codes

# Mappings consistent with encodings we use in the dataset
# Keys are integer state codes (the phone-number cell below passes each row's 'ST' value as the key),
# NOT 2-letter postal abbreviations; each value is the list of area codes to sample from for that state.
STATE_TO_AREA_CODES = {
    23: [205, 251, 256, 334, 659, 938],  # Alabama
    47: [907],  # Alaska
    16: [480, 520, 602, 623, 928],  # Arizona
    32: [327, 479, 501, 870],  # Arkansas
    1: [
        209, 213, 279, 310, 323, 341, 408, 415, 424, 442,
        510, 530, 559, 562, 619, 626, 628, 650, 657, 661,
        669, 707, 714, 747, 760, 805, 818, 820, 831, 858,
        909, 916, 925, 949, 951
    ],  # California
    22: [303, 719, 720, 970],  # Colorado
    29: [203, 475, 860, 959],  # Connecticut
    45: [302],  # Delaware
    50: [202],  # District of Columbia
    4: [
        239, 305, 321, 352, 386, 407, 561, 727, 754, 772,
        786, 813, 850, 863, 904, 941, 954
    ],  # Florida
    10: [229, 404, 470, 478, 678, 706, 762, 770, 912],  # Georgia
    40: [808],  # Hawaii
    39: [208],  # Idaho
    5: [217, 224, 309, 312, 331, 618, 630, 708, 773, 779, 815, 847, 872],  # Illinois
    15: [219, 260, 317, 574, 765, 812, 930],  # Indiana
    30: [319, 515, 563, 641, 712],  # Iowa
    34: [316, 620, 785, 913],  # Kansas
    26: [270, 364, 502, 606, 859],  # Kentucky
    25: [225, 318, 337, 504, 985],  # Louisiana
    42: [207],  # Maine
    20: [240, 301, 410, 443, 667],  # Maryland
    14: [339, 351, 413, 508, 617, 774, 781, 857, 978],  # Massachusetts
    8: [231, 248, 269, 313, 517, 586, 616, 734, 810, 906, 947, 989],  # Michigan
    21: [218, 320, 507, 612, 651, 763, 952],  # Minnesota
    31: [228, 601, 662, 769],  # Mississippi
    18: [314, 417, 573, 636, 660, 816],  # Missouri
    44: [406],  # Montana
    38: [308, 402, 531],  # Nebraska
    35: [702, 725, 775],  # Nevada
    41: [603],  # New Hampshire
    11: [201, 551, 609, 640, 732, 848, 856, 862, 908, 973],  # New Jersey
    36: [505, 575],  # New Mexico
    3: [
        212, 315, 332, 347, 516, 518, 585, 607, 631,
        646, 680, 716, 718, 838, 845, 914, 917, 929, 934
    ],  # New York
    9: [252, 336, 704, 743, 828, 910, 919, 980, 984],  # North Carolina
    48: [701],  # North Dakota
    7: [216, 234, 220, 330, 380, 419, 440, 513, 567, 614, 740, 937],  # Ohio
    28: [405, 539, 580, 918],  # Oklahoma
    27: [458, 503, 541, 971],  # Oregon
    6: [215, 223, 267, 272, 412, 570, 610, 717, 724, 814, 878],  # Pennsylvania
    43: [401],  # Rhode Island
    24: [803, 843, 864],  # South Carolina
    46: [605],  # South Dakota
    17: [423, 615, 629, 731, 865, 901, 931],  # Tennessee
    2: [
        210, 214, 254, 281, 325, 346, 361, 409, 430, 432,
        469, 512, 713, 737, 806, 817, 832, 903, 915, 936,
        940, 945, 956, 972, 979
    ],  # Texas
    33: [385, 435, 801],  # Utah
    49: [802],  # Vermont
    12: [276, 434, 540, 571, 703, 757, 804],  # Virginia
    13: [206, 253, 360, 425, 509, 564],  # Washington
    37: [304, 681],  # West Virginia
    19: [262, 414, 608, 534, 715, 920],  # Wisconsin
    51: [307],  # Wyoming
}

# For the 3-digit exchange code, we can generate random numbers 
# but we need to exclude the reserved 3-digit service codes _N11 
_N11 = {211, 311, 411, 511, 611, 711, 811, 911}

def generate_phone_number(state: int) -> str:
    """
    Generate a NANP-style, local-looking US phone number for a given state.

    Input:
        state: integer state code, i.e. one of the keys of STATE_TO_AREA_CODES
               (not a 2-letter abbreviation); e.g. 1 is California there.

    Returns:
        The number formatted as "(NPA) NXX-XXXX", e.g. "(415) 732-1049".

    Raises:
        ValueError: if ``state`` has no entry in STATE_TO_AREA_CODES.
    """

    if state not in STATE_TO_AREA_CODES:
        raise ValueError(
            f"Unknown/unsupported state code: {state!r}. "
            "Use one of the integer state codes in STATE_TO_AREA_CODES (e.g. 1 for California)."
        )

    # (1) get an area code for the state
    npa = random.choice(STATE_TO_AREA_CODES[state])

    # (2) Get the exchange code: NXX where N=2..9; redraw while we hit a reserved N11 code
    while True:
        nxx = random.randint(200, 999)
        if nxx not in _N11:
            break

    # (3) Line number: 0000-9999
    xxxx = random.randint(0, 9999)

    return f"({npa:03d}) {nxx:03d}-{xxxx:04d}"

# test this out (state code 6 is Pennsylvania in STATE_TO_AREA_CODES)
print("Generated Phone Number (PA): ", generate_phone_number(6))

Generated Phone Number (CA):  (570) 678-9038


In [13]:
# Generate a phone number for every row from its state code (column 'ST').
df["phone number"] = df.progress_apply(
    lambda row: generate_phone_number(row["ST"]), axis=1
)

100%|██████████| 100/100 [00:00<00:00, 35919.36it/s]


## (5) Save it all

In [None]:
# Write the dataset including the generated identifier columns.
# index=False: do not write the DataFrame index as an extra column.
df.to_csv(OUTPUT_DATASET, index=False)