# Process Names Spreadsheet

This notebook walks you through the process of parsing a CSV file containing names and appending demographic facts to each record.

This notebook runs on Python 3 (3.9 or later, because it merges dictionaries with the `|` operator) and has no external dependencies.

Before you get started, you will need to prepare to make API calls to the HumanGraphics API. If you need help, this [article](https://www.humangraphics.io/help-center-articles/making-your-first-api-call) walks you through the preparation process.

In [86]:
# Set up some context

# First, read our home directory from the environment.
# NOTE: getenv returns None when HOME is unset, in which case the paths built
# from it below will not exist.
from os import getenv
HOME = getenv("HOME")

# Next, load our .humangraphics file.
#
# It should be stored at: $HOME/.humangraphics
#
# It should look like this (unindented), where $PLAN is the plan you're subscribed to:
#
#    [api]
#    key = XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#    base_url = https://api.humangraphics.io/$PLAN
#
from configparser import ConfigParser
config = ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing config file fails
# with a clear message instead of an opaque KeyError on the lookups below.
if not config.read(f"{HOME}/.humangraphics"):
    raise FileNotFoundError(f"Could not read HumanGraphics config file {HOME}/.humangraphics")

HUMANGRAPHICS_API_KEY = config["api"]["key"]
HUMANGRAPHICS_API_BASE_URL = config["api"]["base_url"]

In [87]:
from urllib.request import Request, urlopen
from urllib.error import HTTPError
from json import dumps, loads
from time import sleep

# Here are some functions to help along the way
def parse_human_name(name):
    """Parse a human name by POSTing it to the names/parse endpoint.

    The name is sent as the JSON document ``{"text": name}`` to
    ``{HUMANGRAPHICS_API_BASE_URL}/v1/humans/names/parse``, authenticated with
    HUMANGRAPHICS_API_KEY in the ``x-blobr-key`` header.

    Responses with HTTP status 429 are retried indefinitely using an
    exponential backoff (1s, 2s, 4s, ... capped at 64s between attempts).

    Args:
        name: The text of the name to parse.

    Returns:
        The response body decoded from JSON.

    Raises:
        HTTPError: For any HTTP error status other than 429.
    """
    # The request body does not change between retries, so build it once.
    entity = dumps({"text": name}).encode("utf-8")

    attempt = 1
    while True:
        # Prepare our request
        rin = Request(f"{HUMANGRAPHICS_API_BASE_URL}/v1/humans/names/parse")
        rin.add_header("x-blobr-key", HUMANGRAPHICS_API_KEY)
        rin.add_header("content-type", "application/json; charset=utf-8")
        rin.add_header("content-length", str(len(entity)))
        rin.add_header("accept", "application/json")

        # Make our request. The context manager closes the response (and its
        # underlying connection) as soon as the body has been read.
        try:
            with urlopen(rin, entity) as rout:
                body = rout.read()
        except HTTPError as e:
            if e.code != 429:
                raise

            # The error object wraps the error response; release it before waiting.
            e.close()

            # Let's use an exponential backoff strategy
            sleep(min(64, 1 << (attempt - 1)))
            attempt += 1
            continue

        # Decoding UTF-8 is safe here because HumanGraphics always uses UTF-8 for JSON.
        return loads(body.decode("utf-8"))

def dict_multi_get(d, *args):
    """Look up a chain of keys in nested dicts, tolerating missing levels.

    Starting from ``d``, each key in ``args`` is looked up with ``.get()`` on
    the result of the previous lookup.

    Args:
        d: The outermost mapping, or None.
        *args: The keys to follow, outermost first.

    Returns:
        The value found at the end of the key chain, or None if ``d`` is None
        or any lookup along the way yields None. With no keys, ``d`` itself.
    """
    current = d
    for key in args:
        # A None at any level (including the starting point) ends the walk.
        if current is None:
            return None
        current = current.get(key)

    return current

In [88]:
# Set up some constants

# Name of the input CSV column that holds the name to parse
NAME_FIELD_NAME = "name"

# Names of the output CSV columns holding the parsed name
HUMAN_NAME_CONFIDENCE = "human_name_confidence"
GIVEN_NAME_FIELD_NAME = "given_name"
MIDDLE_NAME_FIELD_NAME = "middle_name"
SECOND_MIDDLE_NAME_FIELD_NAME = "second_middle_name"
NICK_NAME_FIELD_NAME = "nick_name"
FAMILY_NAME_FIELD_NAME = "family_name"
SECOND_FAMILY_NAME_FIELD_NAME = "second_family_name"

# Names of the output CSV columns holding the age estimate
AGE_BEST_ESTIMATE_VALUE_FIELD_NAME = "age_best_estimate_value"
AGE_BEST_ESTIMATE_LIKELIHOOD_FIELD_NAME = "age_best_estimate_likelihood"
AGE_UNDER_18_LIKELIHOOD_FIELD_NAME = "age_under_18_likelihood"
AGE_FROM_18_TO_24_LIKELIHOOD_FIELD_NAME = "age_from_18_to_24_likelihood"
AGE_FROM_25_TO_34_LIKELIHOOD_FIELD_NAME = "age_from_25_to_34_likelihood"
AGE_FROM_35_TO_44_LIKELIHOOD_FIELD_NAME = "age_from_35_to_44_likelihood"
AGE_FROM_45_TO_54_LIKELIHOOD_FIELD_NAME = "age_from_45_to_54_likelihood"
# NOTE: the lowercase "to" in this identifier is kept because other cells refer to it.
AGE_FROM_55_to_64_LIKELIHOOD_FIELD_NAME = "age_from_55_to_64_likelihood"
AGE_OVER_65_LIKELIHOOD_FIELD_NAME = "age_over_65_likelihood"

# Names of the output CSV columns holding the gender estimate
GENDER_BEST_ESTIMATE_VALUE_FIELD_NAME = "gender_best_estimate_value"
GENDER_BEST_ESTIMATE_LIKELIHOOD_FIELD_NAME = "gender_best_estimate_likelihood"
GENDER_MALE_LIKELIHOOD_FIELD_NAME = "gender_male_likelihood"
GENDER_FEMALE_LIKELIHOOD_FIELD_NAME = "gender_female_likelihood"

# Names of the output CSV columns holding the race estimate
RACE_BEST_ESTIMATE_VALUE_FIELD_NAME = "race_best_estimate_value"
# NOTE: the "ETIMATE" spelling of this identifier is kept because other cells refer to it.
RACE_BEST_ETIMATE_LIKELIHOOD_FIELD_NAME = "race_best_estimate_likelihood"
RACE_WHITE_LIKELIHOOD_FIELD_NAME = "race_white_likelihood"
RACE_BLACK_LIKELIHOOD_FIELD_NAME = "race_black_likelihood"
RACE_ASIAN_PACIFIC_ISLANDER_LIKELIHOOD_FIELD_NAME = "race_asian_pacific_islander_likelihood"
RACE_AMERICAN_INDIAN_LIKELIHOOD_FIELD_NAME = "race_american_indian_likelihood"
RACE_HISPANIC_LIKELIHOOD_FIELD_NAME = "race_hispanic_likelihood"
RACE_TWO_PLUS_RACES_LIKELIHOOD_FIELD_NAME = "race_two_plus_races_likelihood"

def get_age_best_estimate_likelihood(d):
    """Return the "ageEstimate" entry keyed by the value stored under "age".

    Returns None when ``d`` has no "age" value, or when "ageEstimate" has no
    entry for that value.
    """
    best_age = dict_multi_get(d, "age")
    if best_age is None:
        return None

    # The best-estimate value doubles as the key into the per-value estimates.
    return dict_multi_get(d, "ageEstimate", best_age)

def get_gender_best_estimate_likelihood(d):
    """Return the "genderEstimate" entry keyed by the value stored under "gender".

    Returns None when ``d`` has no "gender" value, or when "genderEstimate" has
    no entry for that value.
    """
    best_gender = dict_multi_get(d, "gender")
    if best_gender is None:
        return None

    # The best-estimate value doubles as the key into the per-value estimates.
    return dict_multi_get(d, "genderEstimate", best_gender)

def get_race_best_estimate_likelihood(d):
    """Return the "raceEstimate" entry keyed by the value stored under "race".

    Returns None when ``d`` has no "race" value, or when "raceEstimate" has no
    entry for that value.
    """
    best_race = dict_multi_get(d, "race")
    if best_race is None:
        return None

    # The best-estimate value doubles as the key into the per-value estimates.
    return dict_multi_get(d, "raceEstimate", best_race)
                
def _path_getter(*path):
    """Build a one-argument function that reads the nested key ``path`` from a dict."""
    return lambda d: dict_multi_get(d, *path)


# The columns appended to each output row, in output order. Each entry pairs
# an output column name with a function that takes the parsed-name result and
# returns that column's value (or None when it is absent).
APPENDED_FIELDS = [
    # Parsed name
    (HUMAN_NAME_CONFIDENCE, _path_getter("confidence")),
    (GIVEN_NAME_FIELD_NAME, _path_getter("match", "givenName", "matchedText")),
    (MIDDLE_NAME_FIELD_NAME, _path_getter("match", "middleName", "matchedText")),
    (SECOND_MIDDLE_NAME_FIELD_NAME, _path_getter("match", "secondMiddleName", "matchedText")),
    (NICK_NAME_FIELD_NAME, _path_getter("match", "nickName", "matchedText")),
    (FAMILY_NAME_FIELD_NAME, _path_getter("match", "familyName", "matchedText")),
    (SECOND_FAMILY_NAME_FIELD_NAME, _path_getter("match", "secondFamilyName", "matchedText")),

    # Age
    (AGE_BEST_ESTIMATE_VALUE_FIELD_NAME, _path_getter("age")),
    (AGE_BEST_ESTIMATE_LIKELIHOOD_FIELD_NAME, get_age_best_estimate_likelihood),
    (AGE_UNDER_18_LIKELIHOOD_FIELD_NAME, _path_getter("ageEstimate", "under18")),
    (AGE_FROM_18_TO_24_LIKELIHOOD_FIELD_NAME, _path_getter("ageEstimate", "from18To24")),
    (AGE_FROM_25_TO_34_LIKELIHOOD_FIELD_NAME, _path_getter("ageEstimate", "from25To34")),
    (AGE_FROM_35_TO_44_LIKELIHOOD_FIELD_NAME, _path_getter("ageEstimate", "from35To44")),
    (AGE_FROM_45_TO_54_LIKELIHOOD_FIELD_NAME, _path_getter("ageEstimate", "from45To54")),
    (AGE_FROM_55_to_64_LIKELIHOOD_FIELD_NAME, _path_getter("ageEstimate", "from55To64")),
    (AGE_OVER_65_LIKELIHOOD_FIELD_NAME, _path_getter("ageEstimate", "over65")),

    # Gender
    (GENDER_BEST_ESTIMATE_VALUE_FIELD_NAME, _path_getter("gender")),
    (GENDER_BEST_ESTIMATE_LIKELIHOOD_FIELD_NAME, get_gender_best_estimate_likelihood),
    (GENDER_MALE_LIKELIHOOD_FIELD_NAME, _path_getter("genderEstimate", "male")),
    (GENDER_FEMALE_LIKELIHOOD_FIELD_NAME, _path_getter("genderEstimate", "female")),

    # Race
    (RACE_BEST_ESTIMATE_VALUE_FIELD_NAME, _path_getter("race")),
    (RACE_BEST_ETIMATE_LIKELIHOOD_FIELD_NAME, get_race_best_estimate_likelihood),
    (RACE_WHITE_LIKELIHOOD_FIELD_NAME, _path_getter("raceEstimate", "white")),
    (RACE_BLACK_LIKELIHOOD_FIELD_NAME, _path_getter("raceEstimate", "black")),
    (RACE_ASIAN_PACIFIC_ISLANDER_LIKELIHOOD_FIELD_NAME, _path_getter("raceEstimate", "asianPacificIslander")),
    (RACE_AMERICAN_INDIAN_LIKELIHOOD_FIELD_NAME, _path_getter("raceEstimate", "americanIndian")),
    (RACE_HISPANIC_LIKELIHOOD_FIELD_NAME, _path_getter("raceEstimate", "hispanic")),
    (RACE_TWO_PLUS_RACES_LIKELIHOOD_FIELD_NAME, _path_getter("raceEstimate", "twoPlusRaces")),
]

In [89]:
from csv import DictReader, DictWriter

INPUT_FILENAME = f"{HOME}/records.csv"
OUTPUT_FILENAME = f"{HOME}/enriched-records.csv"

# The input file should contain a header row with field names.
# It must define a field called "name" which contains the name to parse.
# Any additional fields are ignored and passed along to the output.
#
# Every input row produces exactly one output row. Rows with a blank name, and
# rows whose name could not be parsed (the error is printed), are written with
# all of the appended fields left empty.
with open(INPUT_FILENAME, "r", newline="") as input_file:
    input_rows = DictReader(input_file)

    # fieldnames is None when the input file is completely empty; treat that
    # the same as a header row without the required field.
    input_fieldnames = list(input_rows.fieldnames or [])
    if NAME_FIELD_NAME not in input_fieldnames:
        raise ValueError(f"Input file {INPUT_FILENAME} does not contain a field named {NAME_FIELD_NAME}")

    appended_fieldnames = [field_name for field_name, _ in APPENDED_FIELDS]

    # Used as-is for every row we cannot enrich; built once rather than per row.
    blank_appended_fields = dict.fromkeys(appended_fieldnames, "")

    with open(OUTPUT_FILENAME, "w", newline="") as output_file:
        output_rows = DictWriter(output_file, fieldnames=input_fieldnames + appended_fieldnames)

        output_rows.writeheader()

        for input_row in input_rows:
            # DictReader fills the cells missing from a short row with None,
            # so guard against that before stripping.
            name = (input_row[NAME_FIELD_NAME] or "").strip()

            if name == "":
                output_rows.writerow(input_row | blank_appended_fields)
                continue

            # Best effort: a failure on one name must not abort the whole run.
            try:
                parsed_human_name = parse_human_name(name)
            except Exception as e:
                print(e)
                output_rows.writerow(input_row | blank_appended_fields)
                continue

            output_row = {**input_row}
            for field_name, extract_value in APPENDED_FIELDS:
                value = extract_value(parsed_human_name)
                # Only a missing value becomes an empty cell. Testing
                # truthiness here would also blank out legitimate values such
                # as a likelihood of 0.
                output_row[field_name] = "" if value is None else str(value)

            output_rows.writerow(output_row)