In [23]:
import pandas as pd
import os
import hashlib

In [24]:
# Set working directory and paths
notebook_dir = os.getcwd()
input_path = os.path.join(notebook_dir, '..', 'data', 'checkpoints', 'checkpoint4_geocode_complete.csv')
output_path = os.path.join(notebook_dir, '..', 'data', 'checkpoints', 'checkpoint5_hashed.csv')
mapping_path = os.path.join(notebook_dir, '..', 'data', 'intermediate', 'name_mapping.csv')

In [25]:
checkpoint4 = pd.read_csv(input_path)

In [26]:
# Function to generate a hashed person_id from Name and DOB
def generate_id(name, dob):
    if pd.isna(name) or pd.isna(dob):
        return ''
    combined = f"{str(name).strip()}_{str(dob).strip()}"
    return hashlib.sha256(combined.encode()).hexdigest()

In [27]:
# Create person_id column only if 'Charges' is present
checkpoint4['person_id'] = checkpoint4.apply(
    lambda row: generate_id(row['Name'], row['DOB']) if pd.notna(row['Charges']) and str(row['Charges']).strip() != '' else '',
    axis=1
)

In [28]:
for _, row in checkpoint4.iterrows():
    person_id = row['person_id']
    if person_id != '':
        print(row['Incident #'], person_id)

18000001.0 20f72481f083d4756c89b98fd499514c5953a8a4c1025338d5742d4ad0a906bc
18000002.0 2d5adb553cad8a22fd72d2e390525997c4963bac1b185e10ca0f51af30df2fcb
18000005.0 41c43f9f4255dee8e2c910e87f2a983e1b13beecf27eacfaa7b448083abd8718
18000018.0 6d5fb726029a990cafbca655fc265c2aa034d99d5ef62eb6141420143069d52d
18000020.0 bf5796a28c5e9c6a388dbc91b93e82176533fab08017c062e1c86d9f184ee930
18000040.0 5a2653f1548f32c7849cc8cebe964e504b250b957f4017ae70d5a7dcf0efd40e
18000040.0 6119c9403628dbe93d8fbb1c28cf7031a1730b621a83eefb33adc2bc73b488e5
18000040.0 84b40950b653a7496a2f68d5f663618d750bf565dca9510152928824cdd0dca6
18000040.0 086e6699a90a3bace64bab112097f96013644089fb588dcd33da277c883c06e4
18000056.0 b288e84e98d84834a48b9a0f9c3aa9bc9f7200bd3127a54787a52652f67aa0f9
18000070.0 9200eed2ac12c65c74e36deff20063dab0948a0cc1db886653329bfd880b3e5b
18000070.0 122e508f15a10151102a169567cb0c0e7f09e3272d0fbc63ae78d2e4e298f0ce
18000094.0 00444f72b8f7a64553d4069077ea6f5469ed4d49c16bc874090b571404f59d1e
18000158.0 6

KeyboardInterrupt: 

In [29]:
checkpoint4 = checkpoint4.drop(columns=['Name'])

In [30]:
# Save the mapping of person_id -> Name, DOB
name_mapping = checkpoint4[checkpoint4['person_id'] != ''][['person_id', 'Name', 'DOB']].drop_duplicates()
name_mapping.to_csv(mapping_path, index=False)

# Save the checkpoint with person_id
checkpoint4.to_csv(output_path, index=False)

KeyError: "['Name'] not in index"

### IF ever need to recover the names

In [None]:
# import pandas as pd
# import os

# # Set paths
# notebook_dir = os.getcwd()
# checkpoint_path = os.path.join(notebook_dir, '..', 'data', 'checkpoints', 'checkpoint5_hashed.csv')
# mapping_path = os.path.join(notebook_dir, '..', 'data', 'checkpoints', 'name_mapping.csv')

# # Load the checkpoint and name mapping files
# checkpoint_df = pd.read_csv(checkpoint_path)
# name_mapping = pd.read_csv(mapping_path)

# # Merge on person_id to recover names (automatically handles missing matches by returning NaNs)
# recovered_df = checkpoint_df.merge(name_mapping, on='person_id', how='left')

# # Output preview
# print("\nSample of recovered names:")
# print(recovered_df[['person_id', 'Name', 'DOB']].head())

# # Save recovered data
# output_path = os.path.join(notebook_dir, '..', 'data', 'checkpoints', 'checkpoint_with_names.csv')
# recovered_df.to_csv(output_path, index=False)
