In [42]:
import pandas as pd
import os
import hashlib

In [43]:
# Base directory for every relative path below: the kernel's current working directory.
# NOTE(review): assumes the kernel was started in the notebook's own folder — confirm.
notebook_dir = os.getcwd()

In [44]:
# Data root used by this notebook: <notebook_dir>/../data/missing_dates_csv
md_file_path = os.path.join(notebook_dir, '..', 'data', 'missing_dates_csv')

In [45]:
# File locations under md_file_path (no working directory is changed here):
#   input_path   - checkpoint 4 CSV read by this notebook
#   output_path  - checkpoint 5 CSV written by this notebook
#   mapping_path - person_id -> Name/DOB lookup written by this notebook
checkpoints_dir = os.path.join(md_file_path, 'md_checkpoints')
input_path = os.path.join(checkpoints_dir, 'md_checkpoint4_geocode_complete.csv')
output_path = os.path.join(checkpoints_dir, 'md_checkpoint5_hashed.csv')
mapping_path = os.path.join(md_file_path, 'md_intermediate', 'md_name_mapping.csv')

In [46]:
# Load the checkpoint-4 CSV into a DataFrame using pandas' default type inference.
# NOTE(review): the preview output below prints 'Incident #' as e.g. 24001059.0, which
# suggests the column is being parsed as float — confirm whether that is intended.
checkpoint4 = pd.read_csv(input_path)

In [47]:
# Function to generate a hashed person_id from Name and DOB
def generate_id(name, dob) -> str:
    """Return a SHA-256 hex digest derived from a person's name and date of birth.

    Both values are converted to ``str`` and stripped of surrounding whitespace,
    then joined as ``"<name>_<dob>"`` and hashed (UTF-8 encoded).

    Args:
        name: The person's name; any scalar accepted by ``pd.isna``.
        dob: The person's date of birth; any scalar accepted by ``pd.isna``.

    Returns:
        A 64-character hexadecimal digest, or ``''`` when either value is
        missing (NaN/None) or blank after stripping.
    """
    if pd.isna(name) or pd.isna(dob):
        return ''

    name_text = str(name).strip()
    dob_text = str(dob).strip()

    # A whitespace-only value carries no identity: hashing it would give every
    # such row with the same counterpart value the same id, so treat it as missing.
    if not name_text or not dob_text:
        return ''

    combined = f"{name_text}_{dob_text}"
    return hashlib.sha256(combined.encode()).hexdigest()

In [48]:
# Assign a person_id only to rows whose 'Charges' value is present and non-blank;
# every other row gets an empty string.
def _person_id_for_row(row):
    """Return the hashed id for one row, or '' when 'Charges' is missing or blank."""
    charges = row['Charges']
    if pd.isna(charges) or str(charges).strip() == '':
        return ''
    return generate_id(row['Name'], row['DOB'])


checkpoint4['person_id'] = checkpoint4.apply(_person_id_for_row, axis=1)

In [49]:
# Print the incident number and hash for every row that received a person_id.
rows_with_id = checkpoint4[checkpoint4['person_id'] != '']
for _, record in rows_with_id.iterrows():
    print(record['Incident #'], record['person_id'])

24001059.0 bbd46b409b22db9ca72a580d2f3ff677366ea510d45a31e7591b5dc758599fb3
24001409.0 f7bc73de01bd2b04c6b222b0f83c4c0e1065b93fc2fad2935f71e7fc1e857e0d
24001428.0 96212987fdbaba9d1985877eb62871478133266d3735694b96f45ea01b13e426
23003248.0 45c9a44c5bf155016256df968a877ac433421740ea92ad0251c0280bc93dedd1
23003258.0 8b2d46a516c2b543b1d97f31b0960166d496a78789358bf4bf83ef27902f1ed0
23003259.0 11016a390ec61d84162e78623a665a1a5d11639057b6d3b0e2e7d236b1558829
22008711.0 2fcecd534242b48490a6c3b63ef724b05a8807dea3e7ef1e62d648abfb15185e
22008711.0 1efe257bf13d7a7eb892452b304c7623a9149cb6cf606a19da18fd0b524bfbda
22008716.0 eb64d9c81bc5961d354e949001e63d377b7b3121568b45bf30f97a0604e4a0d6
23012697.0 f721063d65d798656d0b37488ffc157ba44cd6a2aa36f415562805d6426392c3
23012703.0 695945b5a7182bf45f7af4803ba1697500634e40a79b944192417cb36e02498a
23013044.0 05c4a0dfc18ea61ff59af82d448f8451f211779ba3a513ff2c7b123407f150cd
23013044.0 af6946277ee8f4f752bb50740db00191e80e5cc123e02c1a0c7d519b19518bcb
23013063.0 6

In [50]:
# Save the mapping of person_id -> Name, DOB so names can be re-attached later.
# NOTE: this file holds the identifying values removed from the checkpoint.
has_person_id = checkpoint4['person_id'] != ''
name_mapping = (
    checkpoint4.loc[has_person_id, ['person_id', 'Name', 'DOB']]
    # generate_id() strips whitespace before hashing, so values that differ only in
    # surrounding whitespace share one person_id. Keep a single row per id so that a
    # later merge on person_id cannot duplicate checkpoint rows.
    .drop_duplicates(subset='person_id')
)
# to_csv does not create missing parent directories, so make sure the target exists.
os.makedirs(os.path.dirname(mapping_path), exist_ok=True)
name_mapping.to_csv(mapping_path, index=False)

In [51]:
# Remove the plain-text name now that person_id stands in for it.
# errors='ignore' keeps this cell re-runnable: without it a second execution raises
# KeyError because 'Name' has already been dropped from checkpoint4.
# NOTE(review): 'DOB' stays in the frame next to an unsalted hash of Name+DOB —
# confirm that this level of de-identification is sufficient for the saved checkpoint.
checkpoint4 = checkpoint4.drop(columns=['Name'], errors='ignore')

In [52]:
# Write the checkpoint (person_id added, 'Name' dropped above) to output_path,
# omitting the DataFrame index from the CSV.
checkpoint4.to_csv(output_path, index=False)

### If you ever need to recover the names

In [None]:
# import pandas as pd
# import os

# # Set paths
# notebook_dir = os.getcwd()
# checkpoint_path = os.path.join(notebook_dir, '..', 'data', 'missing_dates_csv', 'md_checkpoints', 'md_checkpoint5_hashed.csv')
# mapping_path = os.path.join(notebook_dir, '..', 'data', 'missing_dates_csv', 'md_intermediate', 'md_name_mapping.csv')

# # Load the checkpoint and name mapping files
# checkpoint_df = pd.read_csv(checkpoint_path)
# name_mapping = pd.read_csv(mapping_path)

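# # Merge on person_id to recover names (rows without a match get NaN).
# # Only 'Name' is taken from the mapping: the checkpoint already contains 'DOB', and merging
# # both 'DOB' columns would produce 'DOB_x' / 'DOB_y' instead of a single 'DOB' column.
# recovered_df = checkpoint_df.merge(name_mapping[['person_id', 'Name']], on='person_id', how='left')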

# # Output preview
# print("\nSample of recovered names:")
# print(recovered_df[['person_id', 'Name', 'DOB']].head())

# # Save recovered data
# output_path = os.path.join(notebook_dir, '..', 'data', 'checkpoints', 'checkpoint_with_names.csv')
# recovered_df.to_csv(output_path, index=False)
