<a href="https://colab.research.google.com/github/jhajagos/PHR2OHDSI/blob/main/Hash_XML_Filenames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook hashes the file names of CDAs downloaded from a patient portal. The downloaded CDA file names often contain identifiable information. To limit exposure, the files are renamed and a map from original to renamed file names is maintained.

The script can be re-run: new files can be added, and existing files can be replaced with updated versions.


In [1]:
import os
import glob
import pathlib
import csv
import hashlib
import shutil
import pandas as pd

In [2]:
# Mount Google Drive in the Colab runtime; the CDA folder configured below lives
# under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Folder that holds the downloaded CDA XML files. Glob patterns are appended
# directly to this string later on, so it must end with a path separator.
CDA_FILE_PATH = "/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/"
# Salt that is hashed together with each original file path to produce the
# renamed file name; changing it changes every generated name.
# NOTE(review): the salt is hard-coded in the notebook — confirm that is acceptable.
salt_for_renaming = "C-CDA documents are an unsung hero"

In [5]:
# Original files are moved into an "attic" sub-folder once they have been renamed;
# the CSV that maps original names to renamed names is kept there as well.
attic_path = pathlib.Path(CDA_FILE_PATH) / "attic"
file_name_map_name = attic_path / "file_name_map.csv"

if not attic_path.exists():
  # First run: create the attic folder. (`os.makedir` is not a function in the
  # `os` module, so the folder is created through pathlib instead.)
  attic_path.mkdir(parents=True)
  file_name_map_df = None  # no map exists yet

In [6]:
# Collect every XML document in the CDA folder, for both spellings of the
# extension. The pattern is appended straight onto CDA_FILE_PATH, so that path
# has to end with a separator.
xml_files = [
  *glob.glob(f"{CDA_FILE_PATH}*.xml"),
  *glob.glob(f"{CDA_FILE_PATH}*.XML"),
]

# Map each file path to the SHA-256 hex digest of the file's bytes.
xml_files_hash = {
  xml_path: hashlib.sha256(pathlib.Path(xml_path).read_bytes()).hexdigest()
  for xml_path in xml_files
}


In [7]:
# Files that were already renamed by this notebook carry the "mn_" prefix.
hashed_xml_files = glob.glob(f"{CDA_FILE_PATH}mn_*.xml")

# Content signature (SHA-256 hex digest) of every already-renamed file, keyed by path.
hashed_xml_files_hash = {
  hashed_path: hashlib.sha256(pathlib.Path(hashed_path).read_bytes()).hexdigest()
  for hashed_path in hashed_xml_files
}



In [8]:
updated_files = []  # files whose content differs from their existing renamed copy
file_names_map_dict = {}  # file path -> renamed file name ("mn_<digest>.xml")

# The glob results are full paths, while the renamed name built below is a bare
# file name, so index the existing digests by base name for the comparison.
hashed_digest_by_name = {
  os.path.basename(hashed_path): digest
  for hashed_path, digest in hashed_xml_files_hash.items()
}

for file_name in xml_files:
  # The new name is a salted BLAKE2b digest of the file's path, not of its content,
  # so the same path always maps to the same renamed file.
  renamed_file_name = "mn_" + hashlib.blake2b((file_name + salt_for_renaming).encode("utf8"), digest_size=16).hexdigest() + ".xml"
  file_names_map_dict[file_name] = renamed_file_name
  # Check if file has changed: a renamed copy already exists but its content
  # digest no longer matches the file found in the folder.
  if renamed_file_name in hashed_digest_by_name:
    h_renamed = hashed_digest_by_name[renamed_file_name]
    h_file = xml_files_hash[file_name]
    if h_renamed != h_file:
      updated_files.append(file_name)
      print(f"Files '{file_name}' has been updated")



In [9]:
# Already-renamed files, minus anything flagged as updated.
# NOTE(review): updated_files appears to hold original (un-hashed) paths, so this
# subtraction may never remove anything — confirm the intent.
first_pass_files = set(hashed_xml_files).difference(updated_files)
# Every discovered XML file that is not in that set still has to be renamed.
files_to_rename = set(xml_files).difference(first_pass_files)
# Rows of the original -> renamed file name map are accumulated in this list.
file_names_map = list()


In [10]:
# Rename each pending file: write a copy under its hashed name in the CDA folder,
# then move the original into the attic so only hashed names remain.
for file_name in files_to_rename:
  renamed_file_name = file_names_map_dict[file_name]

  file_names_map.append({"original_file_name": file_name, "renamed_file_name": renamed_file_name})
  shutil.copyfile(file_name, pathlib.Path(CDA_FILE_PATH) / renamed_file_name)
  shutil.move(file_name, attic_path / os.path.split(file_name)[1])

# Fix the column names explicitly so that a run with nothing to rename still
# produces a well-formed (header-only) frame instead of a column-less one.
file_name_map_df = pd.DataFrame(file_names_map, columns=["original_file_name", "renamed_file_name"])

if os.path.exists(file_name_map_name):
  # Append the new rows beneath the existing map (row-wise, not column-wise) and
  # drop rows that were already recorded by an earlier run.
  existing_file_name_map_df = pd.read_csv(file_name_map_name)
  file_name_map_df = pd.concat(
    [existing_file_name_map_df, file_name_map_df], ignore_index=True
  ).drop_duplicates()

file_name_map_df.to_csv(file_name_map_name, index=False)

In [11]:
# List the CDA folder to inspect the result of the renaming.
!ls {CDA_FILE_PATH}

attic
mn_034b66e07897fde6fdc27ee8c42dd539.xml
mn_13ff002a19516bc7e1039213a8510113.xml
mn_1562e983740c425fea1c4df43b6c7fe5.xml
mn_2c59c0cb0af5beea07b9549cae482913.xml
mn_2cd17f56fdbaf0792c8badaaf2f3328b.xml
mn_690eee739266146b55137e18d024bd8d.xml
mn_71ca3d925c1b4c4b158ab330b89faa81.xml
mn_832f18645bcc13b27600207cd93606b7.xml
mn_8415cf0c29000af661006a6e2017e53d.xml
mn_a186826f2db3a0e3a49ea66b448dd231.xml
mn_a2628ac4f3a9fb0243d0d9dfbe225656.xml
mn_a65018ddb1c54ea51a45841441890077.xml
mn_af817edf49e0ebac5ba227981c3b4f6e.xml
mn_b0a6b93c261e53f053a128e070bdc217.xml
mn_bbaf4fee421d581d64224170be4ad112.xml
output
