<a href="https://colab.research.google.com/github/jhajagos/PHR2OHDSI/blob/main/Hash_XML_Filenames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook hashes the file names of CDAs downloaded from a patient portal. The downloaded CDAs often contain identifiable information. To limit exposure, we rename the files and maintain a map file that links each original name to its hashed name.

The script allows new files to be added and existing files to be updated.


In [None]:
import os
import glob
import pathlib
import csv
import hashlib
import shutil
import pandas as pd

In [None]:
# Mount Google Drive inside the Colab runtime; the CDA folder configured in this
# notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder holding the downloaded CDA documents. The trailing "/" is required:
# later cells build glob patterns by plain string concatenation with this value.
CDA_FILE_PATH = "/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/"
# Salt appended to each original file path before it is hashed into a new name.
# Do not edit this string (not even to fix spelling): any change alters every
# generated "mn_*" file name.
salt_for_renaming = "C-CDA docuemtns are an unsung hero"

In [None]:
# The "attic" sub-folder receives the original (identifiable) files after they
# have been copied under their hashed names; it also stores the name map CSV.
attic_path = pathlib.Path(CDA_FILE_PATH) / "attic"
file_name_map_name = attic_path / "file_name_map.csv"

if not attic_path.exists():
  # First run: create the attic. The os module has no `makedir` function
  # (only `mkdir` / `makedirs`), so the folder is created through pathlib.
  attic_path.mkdir(parents=True)
  # No map can exist yet when the attic was just created.
  file_name_map_df = None

In [None]:
# Every XML document in the source folder, lower-case extension matches first,
# followed by upper-case extension matches.
xml_files = [
  matched_path
  for extension_pattern in ("*.xml", "*.XML")
  for matched_path in glob.glob(str(CDA_FILE_PATH) + extension_pattern)
]

# Path -> SHA-256 hex digest of the file's bytes.
xml_files_hash = {
  source_path: hashlib.sha256(pathlib.Path(source_path).read_bytes()).hexdigest()
  for source_path in xml_files
}


{'/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_832f18645bcc13b27600207cd93606b7.xml': '9dd4bfb7da4bf8474f63f13fc8222ed2c58ee41b53fa33b1c722db755ff8ef1a', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_1562e983740c425fea1c4df43b6c7fe5.xml': '65e33b422ce3d3b548dd005b5a8bd1486e92b3df5935fb0c6357bc0dfcf24e44', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_a65018ddb1c54ea51a45841441890077.xml': '1abab8c344a8a5fa9ed50e7b61dbf3a20137ec996bccf858e5df99afa0c3f612', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_71ca3d925c1b4c4b158ab330b89faa81.xml': '27316175b42158cfeed4f9fd5d4f9acd5fb1b90a990d24fa1903a6fb885078bc', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_b0a6b93c261e53f053a128e070bdc217.xml': 'a40bc84e7eabf11f6ba45345bdccc994bc1e2895613d83db176accb50d94e4f1', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_bbaf4fee421d581d64224170be4ad112.xml': '22c053fc646810519126f9b13436d7fc4855b27ca156cad89b5af96faf510815', '/content

In [None]:
# Files in the source folder whose names carry the "mn_" prefix that this
# notebook gives to renamed documents.
hashed_xml_files = glob.glob(f"{CDA_FILE_PATH}mn_*.xml")

# Content signature (SHA-256 hex digest) for each of those files, keyed by path.
hashed_xml_files_hash = {}
for renamed_path in hashed_xml_files:
  with open(renamed_path, "rb") as renamed_handle:
    content_digest = hashlib.sha256(renamed_handle.read()).hexdigest()
  hashed_xml_files_hash[renamed_path] = content_digest



['/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_832f18645bcc13b27600207cd93606b7.xml', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_1562e983740c425fea1c4df43b6c7fe5.xml', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_a65018ddb1c54ea51a45841441890077.xml', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_71ca3d925c1b4c4b158ab330b89faa81.xml', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_b0a6b93c261e53f053a128e070bdc217.xml', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_bbaf4fee421d581d64224170be4ad112.xml', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_a2628ac4f3a9fb0243d0d9dfbe225656.xml', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_a186826f2db3a0e3a49ea66b448dd231.xml', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_2c59c0cb0af5beea07b9549cae482913.xml', '/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/mn_690eee739266146b55137e18d024bd8d.xml', '/content/drive/MyDrive/phr_o

In [None]:
# Work out the hashed name for every XML file and detect files whose content
# differs from the hashed copy produced by an earlier run.
updated_files = []
file_names_map_dict = {}

# hashed_xml_files holds full paths while renamed_file_name is a bare file name,
# so index the existing digests by base name to make the lookup meaningful.
hashed_digest_by_name = {
  os.path.basename(hashed_path): digest
  for hashed_path, digest in hashed_xml_files_hash.items()
}

for file_name in xml_files:
  # The new name is a salted BLAKE2b digest (16 bytes) of the full original path.
  renamed_file_name = "mn_"+  hashlib.blake2b((file_name + salt_for_renaming).encode("utf8"), digest_size=16).hexdigest() + ".xml"
  file_names_map_dict[file_name] = renamed_file_name
  # Check if file has changed
  if renamed_file_name in hashed_digest_by_name:
    h_renamed = hashed_digest_by_name[renamed_file_name]
    h_file = xml_files_hash[file_name]
    if h_renamed != h_file:
      updated_files.append(file_name)
      print(f"Files '{file_name}' has been updated")



[]


In [None]:
# Start from the already-hashed files and drop any entry listed in updated_files.
first_pass_files = set(hashed_xml_files).difference(updated_files)
# Everything found in the folder that is not in that set still needs renaming.
files_to_rename = set(xml_files).difference(first_pass_files)
# Accumulator for the original -> renamed records written to the map file.
file_names_map = list()

print(files_to_rename)

{'/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/export_cda.xml'}


In [None]:
# Copy each pending file under its hashed name, park the original in the attic,
# and persist the original -> renamed mapping as a CSV inside the attic.
map_columns = ["original_file_name", "renamed_file_name"]

for file_name in files_to_rename:

  renamed_file_name = file_names_map_dict[file_name]

  file_names_map.append({"original_file_name": file_name, "renamed_file_name": renamed_file_name})
  # Copy first, then move: the hashed copy exists before the original leaves the folder.
  shutil.copyfile(file_name, pathlib.Path(CDA_FILE_PATH) / renamed_file_name)
  shutil.move(file_name,  attic_path /  os.path.split(file_name)[1])

# Explicit columns keep the header intact even when nothing was renamed this run.
file_name_map_df = pd.DataFrame(file_names_map, columns=map_columns)

if os.path.exists(file_name_map_name):
  existing_file_name_map_df = pd.read_csv(file_name_map_name)
  # Append the new records as rows (axis=0). axis=1 would paste them beside the
  # existing ones as duplicate columns and corrupt the map.
  file_name_map_df = pd.concat(
    [existing_file_name_map_df, file_name_map_df], axis=0, ignore_index=True
  ).drop_duplicates()

file_name_map_df.to_csv(file_name_map_name, index=False)

In [None]:
# List the contents of the CDA folder to inspect the result of the renaming.
!ls {CDA_FILE_PATH}

attic
mn_034b66e07897fde6fdc27ee8c42dd539.xml
mn_13ff002a19516bc7e1039213a8510113.xml
mn_1562e983740c425fea1c4df43b6c7fe5.xml
mn_2c59c0cb0af5beea07b9549cae482913.xml
mn_2cd17f56fdbaf0792c8badaaf2f3328b.xml
mn_690eee739266146b55137e18d024bd8d.xml
mn_71ca3d925c1b4c4b158ab330b89faa81.xml
mn_832f18645bcc13b27600207cd93606b7.xml
mn_8415cf0c29000af661006a6e2017e53d.xml
mn_a186826f2db3a0e3a49ea66b448dd231.xml
mn_a2628ac4f3a9fb0243d0d9dfbe225656.xml
mn_a65018ddb1c54ea51a45841441890077.xml
mn_af817edf49e0ebac5ba227981c3b4f6e.xml
mn_b0a6b93c261e53f053a128e070bdc217.xml
mn_bbaf4fee421d581d64224170be4ad112.xml
output
