<a href="https://colab.research.google.com/github/jhajagos/PHR2OHDSI/blob/main/Hash_XML_Filenames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook hashes the file names of CDAs downloaded from a patient portal. The downloaded CDAs often contain identifiable information.


In [21]:
import os
import glob
import pathlib
import csv
import hashlib
import shutil
import pandas as pd

In [22]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be reached through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# Directory holding the downloaded CDA XML files. The trailing slash matters:
# the glob patterns in later cells are built by plain string concatenation.
CDA_FILE_PATH = "/content/drive/MyDrive/phr_ohdsi/source/jgh_documents/"
# Salt appended to each original file path before it is hashed into the new
# file name. Do not edit this string (not even to fix the typo): any change
# produces different hashed names for the same source files.
# NOTE(review): the salt is stored in clear text in the notebook - confirm that
# this is acceptable for the intended level of de-identification.
salt_for_renaming = "C-CDA docuemtns are an unsung hero"

In [24]:
# Sub-directory that receives the original (un-hashed) files once a hashed
# copy of each has been written next to them.
attic_path = pathlib.Path(CDA_FILE_PATH) / "attic"

# The `os` module has no `makedir` function, so the previous call raised
# AttributeError whenever the attic was missing. Path.mkdir with exist_ok=True
# creates the directory on the first run and is a no-op on later runs.
# `parents` is left at its default so a mistyped CDA_FILE_PATH still fails
# loudly instead of silently creating a new, empty source tree.
attic_path.mkdir(exist_ok=True)

In [25]:
# Collect every XML document in the source folder; both lower- and upper-case
# extensions are matched, lower-case results first.
xml_files = [
    matched_path
    for extension_pattern in ("*.xml", "*.XML")
    for matched_path in glob.glob(str(CDA_FILE_PATH) + extension_pattern)
]

In [26]:
# Files that already carry the "mn_" prefix were hashed by an earlier run and
# must not be renamed a second time.
hashed_xml_files = glob.glob(f"{CDA_FILE_PATH}mn_*.xml")

In [27]:
# Everything matched by the generic XML globs that does not already carry the
# "mn_" prefix still needs a hashed name.
files_to_rename = set(xml_files) - set(hashed_xml_files)
file_names_map = []

# Fixed column order so the mapping CSV always gets a header row, even on a
# run where there is nothing to rename.
file_name_map_columns = ["original_file_name", "renamed_file_name"]
file_name_map_path = attic_path / "file_name_map.csv"

# Iterate in sorted order so the rows appended to the mapping file are
# reproducible from run to run (set iteration order is not).
for file_name in sorted(files_to_rename):

  # The new name is a salted BLAKE2b digest (16 bytes -> 32 hex characters) of
  # the full path exactly as returned by glob, so the same path and salt always
  # map to the same hashed name.
  digest = hashlib.blake2b((file_name + salt_for_renaming).encode("utf8"), digest_size=16).hexdigest()
  renamed_file_name = "mn_" + digest + ".xml"
  file_names_map.append({"original_file_name": file_name, "renamed_file_name": renamed_file_name})

  # Write the hashed copy first, then move the original into the attic; the
  # original is never removed before its hashed copy exists.
  shutil.copyfile(file_name, pathlib.Path(CDA_FILE_PATH) / renamed_file_name)
  shutil.move(file_name, attic_path / os.path.split(file_name)[1])

file_name_map_df = pd.DataFrame(file_names_map, columns=file_name_map_columns)

if os.path.exists(file_name_map_path):
  # Append this run's mappings as new ROWS below the ones already recorded.
  # The earlier axis=1 concatenated them side by side as extra columns, which
  # corrupted the mapping file on every run after the first.
  existing_file_name_map_df = pd.read_csv(file_name_map_path)
  file_name_map_df = pd.concat(
      [existing_file_name_map_df, file_name_map_df], axis=0, ignore_index=True
  ).drop_duplicates()

file_name_map_df.to_csv(file_name_map_path, index=False)


In [28]:
# List the source folder to check the result of the renaming step.
!ls {CDA_FILE_PATH}

attic					 mn_8415cf0c29000af661006a6e2017e53d.xml
mn_034b66e07897fde6fdc27ee8c42dd539.xml  mn_a186826f2db3a0e3a49ea66b448dd231.xml
mn_13ff002a19516bc7e1039213a8510113.xml  mn_a2628ac4f3a9fb0243d0d9dfbe225656.xml
mn_1562e983740c425fea1c4df43b6c7fe5.xml  mn_a65018ddb1c54ea51a45841441890077.xml
mn_2c59c0cb0af5beea07b9549cae482913.xml  mn_af817edf49e0ebac5ba227981c3b4f6e.xml
mn_2cd17f56fdbaf0792c8badaaf2f3328b.xml  mn_b0a6b93c261e53f053a128e070bdc217.xml
mn_690eee739266146b55137e18d024bd8d.xml  mn_bbaf4fee421d581d64224170be4ad112.xml
mn_71ca3d925c1b4c4b158ab330b89faa81.xml  output
mn_832f18645bcc13b27600207cd93606b7.xml
