Skip to content

Commit

Permalink
Merge pull request #41 from jjjermiah/27-md5pyadd-tests-and-logging-+…
Browse files Browse the repository at this point in the history
…-error-handling-for-non-existent-files

27 md5pyadd tests and logging + error handling for non existent files
  • Loading branch information
jjjermiah committed Jan 21, 2024
2 parents 7401362 + ca80e28 commit c1ed27b
Show file tree
Hide file tree
Showing 6 changed files with 194 additions and 83 deletions.
23 changes: 8 additions & 15 deletions docs/Tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -433,7 +433,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand All @@ -459,7 +459,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand All @@ -480,7 +480,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading 5 series: 100%|██████████| 5/5 [00:23<00:00, 4.69s/it]"
"Downloading 5 series: 100%|██████████| 5/5 [00:26<00:00, 5.31s/it]"
]
},
{
Expand Down Expand Up @@ -545,21 +545,14 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading 5 series: 0%| | 0/5 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading 5 series: 100%|██████████| 5/5 [00:17<00:00, 3.53s/it]\n"
"Downloading 5 series: 100%|██████████| 5/5 [00:18<00:00, 3.69s/it]\n"
]
},
{
Expand All @@ -568,7 +561,7 @@
"True"
]
},
"execution_count": 16,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -578,7 +571,7 @@
" seriesUIDS[0:5], \n",
" downloadDir, \n",
" filePattern=\"%PatientName/%SeriesNumber-%SeriesInstanceUID/%InstanceNumber-%SOPInstanceUID.dcm\",\n",
" overwrite=True, nParallel=4)"
" overwrite=True, nParallel=5)"
]
},
{
Expand Down
57 changes: 29 additions & 28 deletions src/nbiatoolkit/dicomsort/dicomsort.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(

def generateFilePathFromDICOMAttributes(
self, dataset: pydicom.dataset.FileDataset
) -> str:
) -> str:
"""
Generate a file path for the DICOM file by formatting DICOM attributes.
"""
Expand All @@ -45,7 +45,7 @@ def generateFilePathFromDICOMAttributes(

def sortSingleDICOMFile(
self, filePath: str, option: str, overwrite: bool = False
) -> bool:
) -> bool:
assert option in ["copy", "move"], "Invalid option: symlink not implemented yet"

try:
Expand Down Expand Up @@ -77,38 +77,39 @@ def sortSingleDICOMFile(

return True

def sortDICOMFiles(self, option: str = "copy", overwrite: bool = False) -> bool:
    """
    Sort every DICOM file found under ``self.sourceDir`` into the target layout.

    Parameters
    ----------
    option : str
        How to place each file: ``"copy"`` or ``"move"``
        (validated by ``sortSingleDICOMFile``).
    overwrite : bool
        Whether existing destination files may be overwritten.

    Returns
    -------
    bool
        True only if every discovered ``.dcm`` file was sorted successfully.
    """
    dicom_file_paths = self._get_dicom_files()

    results = [
        self.sortSingleDICOMFile(file, option, overwrite)
        for file in dicom_file_paths
    ]

    # all() is True for an empty directory as well: nothing to sort is success.
    return all(results)


# Test case
if __name__ == "__main__":
def _get_dicom_files(self) -> list[str]:
dicom_file_paths = []
# Iterate over all files in the source directory
for root, dirs, files in os.walk(self.sourceDir):
for f in files:
dicom_file_paths.append(os.path.join(root, f)) if f.endswith(".dcm") else None

# Create an instance of DICOMSorter with the desired target pattern
sourceDir="/home/bioinf/bhklab/jermiah/projects/NBIA-toolkit/resources/rawdata/RADCURE-0281"
pattern = '%PatientName/%StudyDescription-%StudyDate/%SeriesNumber-%SeriesDescription-%SeriesInstanceUID/%InstanceNumber.dcm'
destinationDir="/home/bioinf/bhklab/jermiah/projects/NBIA-toolkit/resources/procdata"
sorter = DICOMSorter(
sourceDir = sourceDir,
destinationDir=destinationDir,
targetPattern=pattern,
truncateUID=True,
sanitizeFilename=True,
overwrite=True
)

sorter.sortDICOMFiles(option="move")
return dicom_file_paths

# Test case
# if __name__ == "__main__":

# sorter = DICOMSorter(
# sourceDir = sourceDir,
# destinationDir=destinationDir,
# targetPattern=pattern,
# truncateUID=True,
# sanitizeFilename=True,
# overwrite=True
# )

# sorter.sortDICOMFiles(option="move")



Expand Down
47 changes: 15 additions & 32 deletions src/nbiatoolkit/nbia.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import zipfile
from tqdm import tqdm



class NBIAClient:
"""
The NBIAClient class is a wrapper around the NBIA REST API. It provides
Expand All @@ -20,8 +22,6 @@ class NBIAClient:
and password, you can pass them to the constructor.
TODO:: Add docstring
FIXME:: logger prints duplicate logs if you instantiate the class more
than once
"""

def __init__(self,
Expand All @@ -48,31 +48,6 @@ def __init__(self,
def headers(self):
return self.api_headers

# setter method to update logger with a new instance of setup_logger
def setLogger(
    self,
    log_level: str = "INFO",
    console_logging: bool = False,
    log_file: str = None,
    log_dir: str = None,
    log_format: str = '%(asctime)s | %(name)s | %(levelname)s | %(message)s',
    datefmt: str = '%y-%m-%d %H:%M'
) -> bool:
    """
    Reconfigure ``self.log`` with a fresh logger built by ``setup_logger``.

    Parameters mirror ``setup_logger``; the logger name is fixed to
    ``"NBIAClient"``.  Returns True on success, False if logger creation
    raised (the error is reported through the previous logger).
    """
    try:
        # Replace the client's logger wholesale; name is always "NBIAClient".
        self.log = setup_logger(
            name="NBIAClient",
            log_level=log_level,
            console_logging=console_logging,
            log_file=log_file,
            log_dir=log_dir,
            log_format=log_format,
            datefmt=datefmt
        )
        return True
    except Exception as e:
        # NOTE(review): on failure self.log still refers to the previous
        # logger instance, so this error report is best-effort.
        self.log.error("Error setting up logger: %s", e)
        return False

def query_api(self, endpoint: NBIA_ENDPOINTS, params: dict = {}) -> dict:

query_url = NBIA_ENDPOINTS.BASE_URL.value + endpoint.value
Expand Down Expand Up @@ -227,7 +202,8 @@ def downloadSeries(
# downloads in the future
def _downloadSingleSeries(
self, SeriesInstanceUID: str, downloadDir: str,
filePattern: str, overwrite: bool) -> bool:
filePattern: str, overwrite: bool
) -> bool:

# create temporary directory
from tempfile import TemporaryDirectory
Expand All @@ -248,8 +224,11 @@ def _downloadSingleSeries(

with TemporaryDirectory() as tempDir:
file.extractall(path=tempDir)
if not validateMD5(seriesDir=tempDir) and not overwrite:
self.log.error("MD5 validation failed. Exiting...")

try:
validateMD5(seriesDir=tempDir)
except Exception as e:
self.log.error("Error validating MD5 hash: %s", e)
return False

# Create an instance of DICOMSorter with the desired target pattern
Expand All @@ -260,8 +239,12 @@ def _downloadSingleSeries(
truncateUID=True,
sanitizeFilename=True
)

sorter.sortDICOMFiles(option="move", overwrite=overwrite)
# sorter.sortDICOMFiles(option="move", overwrite=overwrite)
if not sorter.sortDICOMFiles(option="move", overwrite=overwrite):
self.log.error(
"Error sorting DICOM files for series %s\n \
failed files located at %s", SeriesInstanceUID, tempDir)
return False

return True

Expand Down
61 changes: 55 additions & 6 deletions src/nbiatoolkit/utils/md5.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import hashlib, os

class MD5HashMismatchError(Exception):
    """Raised when a file's computed MD5 digest differs from the expected hash."""

def calculateMD5(filepath: str) -> str:
hash_md5 = hashlib.md5()
with open(filepath, "rb") as f:
Expand All @@ -10,21 +14,66 @@ def calculateMD5(filepath: str) -> str:
def validateMD5(seriesDir: str) -> bool:
    """
    Validate every downloaded file against the MD5 manifest in ``seriesDir``.

    Parameters
    ----------
    seriesDir : str
        Directory containing the extracted series and an ``md5hashes.csv``
        manifest (header row, then ``<relative path>,<md5>`` rows).

    Returns
    -------
    bool
        True when every listed file exists and matches its hash; the
        manifest file itself is deleted on success.

    Raises
    ------
    FileNotFoundError
        If the manifest, or any file it lists, is missing.
    MD5HashMismatchError
        If a file's computed MD5 differs from its manifest entry.
    """
    md5File = os.path.join(seriesDir, "md5hashes.csv")

    if not os.path.isfile(md5File):
        raise FileNotFoundError("MD5 hash file not found in download directory.")

    with open(md5File, "r") as f:
        lines = f.readlines()

    # Skip the CSV header; each data row is "<relative path>,<md5 hex digest>".
    for line in lines[1:]:
        filepath = os.path.join(seriesDir, line.split(",")[0])
        if not os.path.isfile(filepath):
            raise FileNotFoundError(f"File not found in seriesDir: {filepath}")

        md5hash = line.split(",")[1].strip().lower()
        md5 = calculateMD5(filepath)

        if md5 != md5hash:
            raise MD5HashMismatchError(f"MD5 hash mismatch for file: {filepath}")

    # All hashes matched; the manifest is no longer needed.
    os.remove(md5File)
    return True


# Manual smoke test: download one series from NBIA and validate its MD5 hashes.
if __name__ == "__main__":
    import sys
    import io
    import os
    import zipfile
    from nbiatoolkit.utils.nbia_endpoints import NBIA_ENDPOINTS
    from nbiatoolkit import NBIAClient
    import requests
    from tempfile import TemporaryDirectory

    client = NBIAClient()
    series = '1.3.6.1.4.1.14519.5.2.1.6834.5010.189721824525842725510380467695'
    query_url = NBIA_ENDPOINTS.BASE_URL.value + NBIA_ENDPOINTS.DOWNLOAD_SERIES.value

    params = dict()
    params["SeriesInstanceUID"] = series

    response = requests.get(
        url=query_url,
        headers=client.api_headers,
        params=params
    )

    # The series arrives as a zip archive; unpack it into a scratch directory.
    file = zipfile.ZipFile(io.BytesIO(response.content))

    tempDir_ = TemporaryDirectory()
    tempDir = tempDir_.name

    file.extractall(path=tempDir)

    try:
        validateMD5(tempDir)
        print("MD5 hashes validated successfully.")
    # validateMD5 raises FileNotFoundError for missing manifest/files and
    # MD5HashMismatchError for bad hashes (it no longer uses assert, so
    # catching AssertionError here would let failures escape).
    except (FileNotFoundError, MD5HashMismatchError) as e:
        print(f"Error validating MD5 hashes: {e}")
        sys.exit(1)

0 comments on commit c1ed27b

Please sign in to comment.