Skip to content

Commit

Permalink
Merge pull request #41 from jjjermiah/27-md5pyadd-tests-and-logging-+…
Browse files Browse the repository at this point in the history
…-error-handling-for-non-existent-files

27 md5pyadd tests and logging + error handling for non existent files
  • Loading branch information
jjjermiah committed Jan 21, 2024
2 parents 7401362 + ca80e28 commit c1ed27b
Show file tree
Hide file tree
Showing 6 changed files with 194 additions and 83 deletions.
23 changes: 8 additions & 15 deletions docs/Tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -433,7 +433,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand All @@ -459,7 +459,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand All @@ -480,7 +480,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading 5 series: 100%|██████████| 5/5 [00:23<00:00, 4.69s/it]"
"Downloading 5 series: 100%|██████████| 5/5 [00:26<00:00, 5.31s/it]"
]
},
{
Expand Down Expand Up @@ -545,21 +545,14 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading 5 series: 0%| | 0/5 [00:00<?, ?it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading 5 series: 100%|██████████| 5/5 [00:17<00:00, 3.53s/it]\n"
"Downloading 5 series: 100%|██████████| 5/5 [00:18<00:00, 3.69s/it]\n"
]
},
{
Expand All @@ -568,7 +561,7 @@
"True"
]
},
"execution_count": 16,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -578,7 +571,7 @@
" seriesUIDS[0:5], \n",
" downloadDir, \n",
" filePattern=\"%PatientName/%SeriesNumber-%SeriesInstanceUID/%InstanceNumber-%SOPInstanceUID.dcm\",\n",
" overwrite=True, nParallel=4)"
" overwrite=True, nParallel=5)"
]
},
{
Expand Down
57 changes: 29 additions & 28 deletions src/nbiatoolkit/dicomsort/dicomsort.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(

def generateFilePathFromDICOMAttributes(
self, dataset: pydicom.dataset.FileDataset
) -> str:
) -> str:
"""
Generate a file path for the DICOM file by formatting DICOM attributes.
"""
Expand All @@ -45,7 +45,7 @@ def generateFilePathFromDICOMAttributes(

def sortSingleDICOMFile(
self, filePath: str, option: str, overwrite: bool = False
) -> bool:
) -> bool:
assert option in ["copy", "move"], "Invalid option: symlink not implemented yet"

try:
Expand Down Expand Up @@ -77,38 +77,39 @@ def sortSingleDICOMFile(

return True

def sortDICOMFiles(self, option: str = "copy", overwrite: bool = False) -> bool:
    """
    Sort every DICOM file found under ``self.sourceDir`` into the target layout.

    Parameters
    ----------
    option : str
        How to place each file: ``"copy"`` or ``"move"``
        (validated by ``sortSingleDICOMFile``).
    overwrite : bool
        Whether existing destination files may be overwritten.

    Returns
    -------
    bool
        True only if every discovered ``.dcm`` file was sorted successfully.
    """
    dicom_file_paths = self._get_dicom_files()

    results = [
        self.sortSingleDICOMFile(file, option, overwrite)
        for file in dicom_file_paths
    ]

    # all() is True for an empty directory as well: nothing to sort is success.
    return all(results)


# Test case
if __name__ == "__main__":
def _get_dicom_files(self) -> list[str]:
dicom_file_paths = []
# Iterate over all files in the source directory
for root, dirs, files in os.walk(self.sourceDir):
for f in files:
dicom_file_paths.append(os.path.join(root, f)) if f.endswith(".dcm") else None

# Create an instance of DICOMSorter with the desired target pattern
sourceDir="/home/bioinf/bhklab/jermiah/projects/NBIA-toolkit/resources/rawdata/RADCURE-0281"
pattern = '%PatientName/%StudyDescription-%StudyDate/%SeriesNumber-%SeriesDescription-%SeriesInstanceUID/%InstanceNumber.dcm'
destinationDir="/home/bioinf/bhklab/jermiah/projects/NBIA-toolkit/resources/procdata"
sorter = DICOMSorter(
sourceDir = sourceDir,
destinationDir=destinationDir,
targetPattern=pattern,
truncateUID=True,
sanitizeFilename=True,
overwrite=True
)

sorter.sortDICOMFiles(option="move")
return dicom_file_paths

# Test case
# if __name__ == "__main__":

# sorter = DICOMSorter(
# sourceDir = sourceDir,
# destinationDir=destinationDir,
# targetPattern=pattern,
# truncateUID=True,
# sanitizeFilename=True,
# overwrite=True
# )

# sorter.sortDICOMFiles(option="move")



Expand Down
47 changes: 15 additions & 32 deletions src/nbiatoolkit/nbia.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import zipfile
from tqdm import tqdm



class NBIAClient:
"""
The NBIAClient class is a wrapper around the NBIA REST API. It provides
Expand All @@ -20,8 +22,6 @@ class NBIAClient:
and password, you can pass them to the constructor.
TODO:: Add docstring
FIXME:: logger prints duplicate logs if you instantiate the class more
than once
"""

def __init__(self,
Expand All @@ -48,31 +48,6 @@ def __init__(self,
def headers(self):
return self.api_headers

# setter method to update logger with a new instance of setup_logger
def setLogger(
    self,
    log_level: str = "INFO",
    console_logging: bool = False,
    log_file: str = None,
    log_dir: str = None,
    log_format: str = '%(asctime)s | %(name)s | %(levelname)s | %(message)s',
    datefmt: str = '%y-%m-%d %H:%M'
) -> bool:
    """
    Reconfigure ``self.log`` with a fresh logger built by ``setup_logger``.

    Parameters mirror ``setup_logger``; the logger name is fixed to
    ``"NBIAClient"``.  Returns True on success, False if logger creation
    raised (the error is reported through the previous logger).
    """
    try:
        # Replace the client's logger wholesale; name is always "NBIAClient".
        self.log = setup_logger(
            name="NBIAClient",
            log_level=log_level,
            console_logging=console_logging,
            log_file=log_file,
            log_dir=log_dir,
            log_format=log_format,
            datefmt=datefmt
        )
        return True
    except Exception as e:
        # NOTE(review): on failure self.log still refers to the previous
        # logger instance, so this error report is best-effort.
        self.log.error("Error setting up logger: %s", e)
        return False

def query_api(self, endpoint: NBIA_ENDPOINTS, params: dict = {}) -> dict:

query_url = NBIA_ENDPOINTS.BASE_URL.value + endpoint.value
Expand Down Expand Up @@ -227,7 +202,8 @@ def downloadSeries(
# downloads in the future
def _downloadSingleSeries(
self, SeriesInstanceUID: str, downloadDir: str,
filePattern: str, overwrite: bool) -> bool:
filePattern: str, overwrite: bool
) -> bool:

# create temporary directory
from tempfile import TemporaryDirectory
Expand All @@ -248,8 +224,11 @@ def _downloadSingleSeries(

with TemporaryDirectory() as tempDir:
file.extractall(path=tempDir)
if not validateMD5(seriesDir=tempDir) and not overwrite:
self.log.error("MD5 validation failed. Exiting...")

try:
validateMD5(seriesDir=tempDir)
except Exception as e:
self.log.error("Error validating MD5 hash: %s", e)
return False

# Create an instance of DICOMSorter with the desired target pattern
Expand All @@ -260,8 +239,12 @@ def _downloadSingleSeries(
truncateUID=True,
sanitizeFilename=True
)

sorter.sortDICOMFiles(option="move", overwrite=overwrite)
# sorter.sortDICOMFiles(option="move", overwrite=overwrite)
if not sorter.sortDICOMFiles(option="move", overwrite=overwrite):
self.log.error(
"Error sorting DICOM files for series %s\n \
failed files located at %s", SeriesInstanceUID, tempDir)
return False

return True

Expand Down
61 changes: 55 additions & 6 deletions src/nbiatoolkit/utils/md5.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import hashlib, os

class MD5HashMismatchError(Exception):
    """Raised when a file's computed MD5 digest differs from the expected hash."""

def calculateMD5(filepath: str) -> str:
hash_md5 = hashlib.md5()
with open(filepath, "rb") as f:
Expand All @@ -10,21 +14,66 @@ def calculateMD5(filepath: str) -> str:
def validateMD5(seriesDir: str) -> bool:
    """
    Validate every downloaded file against the MD5 manifest in ``seriesDir``.

    Parameters
    ----------
    seriesDir : str
        Directory containing the extracted series and an ``md5hashes.csv``
        manifest (header row, then ``<relative path>,<md5>`` rows).

    Returns
    -------
    bool
        True when every listed file exists and matches its hash; the
        manifest file itself is deleted on success.

    Raises
    ------
    FileNotFoundError
        If the manifest, or any file it lists, is missing.
    MD5HashMismatchError
        If a file's computed MD5 differs from its manifest entry.
    """
    md5File = os.path.join(seriesDir, "md5hashes.csv")

    if not os.path.isfile(md5File):
        raise FileNotFoundError("MD5 hash file not found in download directory.")

    with open(md5File, "r") as f:
        lines = f.readlines()

    # Skip the CSV header; each data row is "<relative path>,<md5 hex digest>".
    for line in lines[1:]:
        filepath = os.path.join(seriesDir, line.split(",")[0])
        if not os.path.isfile(filepath):
            raise FileNotFoundError(f"File not found in seriesDir: {filepath}")

        md5hash = line.split(",")[1].strip().lower()
        md5 = calculateMD5(filepath)

        if md5 != md5hash:
            raise MD5HashMismatchError(f"MD5 hash mismatch for file: {filepath}")

    # All hashes matched; the manifest is no longer needed.
    os.remove(md5File)
    return True


# Manual smoke test: download one series from NBIA and validate its MD5 hashes.
if __name__ == "__main__":
    import sys
    import io
    import os
    import zipfile
    from nbiatoolkit.utils.nbia_endpoints import NBIA_ENDPOINTS
    from nbiatoolkit import NBIAClient
    import requests
    from tempfile import TemporaryDirectory

    client = NBIAClient()
    series = '1.3.6.1.4.1.14519.5.2.1.6834.5010.189721824525842725510380467695'
    query_url = NBIA_ENDPOINTS.BASE_URL.value + NBIA_ENDPOINTS.DOWNLOAD_SERIES.value

    params = dict()
    params["SeriesInstanceUID"] = series

    response = requests.get(
        url=query_url,
        headers=client.api_headers,
        params=params
    )

    # The series arrives as a zip archive; unpack it into a scratch directory.
    file = zipfile.ZipFile(io.BytesIO(response.content))

    tempDir_ = TemporaryDirectory()
    tempDir = tempDir_.name

    file.extractall(path=tempDir)

    try:
        validateMD5(tempDir)
        print("MD5 hashes validated successfully.")
    # validateMD5 raises FileNotFoundError for missing manifest/files and
    # MD5HashMismatchError for bad hashes (it no longer uses assert, so
    # catching AssertionError here would let failures escape).
    except (FileNotFoundError, MD5HashMismatchError) as e:
        print(f"Error validating MD5 hashes: {e}")
        sys.exit(1)

0 comments on commit c1ed27b

Please sign in to comment.