Merge pull request #75 from jjjermiah/46-feature-get_collection_descriptions

feat: add getCollectionDescriptions method, with tests, and updated d…
jjjermiah · Feb 3, 2024 · 89133ac · 89133ac
2 parents 0edfd89 + 8617395
commit 89133ac
Show file tree

Hide file tree

Showing 6 changed files with 173 additions and 10 deletions.
diff --git a/docs/Tutorial.ipynb b/docs/Tutorial.ipynb
@@ -23,7 +23,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,16 +38,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'0.15.1'"
+       "'0.18.1'"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -74,7 +74,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -94,7 +94,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -110,6 +110,48 @@
     "print(collections)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### get Collection Description\n",
+    "\n",
+    "``` python\n",
+    "getCollectionDescriptions(\n",
+    "    collectionName: str     # (required)\n",
+    ")\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[{'collectionName': 'TCGA-BLCA',\n",
+      "  'description': 'The Cancer Genome Atlas-Bladder Endothelial Carcinoma '\n",
+      "                 '(TCGA-BLCA) data collection is part of a larger effort to '\n",
+      "                 'enhance the TCGA http://cancergenome.nih.gov/ data set with '\n",
+      "                 'characterized radiological images. The Cancer Imaging '\n",
+      "                 'Program (CIP), with the cooperation of several of the TCGA '\n",
+      "                 'tissue-contributing institutions, has archived a large '\n",
+      "                 'portion of the radiological images of the '\n",
+      "                 'genetically-analyzed BLCA cases. Please see the TCGA-BLCA '\n",
+      "                 'page to learn more about the images and to obtain any '\n",
+      "                 'supporting metadata for this collection.',\n",
+      "  'descriptionURI': 'https://doi.org/10.7937/K9/TCIA.2016.8LNG8XDR',\n",
+      "  'lastUpdated': '2023-03-16'}]\n"
+     ]
+    }
+   ],
+   "source": [
+    "pprint(client.getCollectionDescriptions(\"TCGA-BLCA\"))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/src/nbiatoolkit/nbia.py b/src/nbiatoolkit/nbia.py
@@ -1,7 +1,6 @@
 from .auth import OAuth2
-from .utils.nbia_endpoints import NBIA_ENDPOINTS
 from .logger.logger import setup_logger
-from .utils.md5 import validateMD5
+from .utils import NBIA_ENDPOINTS, validateMD5, clean_html, convertMillis
 from .dicomsort import DICOMSorter
 
 import requests
@@ -108,6 +107,27 @@ def getCollections(self, prefix: str = "") -> Union[list[str], None]:
                 collections.append(name)
         return collections
 
+    def getCollectionDescriptions(self, collectionName : str) -> Union[list[dict[str, str]], None]:
+        """Fetch the description record of a single collection.
+
+        Only the first entry of the API response is used.
+
+        Args:
+            collectionName (str): Name of the collection to look up (required).
+
+        Returns:
+            list[dict[str, str]]: A one-element list whose dictionary holds
+            ``collectionName``, ``description`` (passed through ``clean_html``),
+            ``descriptionURI`` and ``lastUpdated`` (the API's
+            ``collectionDescTimestamp`` passed through ``convertMillis``).
+
+        Raises:
+            ValueError: If the API response is empty (or ``None``), or if its
+                first entry is not a dictionary.
+        """
+        # Must stay the first statement: parsePARAMS receives locals(), so no
+        # other local variable may be bound before this call.
+        PARAMS = self.parsePARAMS(locals())
+
+        response = self.query_api(NBIA_ENDPOINTS.GET_COLLECTION_DESCRIPTIONS, PARAMS)
+
+        # A truthiness check also rejects a ``None`` response, which
+        # ``len(response)`` would turn into an unrelated TypeError.
+        # NOTE(review): query_api is not visible here; the ``Union[..., None]``
+        # annotations suggest it may return None -- confirm.
+        if not response:
+            raise ValueError("The response from the API is empty. Please check the collection name.")
+
+        api_response = response[0]
+        if not isinstance(api_response, dict):
+            raise ValueError("The response from the API is not a dictionary")
+
+        description_record: dict[str, str] = {
+            "collectionName": api_response['collectionName'],
+            "description": clean_html(api_response['description']),
+            "descriptionURI": api_response['descriptionURI'],
+            "lastUpdated": convertMillis(millis=int(api_response['collectionDescTimestamp'])),
+        }
+
+        return [description_record]
+
     def getModalityValues(
         self, Collection: str = "", BodyPartExamined: str = ""
     ) -> Union[list[str], None]:

diff --git a/src/nbiatoolkit/utils/__init__.py b/src/nbiatoolkit/utils/__init__.py
@@ -1,4 +1,4 @@
 from .nbia_endpoints import NBIA_ENDPOINTS
 from .md5 import validateMD5
-
-__all__ = ["NBIA_ENDPOINTS", "validateMD5"]
+from .parsers import convertMillis, clean_html
+__all__ = ["NBIA_ENDPOINTS", "validateMD5", "convertMillis", "clean_html"]
diff --git a/src/nbiatoolkit/utils/parsers.py b/src/nbiatoolkit/utils/parsers.py
@@ -0,0 +1,39 @@
+from bs4 import BeautifulSoup
+from datetime import datetime
+
+
+def clean_html(html_string: str) -> str:
+    """
+    Strip HTML markup from a string and return its plain-text content.
+
+    The text is extracted with BeautifulSoup's ``html.parser``; the extracted
+    text pieces are stripped and joined with a single space, and non-breaking
+    spaces (U+00A0) are replaced by regular spaces.
+
+    Args:
+        html_string (str): The input HTML string to be cleaned. Must not be empty.
+
+    Returns:
+        str: The cleaned text content without HTML tags.
+
+    Raises:
+        AssertionError: If the input is not a string or is an empty string.
+    """
+    # Raised explicitly instead of via ``assert`` so the validation is not
+    # stripped when Python runs with -O; type and messages are unchanged.
+    if not isinstance(html_string, str):
+        raise AssertionError("The input must be a string")
+    if html_string == "":
+        raise AssertionError("The input string cannot be empty")
+    soup = BeautifulSoup(html_string, 'html.parser')
+    text_content = soup.get_text(separator=' ', strip=True)
+    return text_content.replace('\xa0', ' ')
+
+
+from datetime import datetime
+
+def convertMillis(millis: int) -> str:
+    """
+    Convert a millisecond timestamp to a formatted date string.
+
+    The value is divided by 1000 and handed to ``datetime.fromtimestamp``
+    without a ``tz`` argument, so the resulting date is expressed in the
+    local timezone of the machine running the code.
+
+    Args:
+        millis (int): The number of milliseconds to convert.
+
+    Returns:
+        str: The formatted date string in the format 'YYYY-MM-DD'.
+
+    Raises:
+        AssertionError: If the input is not an integer.
+    """
+    # Raised explicitly instead of via ``assert`` so the check is not
+    # stripped when Python runs with -O; type and message are unchanged.
+    if not isinstance(millis, int):
+        raise AssertionError("The input must be an integer")
+    return datetime.fromtimestamp(millis / 1000.0).strftime('%Y-%m-%d')
diff --git a/tests/test_nbia.py b/tests/test_nbia.py
@@ -172,3 +172,19 @@ def test_downloadSeries(nbia_client, nbia_collections, nbia_patients):
         assert file.endswith(".dcm")
         assert file[:-4].isdigit()
 
+def test_getCollectionDescriptions(nbia_client):
+    # Query the "4D-Lung" collection and check the shape of the result:
+    # a one-element list holding a dictionary with the four expected keys.
+    expected_name = "4D-Lung"
+    result = nbia_client.getCollectionDescriptions(expected_name)
+    assert isinstance(result, list)
+    assert len(result) == 1
+    record = result[0]
+    assert isinstance(record, dict)
+    assert "collectionName" in record
+    assert record["collectionName"] == expected_name
+    for key in ("description", "descriptionURI", "lastUpdated"):
+        assert key in record
+
+def test_failed_getCollectionDescriptions(nbia_client):
+    # Looking up the name "bad_collection" is expected to raise ValueError.
+    with pytest.raises(ValueError):
+        nbia_client.getCollectionDescriptions("bad_collection")
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
@@ -0,0 +1,46 @@
+
+from numpy import exp
+from src.nbiatoolkit.utils.parsers import clean_html, convertMillis
+from datetime import datetime
+import pytest
+def test_clean_html_valid_input():
+    # Tags are removed and HTML entities are decoded to their characters.
+    markup = "<p>This is <b>bold</b> text with special characters: &amp; &lt; &gt;</p>"
+    cleaned = clean_html(markup)
+    assert cleaned == "This is bold text with special characters: & < >"
+
+def test_clean_html_empty_input():
+    # An empty string is rejected by clean_html's input validation, so there
+    # is no expected output to compare against -- only the raised error.
+    with pytest.raises(AssertionError):
+        clean_html("")
+
+def test_clean_html_no_html_tags():
+    # Text that contains no markup must come back unchanged.
+    plain_text = "This is a plain text without any HTML tags"
+    assert clean_html(plain_text) == plain_text
+
+def test_clean_html_special_characters_only():
+    # Entities on their own, with no surrounding tags, are still decoded.
+    entities = "&amp; &lt; &gt;"
+    assert clean_html(entities) == "& < >"
+
+
+def test_convertMillis_valid_input():
+    # Round-trip a known date through its millisecond timestamp.
+    epoch_seconds = datetime(2021, 9, 1).timestamp()
+    result = convertMillis(int(epoch_seconds * 1000))
+    assert result == "2021-09-01"
+
+def test_convertMillis_invalid_input():
+    # A string is not accepted in place of an integer millisecond value.
+    millis = "1630444800000"
+    # pytest.raises fails the test on its own when nothing is raised, so no
+    # ``assert False`` sentinel (which an ``except AssertionError`` clause
+    # would also catch) is needed.
+    with pytest.raises(AssertionError) as exc_info:
+        convertMillis(millis)  # type: ignore
+    assert str(exc_info.value) == "The input must be an integer"