Commit

Merge branch 'develop' into bagel
YaphetKG committed Aug 7, 2024
2 parents 9bc078f + a4917a7 commit e6e6d41
Showing 5 changed files with 51 additions and 29 deletions.
9 changes: 2 additions & 7 deletions Dockerfile
@@ -3,17 +3,12 @@
# A container for the core semantic-search capability.
#
######################################################
FROM python:3.12.1-alpine3.19
FROM python:alpine3.20


# Install required packages
RUN apk update && \
apk add g++ make libexpat=2.6.2-r0 libssl3=3.1.4-r6 libcrypto3=3.1.4-r6


#upgrade openssl \

#RUN apk add openssl=3.1.4-r5
apk add g++ make


RUN pip install --upgrade pip
2 changes: 1 addition & 1 deletion requirements.txt
@@ -21,7 +21,7 @@ requests
redis
requests-cache
six

retrying
# Click for command line arguments
# We use Click 7.0 because that's what one of the pinned packages above use.
click
9 changes: 5 additions & 4 deletions src/dug/config.py
@@ -28,8 +28,9 @@ class Config:
nboost_port: int = 8000

program_sort_list: str = ""
program_name_mappings: dict=field(
default_factory=lambda:{})
program_description: dict=field(default_factory=lambda:{})
consent_id_path: str= ""


# Preprocessor config that will be passed to annotate.Preprocessor constructor
preprocessor: dict = field(
@@ -156,8 +157,8 @@ def from_env(cls):
"redis_host": "REDIS_HOST",
"redis_port": "REDIS_PORT",
"redis_password": "REDIS_PASSWORD",
"program_sort_list": "PROGRAM_SORT_LIST",
"program_name_mappings" : "PROGRAM_NAME_MAPPINGS"
"program_description": "PROGRAM_DESCRIPTION",
"consent_id_path": "CONSENT_ID_PATH"
}

kwargs = {}
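For context on the config change above: program_description and consent_id_path are new Config fields wired to the PROGRAM_DESCRIPTION and CONSENT_ID_PATH environment variables. A minimal sketch of how they might be supplied, assuming from_env() copies each listed environment variable straight into the matching field; the JSON shape and path below are illustrative, based on how async_search.py consumes these values:

import json
import os

from dug.config import Config

# PROGRAM_DESCRIPTION is json.loads()-ed in search_program_list(), so it is
# exported here as serialized JSON; the entry shape is an assumption.
os.environ["PROGRAM_DESCRIPTION"] = json.dumps([
    {
        "key": "ExampleProgram",  # hypothetical program name
        "description": "Short blurb for the program list",
        "parent_program": [],
    }
])

# CONSENT_ID_PATH points at a JSON file mapping collection IDs to consent IDs,
# which search_program() opens and reads; the path is hypothetical.
os.environ["CONSENT_ID_PATH"] = "/etc/dug/consent_ids.json"

config = Config.from_env()
print(config.program_description, config.consent_id_path)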
56 changes: 41 additions & 15 deletions src/dug/core/async_search.py
@@ -2,10 +2,10 @@
import logging
from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers import async_scan
import ssl,os,json

import ssl,json
from dug.config import Config


logger = logging.getLogger('dug')


@@ -499,7 +499,7 @@ async def search_program(self, program_name=None, offset=0, size=None):
"match": {"data_type": program_name}
})

print("query_body", query_body)
#print("query_body", query_body)

# Prepare the query body for execution
body = query_body
@@ -523,18 +523,37 @@
# Append the details to the list in the desired format
collection_details_list.append(collection_details)

return collection_details_list

with open(self._cfg.consent_id_path, 'r') as file:
consent_id_mappings = json.load(file)
# Add consent_id to the study
updated_studies = []
for study in collection_details_list:
collection_id = study["collection_id"]
if collection_id in consent_id_mappings:
consent_ids = consent_id_mappings[collection_id]
for consent_id in consent_ids:
updated_study = study.copy()
updated_study["collection_id"] = f"{collection_id}.{consent_id}"
updated_study["collection_action"] = f"{study['collection_action']}"
updated_studies.append(updated_study)
else:
updated_studies.append(study)

return updated_studies





async def search_program_list(self):

query_body = {
"size": 0, # We don't need the documents themselves, so set the size to 0
"aggs": {
"unique_program_names": {
"terms": {
"field": "data_type.keyword"
"field": "data_type.keyword",
"size": 10000
},
"aggs": {
"No_of_studies": {
@@ -554,15 +573,22 @@ async def search_program_list():
# The unique data_types and their counts of unique collection_ids will be in the 'aggregations' field of the response
unique_data_types = search_results['aggregations']['unique_program_names']['buckets']
data=unique_data_types
program_keys =self._cfg.program_sort_list.split(',')
#key_mapping = self._cfg.program_name_mappings
#key_mapping = json.loads(key_mapping)
key_index_map = {key: index for index, key in enumerate(program_keys)}
unique_data_types = sorted(data, key=lambda x: key_index_map.get(x['key'], len(program_keys)))
#for item in unique_data_types:
# if item['key'] in key_mapping:
# item['key'] = key_mapping[item['key']]
return unique_data_types
print(data)
# Sorting the data alphabetically based on 'key'
sorted_data = sorted(data, key=lambda x: x['key'])

#Add description as another field in exisiting data based on the program name
descriptions_json = self._cfg.program_description
descriptions = json.loads(descriptions_json)
description_dict = {item['key']: {'description': item['description'], 'parent_program': item['parent_program']} for item in descriptions}

# Add descriptions and parent programs to the sorted data
for item in sorted_data:
desc_info = description_dict.get(item['key'], {'description': '', 'parent_program': []})
item['description'] = desc_info['description']
item['parent_program'] = desc_info['parent_program']

return sorted_data


def _get_var_query(self, concept, fuzziness, prefix_length, query):
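A hedged sketch of the consent-ID expansion added to search_program() above: the file at consent_id_path is assumed to map each collection_id to a list of consent IDs, and every match fans the study record out into one entry per "<collection_id>.<consent_id>". File contents, path, and IDs here are illustrative:

import json

# Illustrative contents of the file referenced by consent_id_path:
# {"phs000001.v1.p1": ["c1", "c2"], "phs000002.v2.p1": ["HMB"]}
with open("/etc/dug/consent_ids.json", "r") as file:  # hypothetical path
    consent_id_mappings = json.load(file)

study = {"collection_id": "phs000001.v1.p1", "collection_action": "browse"}

expanded = [
    {**study, "collection_id": f"{study['collection_id']}.{consent_id}"}
    for consent_id in consent_id_mappings.get(study["collection_id"], [])
] or [study]
# Yields one study per consent ID (phs000001.v1.p1.c1, phs000001.v1.p1.c2),
# mirroring the loop in search_program(); unmapped studies pass through unchanged.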
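A short sketch of the program-list enrichment in search_program_list(): aggregation buckets are sorted alphabetically by key, then annotated with description and parent_program from the PROGRAM_DESCRIPTION payload, with empty defaults for programs that have no entry. Bucket values and program names below are made up for illustration:

import json

# Illustrative aggregation buckets and PROGRAM_DESCRIPTION payload.
buckets = [{"key": "ProgramB", "doc_count": 12}, {"key": "ProgramA", "doc_count": 7}]
descriptions = json.loads(
    '[{"key": "ProgramA", "description": "Example program", "parent_program": []}]'
)

description_dict = {
    d["key"]: {"description": d["description"], "parent_program": d["parent_program"]}
    for d in descriptions
}

sorted_buckets = sorted(buckets, key=lambda x: x["key"])
for item in sorted_buckets:
    info = description_dict.get(item["key"], {"description": "", "parent_program": []})
    item["description"] = info["description"]
    item["parent_program"] = info["parent_program"]
# ProgramA gets its description; ProgramB falls back to empty values.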
4 changes: 2 additions & 2 deletions tests/integration/conftest.py
@@ -129,13 +129,13 @@ def sapbert_annotator_api():
"name": "attack; cardiovascular",
"curie": "UBERON:0007100",
"category": "biolink:Disease",
"score": "0.85857231617",
"score": 0.85857231617
},
{
"name": "Angina attack",
"curie": "XAO:0000336",
"category": "biolink:Disease",
"score": "0.806502258778",
"score": 0.806502258778
},
]
),
