Commit

Merge branch 'develop' into bagel
YaphetKG committed Aug 7, 2024
2 parents 9bc078f + a4917a7 commit e6e6d41
Showing 5 changed files with 51 additions and 29 deletions.
9 changes: 2 additions & 7 deletions Dockerfile
@@ -3,17 +3,12 @@
# A container for the core semantic-search capability.
#
######################################################
FROM python:3.12.1-alpine3.19
FROM python:alpine3.20


# Install required packages
RUN apk update && \
apk add g++ make libexpat=2.6.2-r0 libssl3=3.1.4-r6 libcrypto3=3.1.4-r6


#upgrade openssl \

#RUN apk add openssl=3.1.4-r5
apk add g++ make


RUN pip install --upgrade pip
2 changes: 1 addition & 1 deletion requirements.txt
@@ -21,7 +21,7 @@ requests
redis
requests-cache
six

retrying
# Click for command line arguments
# We use Click 7.0 because that's what one of the pinned packages above use.
click
9 changes: 5 additions & 4 deletions src/dug/config.py
@@ -28,8 +28,9 @@ class Config:
nboost_port: int = 8000

program_sort_list: str = ""
program_name_mappings: dict=field(
default_factory=lambda:{})
program_description: dict=field(default_factory=lambda:{})
consent_id_path: str= ""


# Preprocessor config that will be passed to annotate.Preprocessor constructor
preprocessor: dict = field(
@@ -156,8 +157,8 @@ def from_env(cls):
"redis_host": "REDIS_HOST",
"redis_port": "REDIS_PORT",
"redis_password": "REDIS_PASSWORD",
"program_sort_list": "PROGRAM_SORT_LIST",
"program_name_mappings" : "PROGRAM_NAME_MAPPINGS"
"program_description": "PROGRAM_DESCRIPTION",
"consent_id_path": "CONSENT_ID_PATH"
}

kwargs = {}
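For context on the config change above: program_description and consent_id_path are new Config fields wired to the PROGRAM_DESCRIPTION and CONSENT_ID_PATH environment variables. A minimal sketch of how they might be supplied, assuming from_env() copies each listed environment variable straight into the matching field; the JSON shape and path below are illustrative, based on how async_search.py consumes these values:

import json
import os

from dug.config import Config

# PROGRAM_DESCRIPTION is json.loads()-ed in search_program_list(), so it is
# exported here as serialized JSON; the entry shape is an assumption.
os.environ["PROGRAM_DESCRIPTION"] = json.dumps([
    {
        "key": "ExampleProgram",  # hypothetical program name
        "description": "Short blurb for the program list",
        "parent_program": [],
    }
])

# CONSENT_ID_PATH points at a JSON file mapping collection IDs to consent IDs,
# which search_program() opens and reads; the path is hypothetical.
os.environ["CONSENT_ID_PATH"] = "/etc/dug/consent_ids.json"

config = Config.from_env()
print(config.program_description, config.consent_id_path)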
56 changes: 41 additions & 15 deletions src/dug/core/async_search.py
@@ -2,10 +2,10 @@
import logging
from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers import async_scan
import ssl,os,json

import ssl,json
from dug.config import Config


logger = logging.getLogger('dug')


@@ -499,7 +499,7 @@ async def search_program(self, program_name=None, offset=0, size=None):
"match": {"data_type": program_name}
})

print("query_body", query_body)
#print("query_body", query_body)

# Prepare the query body for execution
body = query_body
@@ -523,18 +523,37 @@
# Append the details to the list in the desired format
collection_details_list.append(collection_details)

return collection_details_list

with open(self._cfg.consent_id_path, 'r') as file:
consent_id_mappings = json.load(file)
# Add consent_id to the study
updated_studies = []
for study in collection_details_list:
collection_id = study["collection_id"]
if collection_id in consent_id_mappings:
consent_ids = consent_id_mappings[collection_id]
for consent_id in consent_ids:
updated_study = study.copy()
updated_study["collection_id"] = f"{collection_id}.{consent_id}"
updated_study["collection_action"] = f"{study['collection_action']}"
updated_studies.append(updated_study)
else:
updated_studies.append(study)

return updated_studies





async def search_program_list(self):

query_body = {
"size": 0, # We don't need the documents themselves, so set the size to 0
"aggs": {
"unique_program_names": {
"terms": {
"field": "data_type.keyword"
"field": "data_type.keyword",
"size": 10000
},
"aggs": {
"No_of_studies": {
@@ -554,15 +573,22 @@ async def search_program_list():
# The unique data_types and their counts of unique collection_ids will be in the 'aggregations' field of the response
unique_data_types = search_results['aggregations']['unique_program_names']['buckets']
data=unique_data_types
program_keys =self._cfg.program_sort_list.split(',')
#key_mapping = self._cfg.program_name_mappings
#key_mapping = json.loads(key_mapping)
key_index_map = {key: index for index, key in enumerate(program_keys)}
unique_data_types = sorted(data, key=lambda x: key_index_map.get(x['key'], len(program_keys)))
#for item in unique_data_types:
# if item['key'] in key_mapping:
# item['key'] = key_mapping[item['key']]
return unique_data_types
print(data)
# Sorting the data alphabetically based on 'key'
sorted_data = sorted(data, key=lambda x: x['key'])

#Add description as another field in exisiting data based on the program name
descriptions_json = self._cfg.program_description
descriptions = json.loads(descriptions_json)
description_dict = {item['key']: {'description': item['description'], 'parent_program': item['parent_program']} for item in descriptions}

# Add descriptions and parent programs to the sorted data
for item in sorted_data:
desc_info = description_dict.get(item['key'], {'description': '', 'parent_program': []})
item['description'] = desc_info['description']
item['parent_program'] = desc_info['parent_program']

return sorted_data


def _get_var_query(self, concept, fuzziness, prefix_length, query):
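A hedged sketch of the consent-ID expansion added to search_program() above: the file at consent_id_path is assumed to map each collection_id to a list of consent IDs, and every match fans the study record out into one entry per "<collection_id>.<consent_id>". File contents, path, and IDs here are illustrative:

import json

# Illustrative contents of the file referenced by consent_id_path:
# {"phs000001.v1.p1": ["c1", "c2"], "phs000002.v2.p1": ["HMB"]}
with open("/etc/dug/consent_ids.json", "r") as file:  # hypothetical path
    consent_id_mappings = json.load(file)

study = {"collection_id": "phs000001.v1.p1", "collection_action": "browse"}

expanded = [
    {**study, "collection_id": f"{study['collection_id']}.{consent_id}"}
    for consent_id in consent_id_mappings.get(study["collection_id"], [])
] or [study]
# Yields one study per consent ID (phs000001.v1.p1.c1, phs000001.v1.p1.c2),
# mirroring the loop in search_program(); unmapped studies pass through unchanged.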
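A short sketch of the program-list enrichment in search_program_list(): aggregation buckets are sorted alphabetically by key, then annotated with description and parent_program from the PROGRAM_DESCRIPTION payload, with empty defaults for programs that have no entry. Bucket values and program names below are made up for illustration:

import json

# Illustrative aggregation buckets and PROGRAM_DESCRIPTION payload.
buckets = [{"key": "ProgramB", "doc_count": 12}, {"key": "ProgramA", "doc_count": 7}]
descriptions = json.loads(
    '[{"key": "ProgramA", "description": "Example program", "parent_program": []}]'
)

description_dict = {
    d["key"]: {"description": d["description"], "parent_program": d["parent_program"]}
    for d in descriptions
}

sorted_buckets = sorted(buckets, key=lambda x: x["key"])
for item in sorted_buckets:
    info = description_dict.get(item["key"], {"description": "", "parent_program": []})
    item["description"] = info["description"]
    item["parent_program"] = info["parent_program"]
# ProgramA gets its description; ProgramB falls back to empty values.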
4 changes: 2 additions & 2 deletions tests/integration/conftest.py
@@ -129,13 +129,13 @@ def sapbert_annotator_api():
"name": "attack; cardiovascular",
"curie": "UBERON:0007100",
"category": "biolink:Disease",
"score": "0.85857231617",
"score": 0.85857231617
},
{
"name": "Angina attack",
"curie": "XAO:0000336",
"category": "biolink:Disease",
"score": "0.806502258778",
"score": 0.806502258778
},
]
),
