# CUI BATCH QUERY PIPELINE

###### INSTALL PACKAGE

In [15]:
# Install to get the latest version (uncomment ONE of the lines below, run once, then restart the kernel)
# !pip install git+https://github.com/kevon217/data-dictionary-cui-mapping.git
# !pip install data-dictionary-cui-mapping

### STEP-1A: RUN BATCH QUERY PIPELINE


###### IMPORT PACKAGES

In [17]:
from ddcuimap.umls import batch_query_pipeline as umls_bqp
from ddcuimap.metamap import batch_query_pipeline as mm_bqp
from ddcuimap.semantic_search import batch_hybrid_query_pipeline as ss_bqp
from ddcuimap.hydra_search import batch_hydra_query_pipeline as hs_bqp

from ddcuimap.utils import helper
from omegaconf import OmegaConf

###### LOAD/EDIT CONFIGURATION FILES

In [18]:
# Compose one configuration object per pipeline through the project's
# helper.compose_config, passing each a list of override strings.
# NOTE(review): `.fn` presumably calls the plain function behind a task/flow
# wrapper -- confirm against ddcuimap.utils.helper.
cfg_hydra = helper.compose_config.fn(overrides=["custom=hydra_base"])
# The UMLS configuration is left disabled here; the run cell further down
# passes cfg_umls=None to match. Uncomment both places to enable it.
# cfg_umls = helper.compose_config.fn(overrides=["custom=de", "apis=config_umls_api"])
cfg_mm = helper.compose_config.fn(overrides=["custom=de", "apis=config_metamap_api"])
cfg_ss = helper.compose_config.fn(
    overrides=[
        "custom=title_def",
        "semantic_search=embeddings",
        "apis=config_pinecone_api",
    ]
)

# Credential placeholders (all disabled). If you enable any of these, avoid
# committing literal keys to the notebook -- e.g. read them from environment
# variables instead of typing them between the quotes.

# # UMLS API CREDENTIALS
# cfg_umls.apis.umls.user_info.apiKey = ''
# cfg_umls.apis.umls.user_info.email = ''

# # MetaMap API CREDENTIALS
# cfg_mm.apis.metamap.user_info.apiKey = ''
# cfg_mm.apis.metamap.user_info.email = ''
#
# # Pinecone API CREDENTIALS
# cfg_ss.apis.pinecone.index_info.apiKey = ''
# cfg_ss.apis.pinecone.index_info.environment = ''

# Dump the composed hydra configuration as YAML for inspection. The dump
# includes the apis.*.user_info fields (email / apiKey), so review the cell
# output before sharing the notebook.
print(OmegaConf.to_yaml(cfg_hydra))

apis:
  umls:
    user_info:
      email: null
      apiKey: null
    api_settings:
      url: https://utslogin.nlm.nih.gov/cas/v1/api-key
      uri: https://uts-ws.nlm.nih.gov
      version: current
      content_endpoint: /rest/search/current
      fullpath: https://uts-ws.nlm.nih.gov/rest/search/current
      payload: {}
      headers: {}
      sabs: []
      searchType1: exact
      searchType2: normalizedWords
      pageSize: 20
      pages_max: 1
    query_params:
      apiKey: null
      string: {}
      searchType: exact
      sabs: []
      pageSize: 20
      pageNumber: {}
  metamap:
    user_info:
      email: null
      apiKey: null
    api_settings:
      serverurl: https://utslogin.nlm.nih.gov/cas/v1/tickets
      tgtserverurl: https://utslogin.nlm.nih.gov/cas/v1/api-key
      serviceurl: https://ii.nlm.nih.gov/cgi-bin/II/UTS_Required/API_batchValidationII.pl
      cmd: metamap
      cmdargs:
        mm_data_year: -Z 2020AB
        mm_data_version: -V USAbase
        stri

###### RUN BATCH QUERY PIPELINE


In [19]:
# Run the combined ("hydra") batch query using the MetaMap and semantic-search
# configurations composed above; no UMLS configuration is supplied
# (cfg_umls=None). The call returns the results frame and a configuration
# object (cfg_step1).
#
# The single-pipeline runners are kept below, disabled, for reference:
# df_umls, cfg_umls = umls_bqp.run_umls_batch(cfg_umls)
# df_mm, cfg_mm = mm_bqp.run_mm_batch(cfg_mm)
# df_ss, cfg_ss = ss_bqp.run_hybrid_ss_batch(cfg_ss)
df_hydra, cfg_step1 = hs_bqp.run_hydra_batch(
    cfg_hydra, cfg_umls=None, cfg_mm=cfg_mm, cfg_ss=cfg_ss
)

# End the cell with a bare expression so the notebook renders the preview with
# its rich (HTML table) display; print() wraps wide frames across several
# backslash-continued text chunks that are hard to read.
df_hydra.head()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Semantic Search Runner: 100%|██████████| 3/3 [00:01<00:00,  2.02it/s]
Aggregating Results: 100%|██████████| 3/3 [00:00<00:00, 225.69it/s]
Semantic Search Runner: 100%|██████████| 3/3 [00:01<00:00,  2.63it/s]
Aggregating Results: 100%|██████████| 3/3 [00:00<00:00, 238.88it/s]


  variable name         title  \
0        AgeYrs  Age in years   
1        AgeYrs  Age in years   
2        AgeYrs  Age in years   
3        AgeYrs  Age in years   
4        AgeYrs  Age in years   

                                          definition permissible values  \
0  Value for participant's subject age, calculate...                NaN   
1  Value for participant's subject age, calculate...                NaN   
2  Value for participant's subject age, calculate...                NaN   
3  Value for participant's subject age, calculate...                NaN   
4  Value for participant's subject age, calculate...                NaN   

  permissible value descriptions             preferred question text  \
0                            NaN  Subject's age (recorded in years):   
1                            NaN  Subject's age (recorded in years):   
2                            NaN  Subject's age (recorded in years):   
3                            NaN  Subject's age (recorded in y

### STEP-1B: \*MANUAL CURATION STEP IN EXCEL
\*See the curation example in ***notebooks/examples_files/DE_Step-1_curation_keepCol.xlsx***.

### STEP-2A: CREATE DATA DICTIONARY IMPORT FILE

###### IMPORT CURATION PACKAGE

In [16]:
from ddcuimap.curation import create_dictionary_import_file
from ddcuimap.curation import check_cuis
from ddcuimap.utils import helper

###### CREATE DATA DICTIONARY IMPORT FILE

In [None]:
# Load the Step 1 configuration from a file picked via helper.choose_file
# (the string is the prompt passed to the chooser), then build the data
# dictionary import file from it.
# NOTE(review): this rebinds cfg_step1, replacing any value still in the kernel
# from the batch-query cell -- presumably intentional, since the manual
# curation step happens in between; confirm.
cfg_step1 = helper.load_config.fn(helper.choose_file("Load config file from Step 1"))
df_dd = create_dictionary_import_file.create_dd_file(cfg_step1)

# Bare last expression -> rich notebook display of the preview instead of the
# wrapped plain text that print() produces.
df_dd.head()

### STEP-2B: CHECK CUIS IN DATA DICTIONARY IMPORT FILE

In [None]:
# Load the Step 2 configuration from a file picked via helper.choose_file
# (the string is the prompt passed to the chooser), then run the CUI check on it.
cfg_step2 = helper.load_config.fn(helper.choose_file("Load config file from Step 2"))
df_check = check_cuis.check_cuis(cfg_step2)

# Bare last expression -> rich notebook display of the preview instead of the
# wrapped plain text that print() produces.
df_check.head()