# Pre-test data 

## 1. Data import

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the NIAID query results from the CSV file in the working directory
# (comma is the default separator for read_csv, so it is not spelled out).
queries_df = pd.read_csv('./niaid_queries.csv')
queries_df

Unnamed: 0,combination,search term,result id
0,original query,influenza,PRJNA658522
1,original query,influenza,PRJNA658529
2,original query,influenza,PRJNA658552
3,original query,influenza,PRJNA658564
4,original query,influenza,PRJNA658562
...,...,...,...
3995,c15,hiv,VIVLI_02021d26-0e96-424c-8fec-77cbd204fd8d
3996,c15,hiv,VIVLI_7a8a7234-ada3-465b-a608-347b5e74a542
3997,c15,hiv,VIVLI_7ac8687d-79c8-4860-8141-83776849e2df
3998,c15,hiv,VIVLI_242c4b5a-19b3-4a51-b885-05358ed89db6


In [9]:
# Distinct search terms present in the loaded query results
queries_df.loc[:, 'search term'].unique()

array(['influenza', 'malaria therapeutics', 'long covid',
       'zika microcephaly', 'naegleria fowleri infection', 'asthma',
       'allergy treatment', 'allergen skin prick test',
       'sublingual immunotherapy', 'AIDS', 't-cell function',
       'immunotherapeutics', "addison's disease", 'cancer',
       'myocardial infarction', 'rational cancer drug design',
       'dendritic cells', 'mast cells', 'plasmacytoid dendritic cells',
       'pinealocyte', 'metabolomics', 'gwas', 'tuberculin skin test',
       'mycobacterium', 'hiv'], dtype=object)

## 2. Data wrangling

In [4]:
# Remove original query rows: keep only the rows whose 'combination' value is
# anything other than 'original query'.
# A boolean mask is used instead of dropping by index label, so the filter
# stays correct even if the index ever contains duplicate labels.
# .copy() (deep by default) detaches the result from queries_df, so later
# edits to combinations_df cannot touch the raw data.
combinations_df = queries_df.loc[queries_df['combination'] != 'original query'].copy()
combinations_df

Unnamed: 0,combination,search term,result id
10,c1,influenza,PRJNA658522
11,c1,influenza,PRJNA658529
12,c1,influenza,PRJNA658552
13,c1,influenza,PRJNA658564
14,c1,influenza,PRJNA658562
...,...,...,...
3995,c15,hiv,VIVLI_02021d26-0e96-424c-8fec-77cbd204fd8d
3996,c15,hiv,VIVLI_7a8a7234-ada3-465b-a608-347b5e74a542
3997,c15,hiv,VIVLI_7ac8687d-79c8-4860-8141-83776849e2df
3998,c15,hiv,VIVLI_242c4b5a-19b3-4a51-b885-05358ed89db6


In [9]:
# Keep only the rows whose search term is listed in search_terms_for_pre_test
search_terms_for_pre_test = [
    'influenza',
    'immunotherapeutics',
    'malaria therapeutics',
    'cancer',
]
combinations_df = combinations_df.loc[
    combinations_df['search term'].isin(search_terms_for_pre_test)
]
combinations_df

Unnamed: 0,combination,search term,result id
10,c1,influenza,PRJNA658522
11,c1,influenza,PRJNA658529
12,c1,influenza,PRJNA658552
13,c1,influenza,PRJNA658564
14,c1,influenza,PRJNA658562
...,...,...,...
2235,c15,cancer,IMMPORT_SDY1108
2236,c15,cancer,IMMPORT_SDY1093
2237,c15,cancer,GSE3281
2238,c15,cancer,GSE796


In [12]:
# Collect the distinct result ids of the remaining rows as a plain Python
# list, keeping the order in which each id first appears.
record_list = combinations_df['result id'].drop_duplicates().tolist()
record_list

['PRJNA658522',
 'PRJNA658529',
 'PRJNA658552',
 'PRJNA658564',
 'PRJNA658562',
 'PRJNA658555',
 'S-EPMC3290697',
 'S-EPMC2958547',
 'NCBI_SRA_ERP000316',
 'NCBI_SRA_ERP017907',
 'IMMPORT_SDY756',
 'IMMPORT_SDY1697',
 'IMMPORT_SDY618',
 'ZENODO_4837559',
 'GSE102797',
 'GSE13637',
 'IMMPORT_SDY1467',
 'IMMPORT_SDY1471',
 'IMMPORT_SDY1469',
 'IMMPORT_SDY1468',
 'IMMPORT_SDY17',
 'IMMPORT_SDY350',
 'ZENODO_4701589',
 'ZENODO_4699177',
 'DRYAD_doi.org:10.5061:dryad.cv37539',
 'Dataverse_10.7910_DVN_2PUGFZ',
 'S-EPMC7382078',
 'S-EPMC4759581',
 'S-EPMC6168423',
 'S-EPMC4852283',
 'GSE77499',
 'S-EPMC6682681',
 'S-EPMC3152530',
 'S-EPMC4890880',
 'S-EPMC7235171',
 'S-EPMC3574796',
 'GSE67184',
 'GSE67470',
 'S-EPMC4019905',
 'S-EPMC7784533',
 'S-EPMC6481018',
 'S-EPMC7533533',
 'S-EPMC7415431',
 'S-EPMC4703482',
 'S-EPMC6237481',
 'PXD017839',
 'S-EPMC6536974',
 'S-EPMC7103671',
 'S-EPMC7052369',
 'Mendeley_7yb9y7nc52',
 'GSE212173',
 'NCBI_SRA_SRP139077',
 'GSE3933',
 'GSE49052',
 'ZENODO_