# Pre-test data 

## 1. Data import

In [40]:
import pandas as pd
import numpy as np

In [41]:
# Read the NIAID query results from the CSV file in the working directory
# (comma is read_csv's default delimiter, so it is not passed explicitly)

queries_df = pd.read_csv('./niaid_queries.csv')
queries_df

Unnamed: 0,combination,search term,result id
0,original query,influenza,PRJNA658522
1,original query,influenza,PRJNA658529
2,original query,influenza,PRJNA658552
3,original query,influenza,PRJNA658564
4,original query,influenza,PRJNA658562
...,...,...,...
3995,c15,hiv,VIVLI_02021d26-0e96-424c-8fec-77cbd204fd8d
3996,c15,hiv,VIVLI_7a8a7234-ada3-465b-a608-347b5e74a542
3997,c15,hiv,VIVLI_7ac8687d-79c8-4860-8141-83776849e2df
3998,c15,hiv,VIVLI_242c4b5a-19b3-4a51-b885-05358ed89db6


In [42]:
# Distinct search terms present in the query results, in order of first appearance
queries_df.loc[:, 'search term'].unique()

array(['influenza', 'malaria therapeutics', 'long covid',
       'zika microcephaly', 'naegleria fowleri infection', 'asthma',
       'allergy treatment', 'allergen skin prick test',
       'sublingual immunotherapy', 'AIDS', 't-cell function',
       'immunotherapeutics', "addison's disease", 'cancer',
       'myocardial infarction', 'rational cancer drug design',
       'dendritic cells', 'mast cells', 'plasmacytoid dendritic cells',
       'pinealocyte', 'metabolomics', 'gwas', 'tuberculin skin test',
       'mycobacterium', 'hiv'], dtype=object)

## 2. Data wrangling

In [43]:
# Keep only the rows whose combination is not 'original query'.
# The result is an independent copy, so queries_df is left untouched.

all_combinations_df = queries_df[queries_df['combination'] != 'original query'].copy()
all_combinations_df

Unnamed: 0,combination,search term,result id
10,c1,influenza,PRJNA658522
11,c1,influenza,PRJNA658529
12,c1,influenza,PRJNA658552
13,c1,influenza,PRJNA658564
14,c1,influenza,PRJNA658562
...,...,...,...
3995,c15,hiv,VIVLI_02021d26-0e96-424c-8fec-77cbd204fd8d
3996,c15,hiv,VIVLI_7a8a7234-ada3-465b-a608-347b5e74a542
3997,c15,hiv,VIVLI_7ac8687d-79c8-4860-8141-83776849e2df
3998,c15,hiv,VIVLI_242c4b5a-19b3-4a51-b885-05358ed89db6


In [44]:
# Alternate test data and figures 

In [45]:
# For each search term, collect the array of distinct result ids
# (one row per search term after reset_index)

unique_results_ids = (
    all_combinations_df
    .groupby('search term')['result id']
    .unique()
    .reset_index()
)
unique_results_ids

Unnamed: 0,search term,result id
0,AIDS,"[S-EPMC7818909, S-EPMC6638923, S-EPMC6435605, ..."
1,addison's disease,"[S-BSST626, S-EPMC4821366, S-EPMC6026684, S-EP..."
2,allergen skin prick test,"[S-EPMC4880857, S-EPMC3574828, S-EPMC5555666, ..."
3,allergy treatment,"[IMMPORT_SDY1520, S-EPMC5317265, accessclinica..."
4,asthma,"[GSE43696, GSE130499, GSE67940, GSE63142, S-EP..."
5,cancer,"[Mendeley_7yb9y7nc52, GSE212173, NCBI_SRA_SRP1..."
6,dendritic cells,"[S-EPMC3751914, GSE45652, E-MTAB-11735, S-EPMC..."
7,gwas,"[ZENODO_6976231, EGAD00010001463, EGAD00010000..."
8,hiv,"[VIVLI_1749e227-99eb-4847-bfc2-10f90b30ae4a, V..."
9,immunotherapeutics,"[S-EPMC4019905, S-EPMC7784533, S-EPMC6481018, ..."


In [46]:
# Number of distinct result ids over all search terms and combinations
# (missing ids, if any, are not counted)

nunique_result_ids = all_combinations_df['result id'].nunique(dropna=True)
nunique_result_ids

489

In [47]:
# Mean number of distinct result ids per search term

average_result_ids = (
    all_combinations_df
    .groupby('search term')['result id']
    .nunique()
    .mean()
)
average_result_ids

19.64

In [31]:
# Pre-test data and figures 

In [48]:
# Keep only the rows for the search terms used in the pre-test

search_terms_for_pre_test = ['influenza', 'immunotherapeutics', 'malaria therapeutics']

# Explicit .copy(), matching how all_combinations_df is built, so that any
# later column assignment on pre_test_data_df does not raise
# SettingWithCopyWarning.
pre_test_data_df = (
    all_combinations_df
    .loc[all_combinations_df['search term'].isin(search_terms_for_pre_test)]
    .copy()
)
pre_test_data_df

Unnamed: 0,combination,search term,result id
10,c1,influenza,PRJNA658522
11,c1,influenza,PRJNA658529
12,c1,influenza,PRJNA658552
13,c1,influenza,PRJNA658564
14,c1,influenza,PRJNA658562
...,...,...,...
1915,c15,immunotherapeutics,S-EPMC4703482
1916,c15,immunotherapeutics,PXD017839
1917,c15,immunotherapeutics,S-EPMC6237481
1918,c15,immunotherapeutics,S-EPMC6536974


In [49]:
# For each pre-test search term, collect the array of distinct result ids
# (one row per search term after reset_index)

unique_results_ids_pre_tests = (
    pre_test_data_df
    .groupby('search term')['result id']
    .unique()
    .reset_index()
)
unique_results_ids_pre_tests

Unnamed: 0,search term,result id
0,immunotherapeutics,"[S-EPMC4019905, S-EPMC7784533, S-EPMC6481018, ..."
1,influenza,"[PRJNA658522, PRJNA658529, PRJNA658552, PRJNA6..."
2,malaria therapeutics,"[S-EPMC7382078, S-EPMC4759581, S-EPMC6168423, ..."


In [50]:
# Number of distinct result ids over the pre-test search terms
# (missing ids, if any, are not counted)

nunique_result_ids_pre_tests = pre_test_data_df['result id'].nunique(dropna=True)
nunique_result_ids_pre_tests

49

In [51]:
# Mean number of distinct result ids per pre-test search term

average_result_ids_pre_test = (
    pre_test_data_df
    .groupby('search term')['result id']
    .nunique()
    .mean()
)
average_result_ids_pre_test

16.333333333333332

In [52]:
# Generate the list of unique result ids returned for the pre-test search terms
# (ordered by first appearance in pre_test_data_df)

pre_test_result_ids = pre_test_data_df['result id'].unique().tolist()

# NOTE: despite its name, `search_term_list` holds result ids, not search terms.
# The name is kept as an alias of pre_test_result_ids so that any other cell
# referring to it keeps working.
search_term_list = pre_test_result_ids
search_term_list

['PRJNA658522',
 'PRJNA658529',
 'PRJNA658552',
 'PRJNA658564',
 'PRJNA658562',
 'PRJNA658555',
 'S-EPMC3290697',
 'S-EPMC2958547',
 'NCBI_SRA_ERP000316',
 'NCBI_SRA_ERP017907',
 'IMMPORT_SDY756',
 'IMMPORT_SDY1697',
 'IMMPORT_SDY618',
 'ZENODO_4837559',
 'GSE102797',
 'GSE13637',
 'IMMPORT_SDY1467',
 'IMMPORT_SDY1471',
 'IMMPORT_SDY1469',
 'IMMPORT_SDY1468',
 'IMMPORT_SDY17',
 'IMMPORT_SDY350',
 'ZENODO_4701589',
 'ZENODO_4699177',
 'DRYAD_doi.org:10.5061:dryad.cv37539',
 'Dataverse_10.7910_DVN_2PUGFZ',
 'S-EPMC7382078',
 'S-EPMC4759581',
 'S-EPMC6168423',
 'S-EPMC4852283',
 'GSE77499',
 'S-EPMC6682681',
 'S-EPMC3152530',
 'S-EPMC4890880',
 'S-EPMC7235171',
 'S-EPMC3574796',
 'GSE67184',
 'GSE67470',
 'S-EPMC4019905',
 'S-EPMC7784533',
 'S-EPMC6481018',
 'S-EPMC7533533',
 'S-EPMC7415431',
 'S-EPMC4703482',
 'S-EPMC6237481',
 'PXD017839',
 'S-EPMC6536974',
 'S-EPMC7103671',
 'S-EPMC7052369']