## This notebook formats the data for additional statistical analysis

The conditions for the big study follow a factorial design, so we should be able to apply the statistical analyses commonly used for factorial Design of Experiments (DoE) studies.

In Round 1, set 1 of the tests, the intention was to test 4 combinations (combi 1-4) and 25 queries. However, for some queries two combinations will return the same results, in which case the preference between the two cannot be tested. In those cases, a different combi was tested for that query.

In [1]:
import os
import pandas as pd

In [2]:
# Input files are looked up in a 'data' folder under the current working directory.
script_path = os.getcwd()
datapath = os.path.join(script_path, 'data')
datafile, scopesfile = (
    os.path.join(datapath, fname) for fname in ('data.tsv', 'scopes.tsv')
)

# Both inputs are tab-separated with the column names on the first row.
orig_data, datascope = (
    pd.read_csv(tsv_path, sep='\t', header=0) for tsv_path in (datafile, scopesfile)
)

In [3]:
## confirm that the data is as expected and clean up as needed
# Preview both inputs and report their row counts.
print(orig_data.head(n=2))
print(datascope.head(n=2))
print(len(datascope))
print(len(orig_data))

# Row counts per 'test type' label before cleaning; the 'M ' check looks for a
# label carrying a trailing space.
print(len(orig_data.loc[orig_data['test type']=='M']),len(orig_data.loc[orig_data['test type']=='L']))
print(len(orig_data.loc[orig_data['test type']=='M ']))

# Strip surrounding whitespace with the vectorized string accessor. Unlike calling
# x.strip() on every element, this keeps missing values as NaN instead of raising
# AttributeError on them.
orig_data['test type'] = orig_data['test type'].str.strip()

# Same counts after cleaning, so the effect of the strip is visible in the output.
print(len(orig_data.loc[orig_data['test type']=='M']),len(orig_data.loc[orig_data['test type']=='L']))
print(len(orig_data.loc[orig_data['test type']=='M ']))

# Distinct search terms listed in the scopes table, in order of first appearance.
search_terms = datascope['search term'].unique().tolist()
print(search_terms)

   round  test search term test type combo 1  ps 1  pct ps 1  time 1 combo 2  \
0      1     1   influenza         M      c1     2        40      34      c3   
1      1     1   influenza         L      c3     2        40      57      c4   

   ps 2  pct ps 2  time 2 combo 3  ps 3  pct ps 3  time 3 combo 4  ps 4  \
0     1        20   156.0      c4     1        20   105.0      c2   1.0   
1     2        40    34.0      c1     1        20    23.0      c2   0.0   

   pct ps 4  time 4  
0      20.0    71.0  
1       0.0     NaN  
  search term       scope              domain
0   influenza  very broad  Infectious Disease
1  long covid       broad  Infectious Disease
25
102
51 51
0
51 51
0
['influenza', 'long covid', 'malaria therapeutics', 'asthma', 'naegleria fowleri infection', 'zika microcephaly', 'allergen skin prick test', 'allergy treatment', 'sublingual immunotherapy', 'AIDS', 'immunotherapeutics', 't-cell function', "addison's disease", 'cancer', 'myocardial infarction', 'dendritic

In [4]:
## Reshape the results to long format: one row per (test row, ranked position)

# Attach the scope/domain columns to every result row; the left join keeps every
# row of orig_data. The printed length lets the reader compare against len(orig_data).
orig_scoped = orig_data.merge(datascope,on='search term',how='left')
print(len(orig_scoped))


def _extract_option(scoped, position,
                    id_columns=('round','test','search term','scope','domain','test type')):
    """Return the id columns plus the combo/ps/time columns for one position.

    Parameters
    ----------
    scoped : pandas.DataFrame
        Frame holding the id columns and the 'combo N', 'ps N', 'time N' columns.
    position : int
        The N used in the 'combo N', 'ps N' and 'time N' column names.
    id_columns : sequence of str
        Columns copied unchanged into the result.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the three position-specific columns are renamed to
        'selection', 'participants' and 'selection time'. The 'pct ps N' columns
        are not carried over.
    """
    source_columns = ['combo {}'.format(position), 'ps {}'.format(position), 'time {}'.format(position)]
    target_columns = ['selection', 'participants', 'selection time']
    subset = scoped[list(id_columns) + source_columns]
    # rename returns a new frame, so the merged frame is never modified.
    return subset.rename(columns=dict(zip(source_columns, target_columns)))


# One frame per position 1-4; the orig_N names are kept for any cell that still uses them.
orig_1, orig_2, orig_3, orig_4 = (_extract_option(orig_scoped, position) for position in range(1, 5))

# Stack the four frames and renumber the rows 0..n-1.
all_results = pd.concat((orig_1,orig_2,orig_3,orig_4),ignore_index=True)
print(all_results.head(n=2))

102
   round  test search term       scope              domain test type  \
0      1     1   influenza  very broad  Infectious Disease         M   
1      1     1   influenza  very broad  Infectious Disease         L   

  selection  participants  selection time  
0        c1           2.0            34.0  
1        c3           2.0            57.0  


In [6]:
# Partition the long-format results by their 'test type' label: 'M' rows go to
# most_only, 'L' rows go to least_only.
most_only, least_only = (
    all_results[all_results['test type'].eq(label)] for label in ('M', 'L')
)
print(len(most_only), len(least_only))

204 204


In [8]:
## export the results
# Write each subset as a tab-separated file with a header row into the data folder.
# NOTE(review): to_csv is called without index=..., so the row index is presumably
# written as an unnamed first column — confirm that downstream readers expect it.
for subset, filename in ((most_only, 'most_likely.tsv'), (least_only, 'least_likely.tsv')):
    subset.to_csv(os.path.join(datapath, filename), sep='\t', header=True)