# Basic analysis - Lyssna data

## 1. Data import

In [1]:
import pandas as pd
import numpy as np

pd.options.display.max_columns = 22
pd.options.display.max_rows = 150

In [2]:
# Load search term scopes

scopes_df = pd.read_csv('./search_term_scopes.csv', sep=',')
scopes_df

Unnamed: 0,search term,scope
0,influenza,very broad
1,long covid,broad
2,malaria therapeutics,very specific
3,asthma,very broad
4,naegleria fowleri infection,very specific
5,zika microcephaly,somewhat specific
6,allergen skin prick test,somewhat specific
7,allergy treatment,broad
8,sublingual immunotherapy,very specific
9,AIDS,very broad


In [3]:
# Load Lyssna data

df = pd.read_csv('./all_rounds_data.csv', sep=',')
df

Unnamed: 0,round,test,search term,test type,combo 1,ps 1,pct ps 1,time 1,combo 2,ps 2,pct ps 2,time 2,combo 3,ps 3,pct ps 3,time 3,combo 4,ps 4,pct ps 4,time 4
0,1,1,influenza,M,c1,2,40,34,c3,1,20,156.0,c4,1,20,105.0,c2,1.0,20.0,71.0
1,1,1,influenza,L,c3,2,40,57,c4,2,40,34.0,c1,1,20,23.0,c2,0.0,0.0,
2,1,1,long covid,M,c3,2,40,107,c4,2,40,166.0,c2,1,20,18.0,c1,0.0,0.0,
3,1,1,long covid,L,c2,2,40,9,c3,1,20,77.0,c4,1,20,95.0,c1,1.0,20.0,16.0
4,1,1,malaria therapeutics,M,c2,2,40,31,c4,1,20,27.0,c1,1,20,50.0,c3,1.0,20.0,76.0
5,1,1,malaria therapeutics,L,c1,2,40,27,c4,1,20,10.0,c2,1,20,4.0,c3,1.0,20.0,31.0
6,1,2,asthma,M,c4,3,60,56,c5,1,20,52.0,c1,1,20,24.0,c2,0.0,0.0,
7,1,2,asthma,L,c5,3,60,19,c1,2,40,9.0,c4,0,0,,c2,0.0,0.0,
8,1,2,naegleria fowleri infection,M,c11,3,60,23,c1,2,40,72.0,c10,0,0,,c14,0.0,0.0,
9,1,2,naegleria fowleri infection,L,c11,4,80,14,c10,1,20,98.0,c14,0,0,,c1,0.0,0.0,


In [4]:
# Columns

# round: Lyssna round number
# test: test number
# search term
# test type: M -> participants decide which set of results is the most relevant one 
#            L -> participants decide which set of results is the least relevant one
# combo x: combination presented to participants
# ps x: number of participants that selected the combination under combo x as the most/least relevant
# pct ps x: percentage of participants that selected the combination under combo x as the most/least relevant
# time x: average time spent by participants to select the combination under combo x as the most/least relevant

In [5]:
# Unique search terms

df['search term'].unique()

array(['influenza', 'long covid', 'malaria therapeutics', 'asthma',
       'naegleria fowleri infection', 'zika microcephaly',
       'allergen skin prick test', 'allergy treatment',
       'sublingual immunotherapy', 'AIDS', 'immunotherapeutics',
       't-cell function', "addison's disease", 'cancer',
       'myocardial infarction', 'dendritic cells', 'mast cells',
       'rational cancer drug design', 'metabolomics', 'pinealocyte',
       'plasmacytoid dendritic cells', 'gwas', 'mycobacterium',
       'tuberculin skin test', 'hiv'], dtype=object)

## 2. Data wrangling

In [6]:
# Add search result scope info to Lysnna dataframe and reorder columns

df = df.merge(scopes_df[['search term', 'scope']], how = 'left', on = 'search term')
df = df.reindex(columns = ['round', 'test', 'search term', 'scope', 'test type', 'combo 1', 'ps 1',
                           'pct ps 1', 'time 1', 'combo 2', 'ps 2', 'pct ps 2', 'time 2',
                           'combo 3', 'ps 3', 'pct ps 3', 'time 3', 'combo 4', 'ps 4', 'pct ps 4', 'time 4'])
df     

Unnamed: 0,round,test,search term,scope,test type,combo 1,ps 1,pct ps 1,time 1,combo 2,ps 2,pct ps 2,time 2,combo 3,ps 3,pct ps 3,time 3,combo 4,ps 4,pct ps 4,time 4
0,1,1,influenza,very broad,M,c1,2,40,34,c3,1,20,156.0,c4,1,20,105.0,c2,1.0,20.0,71.0
1,1,1,influenza,very broad,L,c3,2,40,57,c4,2,40,34.0,c1,1,20,23.0,c2,0.0,0.0,
2,1,1,long covid,broad,M,c3,2,40,107,c4,2,40,166.0,c2,1,20,18.0,c1,0.0,0.0,
3,1,1,long covid,broad,L,c2,2,40,9,c3,1,20,77.0,c4,1,20,95.0,c1,1.0,20.0,16.0
4,1,1,malaria therapeutics,very specific,M,c2,2,40,31,c4,1,20,27.0,c1,1,20,50.0,c3,1.0,20.0,76.0
5,1,1,malaria therapeutics,very specific,L,c1,2,40,27,c4,1,20,10.0,c2,1,20,4.0,c3,1.0,20.0,31.0
6,1,2,asthma,very broad,M,c4,3,60,56,c5,1,20,52.0,c1,1,20,24.0,c2,0.0,0.0,
7,1,2,asthma,very broad,L,c5,3,60,19,c1,2,40,9.0,c4,0,0,,c2,0.0,0.0,
8,1,2,naegleria fowleri infection,very specific,M,c11,3,60,23,c1,2,40,72.0,c10,0,0,,c14,0.0,0.0,
9,1,2,naegleria fowleri infection,very specific,L,c11,4,80,14,c10,1,20,98.0,c14,0,0,,c1,0.0,0.0,


In [7]:
# Create dataframes for the least and most relevant preference tests

In [8]:
least_df = df[df['test type'] == 'L'].copy()
least_df

Unnamed: 0,round,test,search term,scope,test type,combo 1,ps 1,pct ps 1,time 1,combo 2,ps 2,pct ps 2,time 2,combo 3,ps 3,pct ps 3,time 3,combo 4,ps 4,pct ps 4,time 4
1,1,1,influenza,very broad,L,c3,2,40,57,c4,2,40,34.0,c1,1,20,23.0,c2,0.0,0.0,
3,1,1,long covid,broad,L,c2,2,40,9,c3,1,20,77.0,c4,1,20,95.0,c1,1.0,20.0,16.0
5,1,1,malaria therapeutics,very specific,L,c1,2,40,27,c4,1,20,10.0,c2,1,20,4.0,c3,1.0,20.0,31.0
7,1,2,asthma,very broad,L,c5,3,60,19,c1,2,40,9.0,c4,0,0,,c2,0.0,0.0,
9,1,2,naegleria fowleri infection,very specific,L,c11,4,80,14,c10,1,20,98.0,c14,0,0,,c1,0.0,0.0,
11,1,2,zika microcephaly,somewhat specific,L,c4,4,80,8,c2,1,20,28.0,c1,0,0,,c3,0.0,0.0,
13,1,3,allergen skin prick test,somewhat specific,L,c1,2,40,37,c4,1,20,12.0,c2,1,20,49.0,c3,1.0,20.0,22.0
15,1,3,allergy treatment,broad,L,c2,3,60,25,c3,1,20,23.0,c1,1,20,4.0,c4,0.0,0.0,
17,1,3,sublingual immunotherapy,very specific,L,c4,4,80,17,c3,1,20,23.0,c1,0,0,,c2,0.0,0.0,
19,1,4,AIDS,very broad,L,c4,4,80,59,c1,1,20,6.0,c3,0,0,,c2,0.0,0.0,


In [9]:
most_df = df[df['test type'] != 'L'].copy()
most_df

Unnamed: 0,round,test,search term,scope,test type,combo 1,ps 1,pct ps 1,time 1,combo 2,ps 2,pct ps 2,time 2,combo 3,ps 3,pct ps 3,time 3,combo 4,ps 4,pct ps 4,time 4
0,1,1,influenza,very broad,M,c1,2,40,34,c3,1,20,156.0,c4,1,20,105.0,c2,1.0,20.0,71.0
2,1,1,long covid,broad,M,c3,2,40,107,c4,2,40,166.0,c2,1,20,18.0,c1,0.0,0.0,
4,1,1,malaria therapeutics,very specific,M,c2,2,40,31,c4,1,20,27.0,c1,1,20,50.0,c3,1.0,20.0,76.0
6,1,2,asthma,very broad,M,c4,3,60,56,c5,1,20,52.0,c1,1,20,24.0,c2,0.0,0.0,
8,1,2,naegleria fowleri infection,very specific,M,c11,3,60,23,c1,2,40,72.0,c10,0,0,,c14,0.0,0.0,
10,1,2,zika microcephaly,somewhat specific,M,c1,3,60,40,c2,1,20,36.0,c3,1,20,35.0,c4,0.0,0.0,
12,1,3,allergen skin prick test,somewhat specific,M,c3,2,40,121,c4,1,20,39.0,c1,1,20,52.0,c2,1.0,20.0,71.0
14,1,3,allergy treatment,broad,M,c1,2,40,51,c3,1,20,52.0,c4,1,20,25.0,c2,1.0,20.0,31.0
16,1,3,sublingual immunotherapy,very specific,M,c1,4,80,40,c4,1,20,26.0,c2,0,0,,c3,0.0,0.0,
18,1,4,AIDS,very broad,M,c4,2,40,88,c1,2,40,127.0,c2,1,20,14.0,c3,0.0,0.0,


## 3. Combinations deemed most or least relevant

In [10]:
# Find the number of participants that considered each combination as the least relevant one

In [11]:
# Initialize a dictionary to store combo quantities
least_combos = {}

# Iterate over rows in the DataFrame
for index, row in least_df.iterrows():
    # Iterate over shop columns and quantity columns
    for i in range(1, 5):
        combo = row[f'combo {i}']
        ps = row[f'ps {i}']
        # Update dictionary
        least_combos[combo] = least_combos.get(combo, 0) + ps

# Participants that selected each combination as the least relevant one
print("Least relevant preference tests: combinations and number of participants who selected them")
print("Combination: participants")
for combo, ps in least_combos.items():
    print(f"{combo}: {ps}")

Least relevant preference tests: combinations and number of participants who selected them
Combination: participants
c3: 24.0
c4: 42.0
c1: 25.0
c2: 27.0
c5: 43.0
c11: 6
c10: 15
c14: 0
c15: 3
nan: nan
c9: 10.0
c7: 21.0
c8: 18.0
c6: 20.0
c13: 1


In [12]:
# Find the number of participants that considered each combination as the most relevant one

In [13]:
# Initialize a dictionary to store combo quantities
most_combos = {}

# Iterate over rows in the DataFrame
for index, row in most_df.iterrows():
    # Iterate over shop columns and quantity columns
    for i in range(1, 5):
        combo = row[f'combo {i}']
        ps = row[f'ps {i}']
        # Update dictionary
        most_combos[combo] = most_combos.get(combo, 0) + ps

# Participants that selected each combination as the most relevant one
print("Most relevant preference tests: combinations and number of participants who selected them")
print("Combination: participants")
for combo, ps in most_combos.items():
    print(f"{combo}: {ps}")

Most relevant preference tests: combinations and number of participants who selected them
Combination: participants
c1: 53.0
c3: 20.0
c4: 28.0
c2: 23.0
c5: 25.0
c11: 3.0
c10: 4.0
c14: 0.0
c15: 1
nan: nan
c8: 39.0
c7: 22.0
c9: 14.0
c6: 22.0
c13: 1


## 4. Combinations that were not deemed most or least relevant

In [14]:
# Combinations that were not considered to be least relevant

In [15]:
# Find rows where any of the ps are '0'
rows_with_zero_ps_least = least_df[(least_df['ps 1'] == 0) | 
                                   (least_df['ps 2'] == 0) | (least_df['ps 3'] == 0) | (least_df['ps 4'] == 0)]
rows_with_zero_ps_least

Unnamed: 0,round,test,search term,scope,test type,combo 1,ps 1,pct ps 1,time 1,combo 2,ps 2,pct ps 2,time 2,combo 3,ps 3,pct ps 3,time 3,combo 4,ps 4,pct ps 4,time 4
1,1,1,influenza,very broad,L,c3,2,40,57,c4,2,40,34.0,c1,1,20,23.0,c2,0.0,0.0,
7,1,2,asthma,very broad,L,c5,3,60,19,c1,2,40,9.0,c4,0,0,,c2,0.0,0.0,
9,1,2,naegleria fowleri infection,very specific,L,c11,4,80,14,c10,1,20,98.0,c14,0,0,,c1,0.0,0.0,
11,1,2,zika microcephaly,somewhat specific,L,c4,4,80,8,c2,1,20,28.0,c1,0,0,,c3,0.0,0.0,
15,1,3,allergy treatment,broad,L,c2,3,60,25,c3,1,20,23.0,c1,1,20,4.0,c4,0.0,0.0,
17,1,3,sublingual immunotherapy,very specific,L,c4,4,80,17,c3,1,20,23.0,c1,0,0,,c2,0.0,0.0,
19,1,4,AIDS,very broad,L,c4,4,80,59,c1,1,20,6.0,c3,0,0,,c2,0.0,0.0,
21,1,4,immunotherapeutics,somewhat specific,L,c15,3,60,11,c1,2,40,16.0,c2,0,0,,,,,
23,1,4,t-cell function,broad,L,c3,3,60,12,c2,1,20,10.0,c4,1,20,130.0,c1,0.0,0.0,
27,1,5,cancer,very broad,L,c3,2,40,16,c4,2,40,14.0,c2,1,20,29.0,c1,0.0,0.0,


In [16]:
print("Least relevant preference tests\n")
# Iterate through each search term
for term in rows_with_zero_ps_least['search term'].unique():
    print(f"search term: {term}")
    zero_ps_combos = []
    for i in range(1, 5):  # Iterate through ps columns
        ps_col = f'ps {i}'
        combo_col = f'combo {i}'
        # Filter rows where ps is 0 and add corresponding combo to the list
        zero_ps_combos.extend(rows_with_zero_ps_least[(rows_with_zero_ps_least['search term'] == term) & 
                                                      (rows_with_zero_ps_least[ps_col] == 0)][combo_col])
    # Display unique combos with ps equal to 0 for the search term
    print(f"combos with ps equal to zero: {set(zero_ps_combos)}")
    print()

Least relevant preference tests

search term: influenza
combos with ps equal to zero: {'c8', 'c2', 'c5', 'c1', 'c3', 'c7'}

search term: asthma
combos with ps equal to zero: {'c2', 'c4'}

search term: naegleria fowleri infection
combos with ps equal to zero: {'c14', 'c1'}

search term: zika microcephaly
combos with ps equal to zero: {'c8', 'c9', 'c3', 'c1'}

search term: allergy treatment
combos with ps equal to zero: {'c7', 'c4'}

search term: sublingual immunotherapy
combos with ps equal to zero: {'c7', 'c2', 'c1'}

search term: AIDS
combos with ps equal to zero: {'c3', 'c2'}

search term: immunotherapeutics
combos with ps equal to zero: {'c2'}

search term: t-cell function
combos with ps equal to zero: {'c5', 'c1'}

search term: cancer
combos with ps equal to zero: {'c1'}

search term: myocardial infarction
combos with ps equal to zero: {'c8', 'c6', 'c1'}

search term: rational cancer drug design
combos with ps equal to zero: {'c9', 'c3', 'c1'}

search term: metabolomics
combos with

In [17]:
# Combinations that were not considered to be most relevant

In [18]:
# Find rows where any of the ps are '0'
rows_with_zero_ps_most = most_df[(most_df['ps 1'] == 0) | 
                                   (most_df['ps 2'] == 0) | (most_df['ps 3'] == 0) | (most_df['ps 4'] == 0)]
rows_with_zero_ps_most

Unnamed: 0,round,test,search term,scope,test type,combo 1,ps 1,pct ps 1,time 1,combo 2,ps 2,pct ps 2,time 2,combo 3,ps 3,pct ps 3,time 3,combo 4,ps 4,pct ps 4,time 4
2,1,1,long covid,broad,M,c3,2,40,107,c4,2,40,166.0,c2,1,20,18.0,c1,0.0,0.0,
6,1,2,asthma,very broad,M,c4,3,60,56,c5,1,20,52.0,c1,1,20,24.0,c2,0.0,0.0,
8,1,2,naegleria fowleri infection,very specific,M,c11,3,60,23,c1,2,40,72.0,c10,0,0,,c14,0.0,0.0,
10,1,2,zika microcephaly,somewhat specific,M,c1,3,60,40,c2,1,20,36.0,c3,1,20,35.0,c4,0.0,0.0,
16,1,3,sublingual immunotherapy,very specific,M,c1,4,80,40,c4,1,20,26.0,c2,0,0,,c3,0.0,0.0,
18,1,4,AIDS,very broad,M,c4,2,40,88,c1,2,40,127.0,c2,1,20,14.0,c3,0.0,0.0,
22,1,4,t-cell function,broad,M,c3,2,40,117,c4,2,40,45.0,c1,1,20,16.0,c2,0.0,0.0,
24,1,5,addison's disease,very specific,M,c2,4,80,118,c1,1,20,45.0,c3,0,0,,c4,0.0,0.0,
26,1,5,cancer,very broad,M,c1,3,60,29,c4,1,20,24.0,c2,1,20,46.0,c3,0.0,0.0,
28,1,5,myocardial infarction,broad,M,c4,2,40,69,c1,2,40,40.0,c3,1,20,32.0,c2,0.0,0.0,


In [19]:
print("Most relevant preference tests\n")

# Iterate through each search term
for term in rows_with_zero_ps_most['search term'].unique():
    print(f"search term: {term}")
    zero_ps_combos = []
    for i in range(1, 5):  # Iterate through ps columns
        ps_col = f'ps {i}'
        combo_col = f'combo {i}'
        # Filter rows where ps is 0 and add corresponding combo to the list
        zero_ps_combos.extend(rows_with_zero_ps_most[(rows_with_zero_ps_most['search term'] == term) & 
                                                     (rows_with_zero_ps_most[ps_col] == 0)][combo_col])
    # Display unique combos with ps equal to 0 for the search term
    print(f"combos with ps equal to zero: {set(zero_ps_combos)}")
    print()

Most relevant preference tests

search term: long covid
combos with ps equal to zero: {'c3', 'c5', 'c1'}

search term: asthma
combos with ps equal to zero: {'c7', 'c2'}

search term: naegleria fowleri infection
combos with ps equal to zero: {'c14', 'c10'}

search term: zika microcephaly
combos with ps equal to zero: {'c5', 'c4'}

search term: sublingual immunotherapy
combos with ps equal to zero: {'c8', 'c3', 'c7', 'c2'}

search term: AIDS
combos with ps equal to zero: {'c3', 'c5'}

search term: t-cell function
combos with ps equal to zero: {'c9', 'c5', 'c2', 'c10'}

search term: addison's disease
combos with ps equal to zero: {'c3', 'c10', 'c5', 'c4'}

search term: cancer
combos with ps equal to zero: {'c8', 'c3', 'c5'}

search term: myocardial infarction
combos with ps equal to zero: {'c7', 'c2'}

search term: dendritic cells
combos with ps equal to zero: {'c8', 'c7', 'c2'}

search term: mast cells
combos with ps equal to zero: {'c8', 'c3', 'c7', 'c4'}

search term: rational cancer d