In [None]:
import os
import pandas as pd
import scipy.stats as stats
import numpy as np
from itertools import combinations
from datetime import datetime

In [None]:
# Input and output folders are resolved relative to the current working directory.
script_path = os.getcwd()
datapath, resultpath = (os.path.join(script_path, folder) for folder in ('data', 'results'))
# Both inputs are tab-separated files whose first row holds the column names.
most_likely, least_likely = (
    pd.read_csv(os.path.join(datapath, fname), sep='\t', header=0)
    for fname in ('most_likely.tsv', 'least_likely.tsv')
)

In [None]:
## Lookup tables passed to the analysis functions
# result set name -> DataFrame loaded from the matching TSV file
resultsdict = dict(most_likely=most_likely, least_likely=least_likely)
# aggregation name -> columns to group by ('selection' is always the last one)
grouptype = {'selection_only': ['selection']}
grouptype.update({col: [col, 'selection'] for col in ('scope', 'domain', 'round')})

In [None]:
def get_basic_stats(resultsdict,grouptype,resultopt,aggname,export='y'):
    """Summarise the 'participants' counts of one result set under one grouping.

    Parameters
    ----------
    resultsdict : dict
        Maps a result set name to its DataFrame. The frame must contain a
        'participants' column plus the grouping columns.
    grouptype : dict
        Maps an aggregation name to the list of columns to group by.
    resultopt : str
        Key into ``resultsdict``.
    aggname : str
        Key into ``grouptype``.
    export : str, default 'y'
        'y' writes the table to a dated TSV under the notebook-level
        ``resultpath`` and returns None; any other value returns the table
        without writing anything.

    Returns
    -------
    pandas.DataFrame or None
        One row per group with 'total votes', 'selection mean' (the group's
        share of all votes), 'per test mean', 'per test stdev' and
        'per test stderr'.
    """
    group_cols = grouptype[aggname]
    frame = resultsdict[resultopt]
    per_group = frame.groupby(group_cols)['participants']

    summary = per_group.sum().reset_index(name='total votes')
    grand_total = summary['total votes'].sum()
    summary['selection mean'] = summary['total votes'] / grand_total
    # .tolist() discards the group index, so these columns are filled by position;
    # every aggregate comes from the same groupby and therefore has the same row order.
    summary['per test mean'] = per_group.mean().tolist()
    summary['per test stdev'] = per_group.std().tolist()
    summary['per test stderr'] = per_group.sem().tolist()

    if export != 'y':
        return summary
    date_label = datetime.now().strftime("%Y-%m-%d")
    out_name = date_label + '_' + resultopt + '_' + aggname + '_basic_stats.tsv'
    summary.to_csv(os.path.join(resultpath, out_name), sep='\t', header=True)

In [None]:
def run_all_basic_stats(resultsdict,grouptype,export='y'):
    """Call ``get_basic_stats`` for every result set / grouping combination.

    Result sets are visited in ``resultsdict`` key order and, within each,
    groupings in ``grouptype`` key order. ``export`` is passed through
    unchanged; return values of ``get_basic_stats`` are discarded, so this
    function always returns None.
    """
    pairs = [(resultopt, aggname) for resultopt in resultsdict for aggname in grouptype]
    for resultopt, aggname in pairs:
        get_basic_stats(resultsdict,grouptype,resultopt,aggname,export)

In [None]:
def compare_results(results2test, aggopt, groupings):
    """Run an independent t-test for every pair of selections in a frame.

    Parameters
    ----------
    results2test : pandas.DataFrame
        Must contain 'selection' and 'participants' columns. Rows whose
        selection is missing are ignored.
    aggopt : str
        Copied into each result row as "aggregation approach".
    groupings : object
        Copied into each result row as "groupings".

    Returns
    -------
    tuple
        ``(run_date, rows)`` where ``run_date`` is today's date as
        YYYY-MM-DD and ``rows`` is a list with one dict per pair of
        selections (means, standard deviations, winner, t statistic, p-value).
    """
    run_date = datetime.now().strftime("%Y-%m-%d")
    labels = results2test['selection'].dropna().unique().tolist()
    votes = results2test['participants']
    # participant counts for each selection, as numpy arrays
    samples = {
        label: np.array(votes.loc[results2test['selection'] == label].tolist())
        for label in labels
    }
    rows = []
    for first, second in combinations(labels, 2):
        left, right = samples[first], samples[second]
        # the second label is reported as winner unless the first mean is strictly larger
        winner = first if left.mean() > right.mean() else second
        t_stat, p_value = stats.ttest_ind(left, right)
        rows.append({
            "function 1": first, "function 2": second,
            "mean 1": left.mean(), "mean 2": right.mean(),
            # ndarray.std() is called with its defaults here
            "stdev 1": left.std(), "stdev 2": right.std(), "winner": winner,
            "t-test": t_stat, "p-val": p_value, "groupings": groupings,
            "aggregation approach": aggopt, "run-date": run_date,
        })
    return run_date, rows

def run_t_tests(resultsdict,grouptype,resultopt,aggopt,export='y'):
    """Run pairwise selection t-tests for one result set, optionally per grouping value.

    Parameters
    ----------
    resultsdict : dict
        Maps a result set name to its DataFrame.
    grouptype : dict
        Accepted for signature compatibility with the other functions in this
        notebook; it is not needed to run the tests.
    resultopt : str
        Key into ``resultsdict``.
    aggopt : str
        'selection_only' compares selections across the whole frame. Any other
        value is treated as a column name: the frame is subset by each distinct
        value of that column and selections are compared within each subset.
    export : str, default 'y'
        'y' writes the results to a dated TSV under the notebook-level
        ``resultpath`` and returns None; any other value returns the results.

    Returns
    -------
    pandas.DataFrame or None
        One row per pair of selections (per grouping value, if any).
    """
    results2test = resultsdict[resultopt]
    # Stamp the file name here: previously the stamp only came back from
    # compare_results, so an export with no grouping values hit an unbound name.
    sttstamp = datetime.now().strftime("%Y-%m-%d")
    if aggopt == 'selection_only':
        _, tmpresult = compare_results(results2test, aggopt, "None")
    else:
        tmpresult = []
        # A NaN grouping value never matches the == filter below, so it could
        # only ever produce an empty subset; skip it up front.
        for eachopt in results2test[aggopt].dropna().unique().tolist():
            tmpgroup = results2test.loc[results2test[aggopt]==eachopt]
            _, partialresult = compare_results(tmpgroup, aggopt, eachopt)
            tmpresult.extend(partialresult)
    statisticsdf = pd.DataFrame(tmpresult)
    print(statisticsdf.head(n=2))
    if export == 'y':
        statisticsdf.to_csv(os.path.join(resultpath,sttstamp+'_'+aggopt+'_'+resultopt+'.tsv'),sep='\t',header=True)
    else:
        return statisticsdf

### Running the statistics

In [None]:
#### Basic statistics (means, stdev, stderr, etc.)
## Caution: this writes one TSV per result set / grouping combination, so skip
## this cell if those files are not needed.
run_all_basic_stats(resultsdict=resultsdict, grouptype=grouptype, export='y')

In [None]:
#### Run the T-tests
## 'selection_only' compares selections with no grouping; 'scope', 'domain' and
## 'round' subset the data by that column first and compare selections within
## each subset. Every grouping is run for the most likely results, then for the
## least likely ones, and each run is exported to its own TSV.
for _grouping in ('selection_only', 'scope', 'domain', 'round'):
    for _likelihood in ('most_likely', 'least_likely'):
        run_t_tests(resultsdict, grouptype, _likelihood, _grouping, export='y')


# Test components of the script

These cells exercise individual pieces of the functions defined above. Ignore everything below here if the functions run fine. If they don't, use these cells to troubleshoot the individual steps.

In [None]:
# Preview the date stamp that the export functions put at the front of file names.
tstamp = datetime.now()
date_label = tstamp.strftime("%Y-%m-%d")
print(date_label)

In [None]:
#### Parts of the basic stats module
## Get the Totals, Mean, Standard Deviation, Standard Error based on groupings by user selection
resultopt = 'most_likely'
aggopt = 'selection_only'
aggname = grouptype[aggopt]
results2test = resultsdict[resultopt]
tmpselect = results2test.groupby(aggname)['participants'].sum().reset_index(name='total votes')
## The grand total comes from the table built on the previous line
## (this used to read an undefined name and raised a NameError).
totals = tmpselect['total votes'].sum()
print(totals)
tmpselect['selection mean'] = tmpselect['total votes']/totals
## .tolist() drops the group index so the values are assigned by position
tmpselect['per test mean'] = results2test.groupby(aggname)['participants'].mean().tolist()
tmpselect['per test stdev'] = results2test.groupby(aggname)['participants'].std().tolist()
tmpselect['per test stderr'] = results2test.groupby(aggname)['participants'].sem().tolist()
print(tmpselect.head(n=2))

In [None]:
## Perform single t-tests between the selections c1-c4
resultopt = 'most_likely'
aggopt = 'selection_only'
aggname = grouptype[aggopt]
results2test = resultsdict[resultopt]

## participant counts for each selection, as numpy arrays
c1, c2, c3, c4 = (
    np.array(results2test['participants'].loc[results2test['selection'] == label].tolist())
    for label in ('c1', 'c2', 'c3', 'c4')
)

## independent t-test for every pair, printed in the same order as before
for _label, _left, _right in (
    ("c1 vs c2: ", c1, c2),
    ("c2 vs c3: ", c2, c3),
    ("c1 vs c3: ", c1, c3),
    ("c1 vs c4: ", c1, c4),
    ("c2 vs c4: ", c2, c4),
    ("c3 vs c4: ", c3, c4),
):
    print(_label, stats.ttest_ind(_left, _right))

In [None]:
## Unpack one comparison to see the sample sizes next to the test result
## (c3 and c4 come from the single t-test cell).
c3_vs_c4 = stats.ttest_ind(c3, c4)
statistic, pvalue = c3_vs_c4
print(len(c3), len(c4), statistic, pvalue)
## mean and standard deviation of the c3 sample, as numpy computes them by default
print(c3.mean(), c3.std())

In [None]:
#### Parts of the t-test comparison module
## Uses `results2test` and `resultopt` as set by the single t-test cell.

## get the unique combis selected
nonans = results2test['selection'].dropna()
combilist = nonans.unique().tolist()
print(combilist)

## create a dictionary of the combis and the np arrays of the result dfs
selection_dict = {}
for eachselect in combilist:
    selection_dict[eachselect] = np.array(results2test['participants'].loc[results2test['selection']==eachselect].tolist())

## iterate through different combinations of the combis, run t-tests, and store the results
tmpresult = []
for combo in combinations(combilist, 2):  # 2 for pairs, 3 for triplets, etc
    a1 = selection_dict[combo[0]]
    a2 = selection_dict[combo[1]]
    tmpstat, tmp_p = stats.ttest_ind(a1, a2)
    tmpdict = {"function 1":combo[0], "function 2":combo[1],
               "mean 1": a1.mean(), "mean 2": a2.mean(),
               "stdev 1": a1.std(), "stdev 2": a2.std(),
               "t-test": tmpstat, "p-val": tmp_p,
               "aggregation approach": "selection only"}
    tmpresult.append(tmpdict)

statisticsdf = pd.DataFrame(tmpresult)
print(statisticsdf.head(n=2))
## Name the file after the result set that was actually tested
## ('most_likely' -> ..._most.tsv, 'least_likely' -> ..._least.tsv). The name
## used to be hard-coded to "least" even when the most likely data was loaded.
statisticsdf.to_csv(os.path.join(resultpath,'user_choice_only_'+resultopt.split('_')[0]+'.tsv'),sep='\t',header=True)

### Investigate effect of different groupings (domain or scope) on selection

In [None]:
## List the distinct selections present in the data (missing values excluded)
selection_col = results2test['selection']
nonans = selection_col[selection_col.notna()]
combilist = nonans.unique().tolist()
print(combilist)

In [None]:
## Show the distinct values of each grouping column
for _col in ('scope', 'domain'):
    print(results2test[_col].unique().tolist())

In [None]:
## Grouping values are read from the data. The lists that used to be hard-coded were:
##   scope:  'very broad', 'broad', 'somewhat specific', 'very specific'
##   domain: 'Infectious Disease', 'Allergy', 'Immunology', 'General Biomedical',
##           'Cell types', 'Experimental techniques'
scopeopts, domainopts = (results2test[col].unique().tolist() for col in ('scope', 'domain'))

## Subset by each scope value in turn; only the last subset stays in `tmpgroup`.
for eachopt in scopeopts:
    scope_mask = results2test['scope'] == eachopt
    tmpgroup = results2test.loc[scope_mask]

#### Subset the results by groupings, then run t-tests

In [None]:
## Subset by scope, then t-test c1 against c2 within each subset
tmpgroup = results2test.loc[results2test['scope'] == 'broad']
tmpgroup2 = results2test.loc[results2test['scope'] == 'somewhat specific']
c1_broad, c2_broad = (
    np.array(tmpgroup['participants'].loc[tmpgroup['selection'] == label].tolist())
    for label in ('c1', 'c2')
)
c1_ss, c2_ss = (
    np.array(tmpgroup2['participants'].loc[tmpgroup2['selection'] == label].tolist())
    for label in ('c1', 'c2')
)
print("broad terms, c1 vs c2: ",stats.ttest_ind(c1_broad, c2_broad))
print("somewhat specific, c1 vs c2: ",stats.ttest_ind(c1_ss, c2_ss))

#### Investigate doing paired t-test based on grouping rather than subsetting then running t-tests
It's not clear that paired t-tests are appropriate in this case, so the functions above use independent t-tests instead; the cell below is exploratory only.

In [None]:
## Perform paired t-test based on grouping
tmpgroup = results2test.groupby(['selection','scope'])['participants'].sum().reset_index(name='votes')
print(tmpgroup.head(n=2))
## Line the two selections up by scope so that each pair really is the same scope.
## Pairing by position alone breaks (unequal lengths) or silently mismatches
## scopes whenever a scope is missing for one of the selections.
votes_by_scope = tmpgroup.pivot(index='scope', columns='selection', values='votes')
## reindex keeps the cell running if c1 or c2 is absent; dropna removes scopes
## that do not have votes for both selections
paired_votes = votes_by_scope.reindex(columns=['c1','c2']).dropna()
c1_group = np.array(paired_votes['c1'].tolist())
c2_group = np.array(paired_votes['c2'].tolist())

## Conduct paired t-test
tmpstat,tmp_p = stats.ttest_rel(c1_group,c2_group)
print(tmpstat,tmp_p)

## Old Methods -- Ignore

In [None]:
def run_t_tests(resultsdict,grouptype,resultopt,export='y'):
    """Superseded selection-only t-test runner, kept for reference.

    NOTE: this reuses the name ``run_t_tests`` with a different signature, so
    running this cell replaces the five-argument version defined earlier in
    the notebook. Re-run that earlier cell before calling it again.

    Parameters
    ----------
    resultsdict : dict
        Maps a result set name to a DataFrame with 'selection' and
        'participants' columns.
    grouptype : dict
        Must contain the key 'selection_only'; the value is looked up but not
        otherwise used.
    resultopt : str
        Key into ``resultsdict``.
    export : str, default 'y'
        'y' writes the results to a dated TSV under the notebook-level
        ``resultpath`` and returns None; any other value returns the results.

    Returns
    -------
    pandas.DataFrame or None
        One row per pair of selections.
    """
    run_date = datetime.now().strftime("%Y-%m-%d")
    aggopt = 'selection_only'
    aggname = grouptype[aggopt]
    frame = resultsdict[resultopt]
    labels = frame['selection'].dropna().unique().tolist()
    # participant counts for each selection, as numpy arrays
    samples = {
        label: np.array(frame['participants'].loc[frame['selection'] == label].tolist())
        for label in labels
    }
    rows = []
    for first, second in combinations(labels, 2):
        left, right = samples[first], samples[second]
        t_stat, p_value = stats.ttest_ind(left, right)
        rows.append({
            "function 1": first, "function 2": second,
            "mean 1": left.mean(), "mean 2": right.mean(),
            "stdev 1": left.std(), "stdev 2": right.std(),
            "t-test": t_stat, "p-val": p_value,
            "aggregation approach": aggopt, "run-date": run_date,
        })

    statisticsdf = pd.DataFrame(rows)
    print(statisticsdf.head(n=2))
    if export != 'y':
        return statisticsdf
    out_name = run_date + '_user_choice_only_' + resultopt + '.tsv'
    statisticsdf.to_csv(os.path.join(resultpath, out_name), sep='\t', header=True)

In [None]:
## Calls the old four-argument run_t_tests defined in the cell directly above
run_t_tests(resultsdict, grouptype, resultopt='most_likely', export='y')