# Towards Reproducibility of Interactive Retrieval Experiments

In [1]:
import pandas as pd
import numpy as np
import ast
from scipy import stats
from itertools import combinations
import json
from collections import Counter
from scipy.spatial.distance import jensenshannon
from scipy.stats import ks_2samp
from utils import *

## Load data

In [2]:
### Prepare data

# session-averaged data from the original experiment 
session_df = pd.read_csv('data/driventodistraction_sessiondata.csv')
# userdata: id + categories (high/low) for all characteristics
userdata = pd.read_csv('data/userdata_categories.csv')
# adding Perceptual Speed and Cognitive Failure categories to the session dataframe
session_df = session_df.merge(userdata,on=['user'],how='left')


# reported measures
tab3 = ['queries','clicked_per_query','snippets_per_query','clicked','saved','saved_rel','accuracy']
tab4 = ['sessiontime','query time mean','doc time mean','snippet time mean','first click','first save']
tab_dtd = tab3 + tab4

# conditions/groups
conditions = ['interface','pscat','cfcat']

# Markov model states
states = ['START','QUERY', 'TASK','SERP','SNIPPET','DOC','MARK', 'REVIEW', 'END']


In [3]:
### random splits

# create random splits
#random_splits = split_users(userdata, 1000, 'username')
#with open('data/splits/random_splits.json', 'w') as file:
#    json.dump(random_splits, file)

# load random splits 
with open('data/splits/random_splits.json', 'r') as file:
    random_splits = json.load(file)

In [29]:
# show tables from original experiment (different numbers because of preprocessed/differently processed data)
pd.set_option('display.max_columns', None)  # show all columns
display(show_original_results(tab_dtd,session_df,conditions))

Unnamed: 0_level_0,cond,queries,queries,clicked_per_query,clicked_per_query,snippets_per_query,snippets_per_query,clicked,clicked,saved,saved,saved_rel,saved_rel,accuracy,accuracy,sessiontime,sessiontime,query time mean,query time mean,doc time mean,doc time mean,snippet time mean,snippet time mean,first click,first click,first save,first save
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
0,ads,4.098039,4.248494,3.310193,2.490631,28.663086,25.099467,7.323529,2.629467,5.235294,2.068991,3.911765,1.72978,0.759158,0.215059,436.248627,227.597319,13.84423,8.230631,18.439789,14.503121,1.709315,0.710975,41.6221,28.293496,93.83099,71.372544
1,links,4.039216,3.674697,3.534946,3.10542,26.544236,27.308066,7.833333,3.264217,5.421569,2.479365,3.892157,1.939209,0.739154,0.231027,476.126373,261.75149,13.914137,11.294455,19.410161,14.84622,1.858755,0.940033,38.380714,29.435543,93.9397,81.583353
2,none,4.5,4.260688,3.466873,2.89824,28.362721,26.00743,7.862745,3.318234,5.529412,2.57061,3.813725,1.784035,0.727987,0.24485,452.675882,238.149232,12.580965,9.439814,18.984443,18.274433,1.618023,0.737993,39.360792,31.45277,90.808725,80.987173
3,recs,3.843137,3.020541,3.255494,2.909322,24.675785,20.225818,7.264706,3.245489,5.22549,2.277339,3.607843,1.791934,0.715975,0.239945,436.437647,237.574338,13.572387,8.871384,15.855799,13.74347,1.85817,1.169025,47.828713,34.279063,92.905098,69.235361
4,highps,4.219388,3.824058,3.273788,2.657829,27.518114,26.10476,7.673469,2.849293,5.260204,2.032719,3.938776,1.702585,0.766228,0.217303,440.307449,218.103258,11.339312,6.158785,20.672559,15.794502,1.661642,0.830966,39.729436,28.397455,100.194694,76.498985
5,lowps,4.154255,3.950581,3.504372,3.106849,26.171109,23.838404,7.595745,3.420039,5.526596,2.663747,3.723404,1.914785,0.701794,0.240334,464.679096,261.724008,15.882695,11.927601,16.205798,15.262815,1.817739,0.951516,44.418424,33.879347,86.659626,75.034445
6,highcf,3.978723,3.970422,3.819053,3.245245,30.384386,27.852119,7.712766,3.07525,5.601064,2.530118,4.005319,1.916708,0.740441,0.238765,468.836596,241.345519,13.060122,7.790069,18.056198,15.774214,1.843462,1.016042,40.498852,28.679502,93.106022,81.03827
7,lowcf,4.240909,3.707472,3.026835,2.413809,24.229022,21.462508,7.45,3.174132,5.140909,2.171454,3.636364,1.699999,0.73141,0.227854,434.593409,240.601773,13.828614,10.76838,18.26582,15.197735,1.690654,0.80586,42.941797,32.97453,92.657489,71.054296


In [30]:
# find significant differences (i.e., show p-values) regarding conditions (interface conditions, high/low ps, high/low cf) in the original experiment
pvals = pvals_table(tab_dtd,session_df,conditions)
display(pvals)
print('Significant effects:\n',[(pvals.index[i][0], pvals.columns[j]) for i, j in zip(*np.where(pvals<0.05))])

Unnamed: 0,Unnamed: 1,Unnamed: 2,queries,clicked_per_query,snippets_per_query,clicked,saved,saved_rel,accuracy,sessiontime,query time mean,doc time mean,snippet time mean,first click,first save
interface,ads,links,0.933766,0.97916,0.613438,0.676851,0.904483,0.795225,0.750171,0.829368,0.63739,0.616646,0.676257,0.474661,0.77686
interface,ads,none,0.933766,0.97916,0.815173,0.676851,0.904483,0.702738,0.750171,0.829368,0.491867,0.616646,0.639423,0.338946,0.77686
interface,ads,recs,0.933766,0.97916,0.613438,0.6944,0.904483,0.508535,0.750171,0.900896,0.711375,0.253244,0.869059,0.454004,0.902058
interface,links,none,0.933766,0.97916,0.621925,0.797336,0.904483,0.795225,0.915869,0.829368,0.63739,0.452071,0.418631,0.659421,0.77686
interface,links,recs,0.933766,0.97916,0.834375,0.676851,0.904483,0.589225,0.750171,0.829368,0.836931,0.233107,0.676257,0.162558,0.77686
interface,none,recs,0.933766,0.97916,0.686526,0.676851,0.904483,0.589225,0.750171,0.829368,0.63739,0.437485,0.639423,0.16169,0.77686
pscat,highps,lowps,0.441375,0.849234,0.48657,0.349929,0.582181,0.111322,0.010142,0.694867,0.000177,0.000439,0.162815,0.288668,0.036119
cfcat,highcf,lowcf,0.074752,0.043035,0.062295,0.181248,0.054041,0.046392,0.599454,0.116722,0.715422,0.910798,0.172562,0.999654,0.458125


Significant effects:
 [('pscat', 'accuracy'), ('pscat', 'query time mean'), ('pscat', 'doc time mean'), ('pscat', 'first save'), ('cfcat', 'clicked_per_query'), ('cfcat', 'saved_rel')]


## Level 1

In [31]:
# comparing key findings (significant effects) between original and reproduction

effects_per_ex = []
succ_per_repro = []
fails_per_repro = []
dflist = []

for i in range(1000):
        # create dataframe for each "study"
        A = session_df[session_df['user'].isin(random_splits[0][i])] # "original"
        B = session_df[session_df['user'].isin(random_splits[1][i])] # "reproduction"
        # find significant effects in each study
        A_p = pvals_table(tab_dtd,A,conditions)
        B_p = pvals_table(tab_dtd,B,conditions)
        # add the number of significant effects per experiment, successfully reproduced effects, and effects that failed to reproduce to the respective lists
        effects_per_ex.append(pd.to_numeric(A_p.stack(), errors='coerce').lt(0.05).sum())
        effects_per_ex.append(pd.to_numeric(B_p.stack(), errors='coerce').lt(0.05).sum())
        succ_per_repro.append(((A_p <= 0.05) & (B_p <= 0.05)).sum().sum())
        fails_per_repro.append(((A_p > 0.05) & (B_p <= 0.05)).sum().sum() + ((A_p <= 0.05) & (B_p > 0.05)).sum().sum())
        # create dataframe showing the number of studies in which a significant effect was observed and add it to the list (0: significant in none, 1: significant in either original or reproduction, 2: significant in both)
        dflist.append(pd.DataFrame(
                np.where((A_p > 0.05) & (B_p > 0.05), 0,   
                        np.where((A_p > 0.05) | (B_p > 0.05), 1,  
                                2)),
                        index=A_p.index,
                        columns=A_p.columns
                ))

print('average number of significant effects per study:',np.mean(effects_per_ex))
print('average number of successfully reproduced effects:',np.mean(succ_per_repro),'(max: '+str(np.max(succ_per_repro))+')')
print('average number of effects only present in one study:',np.mean(fails_per_repro),'(max: '+str(np.max(fails_per_repro))+')') # note: regardless of the study being the "original" or the "reproduction"

# stack dataframes and count occurences of 0, 1, and 2
stacked_array = np.stack([df.values for df in dflist], axis=2)
count_0 = (stacked_array == 0).sum(axis=2)
count_1 = (stacked_array == 1).sum(axis=2)
count_2 = (stacked_array == 2).sum(axis=2)

# combine into one dataframe
combined = pd.DataFrame([[(count_2[i, j], count_1[i, j], count_0[i, j]) for j in range(A_p.shape[1])] for i in range(A_p.shape[0])], index=A_p.index, columns=A_p.columns)

print('\nnumber of simulations with significant effects in (both studies, only one study, no study):')
display(combined)


average number of significant effects per study: 5.536
average number of successfully reproduced effects: 1.151 (max: 5)
average number of effects only present in one study: 8.77 (max: 18)

number of simulations with significant effects in (both studies, only one study, no study):


Unnamed: 0,Unnamed: 1,Unnamed: 2,queries,clicked_per_query,snippets_per_query,clicked,saved,saved_rel,accuracy,sessiontime,query time mean,doc time mean,snippet time mean,first click,first save
interface,ads,links,"(0, 0, 1000)","(0, 0, 1000)","(0, 1, 999)","(0, 1, 999)","(0, 0, 1000)","(0, 1, 999)","(0, 5, 995)","(0, 0, 1000)","(0, 0, 1000)","(0, 0, 1000)","(0, 0, 1000)","(0, 3, 997)","(0, 1, 999)"
interface,ads,none,"(0, 0, 1000)","(0, 0, 1000)","(0, 0, 1000)","(0, 0, 1000)","(0, 0, 1000)","(0, 4, 996)","(0, 4, 996)","(0, 0, 1000)","(0, 4, 996)","(0, 0, 1000)","(0, 0, 1000)","(0, 21, 979)","(0, 3, 997)"
interface,ads,recs,"(0, 1, 999)","(0, 2, 998)","(0, 0, 1000)","(0, 0, 1000)","(0, 0, 1000)","(0, 48, 952)","(0, 20, 980)","(0, 0, 1000)","(0, 0, 1000)","(0, 5, 995)","(0, 0, 1000)","(0, 8, 992)","(0, 0, 1000)"
interface,links,none,"(0, 0, 1000)","(0, 0, 1000)","(0, 1, 999)","(0, 1, 999)","(0, 0, 1000)","(1, 1, 998)","(0, 0, 1000)","(0, 0, 1000)","(0, 0, 1000)","(0, 0, 1000)","(0, 1, 999)","(0, 4, 996)","(0, 1, 999)"
interface,links,recs,"(0, 0, 1000)","(0, 0, 1000)","(0, 0, 1000)","(0, 3, 997)","(0, 0, 1000)","(0, 21, 979)","(0, 3, 997)","(0, 0, 1000)","(0, 0, 1000)","(0, 24, 976)","(0, 0, 1000)","(0, 56, 944)","(0, 1, 999)"
interface,none,recs,"(0, 0, 1000)","(0, 0, 1000)","(0, 0, 1000)","(0, 1, 999)","(0, 0, 1000)","(0, 10, 990)","(0, 0, 1000)","(0, 0, 1000)","(0, 1, 999)","(0, 1, 999)","(0, 0, 1000)","(0, 118, 882)","(0, 5, 995)"
pscat,highps,lowps,"(9, 128, 863)","(26, 38, 936)","(46, 154, 800)","(8, 215, 777)","(24, 95, 881)","(0, 310, 690)","(1, 840, 159)","(46, 97, 857)","(455, 545, 0)","(341, 659, 0)","(14, 400, 586)","(2, 185, 813)","(1, 621, 378)"
cfcat,highcf,lowcf,"(2, 456, 542)","(0, 564, 436)","(4, 582, 414)","(7, 326, 667)","(4, 562, 434)","(0, 516, 484)","(4, 32, 964)","(7, 441, 552)","(33, 64, 903)","(72, 36, 892)","(14, 376, 610)","(15, 16, 969)","(15, 127, 858)"


In [32]:
# average p values and difference between p values

for i in range(1000):
    # creating dataframes for each study
    A = session_df[session_df['user'].isin(random_splits[0][i])] # "original"
    B = session_df[session_df['user'].isin(random_splits[1][i])] # "reproduction"
    # p-values
    A_p = pvals_table(tab_dtd,A,['pscat','cfcat']) # only checking for PS and CF (see paper)
    B_p = pvals_table(tab_dtd,B,['pscat','cfcat']) # only checking for PS and CF (see paper)

    cumulative_p = A_p + B_p if i == 0 else cumulative_p + A_p + B_p

    diff = (A_p-B_p).abs()
    cumulative_diff = diff if i == 0 else cumulative_diff + diff
    
average_p = cumulative_p / 2000 # 1000 "originals" + 1000 "reproductions"
average_diff = cumulative_diff / 1000 

print('Average p-values across all studies:')
display(average_p)
print('Mean differences between p-values across all reproductions:')
display(average_diff)
    

Average p-values across all studies:


Unnamed: 0,Unnamed: 1,Unnamed: 2,queries,clicked_per_query,snippets_per_query,clicked,saved,saved_rel,accuracy,sessiontime,query time mean,doc time mean,snippet time mean,first click,first save
pscat,highps,lowps,0.472817,0.519122,0.435178,0.426784,0.47179,0.341565,0.143758,0.453029,0.076806,0.093511,0.350503,0.433592,0.254976
cfcat,highcf,lowcf,0.300022,0.259239,0.291695,0.374533,0.285343,0.262022,0.553713,0.335783,0.470462,0.45549,0.350803,0.544805,0.466854


Mean differences between p-values across all reproductions:


Unnamed: 0,Unnamed: 1,Unnamed: 2,queries,clicked_per_query,snippets_per_query,clicked,saved,saved_rel,accuracy,sessiontime,query time mean,doc time mean,snippet time mean,first click,first save
pscat,highps,lowps,0.410798,0.155421,0.356008,0.449242,0.32724,0.471174,0.234883,0.243358,0.149866,0.18039,0.489006,0.475155,0.4165
cfcat,highcf,lowcf,0.449077,0.413217,0.471628,0.496464,0.460078,0.406061,0.34594,0.489931,0.239142,0.107343,0.477661,0.072003,0.40223


## Level 2

In [34]:
# comparing measurements from reported measures between original and reproduction

# check for significant differences in one reproduction attempt
def compare_measurements_l2(df1, df2, tab):
    r = {}
    for col in tab:
        l1 = list(df1[col])
        l2 = list(df2[col])
        # Mann Whitney U test
        _, p_val = stats.mannwhitneyu([x for x in l1 if not np.isnan(x)], [x for x in l2 if not np.isnan(x)], alternative='two-sided')
        r[col] = p_val
    # correcting p values using Benjamini Hochberg
    p_values = list(r.values())
    keys = list(r.keys())
    _, pvals_corrected, _, _ = sm.stats.multipletests(p_values, method='fdr_bh')
    r = dict(zip(keys, pvals_corrected))
    # return a list with corrected p-values for each metric
    return r

In [35]:
count_l2_diffs = Counter()

pvals_l2 = []
for i in range(1000):
    # creating dataframes for each group (original/reproduction)
    A = session_df[session_df['user'].isin(random_splits[0][i])]
    B = session_df[session_df['user'].isin(random_splits[1][i])]

    # adding list of p-values for all metrics to list for all reproductions
    pvals_l2.append(compare_measurements_l2(A,B,tab_dtd))

# for each measure, count how many reproductions show significant differences between the groups
for d in pvals_l2:
    for key, value in d.items():
        if value < 0.05: 
            count_l2_diffs[key] += 1

# print results
for key in tab_dtd:
    if key in count_l2_diffs.keys():
        print(f"Significant differences for '{key}': {count_l2_diffs[key]}")
    else:
        print(f"Significant differences for '{key}': 0")

Significant differences for 'queries': 43
Significant differences for 'clicked_per_query': 41
Significant differences for 'snippets_per_query': 88
Significant differences for 'clicked': 72
Significant differences for 'saved': 77
Significant differences for 'saved_rel': 25
Significant differences for 'accuracy': 20
Significant differences for 'sessiontime': 87
Significant differences for 'query time mean': 73
Significant differences for 'doc time mean': 90
Significant differences for 'snippet time mean': 118
Significant differences for 'first click': 38
Significant differences for 'first save': 47


## Level 3

In [None]:
# compare user behaviour (times spent per state and transitions) between original and reproduction

### Times

In [36]:
def compare_times_l3(df1, df2, states):
    r = {}
    for s in states:
        col = s.lower()+' time mean' # corresponding column name in dataframe
        A_times = list(df1[col])
        B_times = list(df2[col])
        _, p_val = stats.mannwhitneyu([x for x in A_times if not np.isnan(x)], [x for x in B_times if not np.isnan(x)], alternative='two-sided')
        r[s] = p_val
    # apply Benjamini-Hochberg correction 
    keys = list(r.keys())
    p_values = list(r.values())
    _, pvals_corrected, _, _ = sm.stats.multipletests(p_values, method='fdr_bh')
    r = dict(zip(keys, pvals_corrected))
    return r

In [None]:
count_l3_times = Counter()
pvals_l3_times = []

for i in range(1000):
    A = session_df[session_df['user'].isin(random_splits[0][i])].copy() # "original"
    B = session_df[session_df['user'].isin(random_splits[1][i])].copy() # "reproduction"

    pvals_l3_times.append(compare_times_l3(A,B,states))

# for each state, count how many reproductions show significant differences between the groups (regarding time spent in this state)
for d in pvals_l3_times:
    for key, value in d.items():
        if value < 0.05: 
            count_l3_times[key] += 1

# print results
for s in states:
    if s in count_l3_times.keys():
        print(f"Significant differences for '{s}': {count_l3_times[s]}")
    else:
        print(f"Significant differences for '{s}': 0")


Significant differences for 'START': 97
Significant differences for 'QUERY': 95
Significant differences for 'TASK': 65
Significant differences for 'SERP': 131
Significant differences for 'SNIPPET': 139
Significant differences for 'DOC': 107
Significant differences for 'MARK': 82
Significant differences for 'REVIEW': 48
Significant differences for 'END': 72


### Transitions

In [39]:
# create a model based on all sessions in df
def create_transition_df(df):
    dicts = list(df['session_transition_dict']) # get transition dictionaries from sessions
    dfs = [pd.DataFrame(ast.literal_eval(d)) for d in dicts] # dictionaries to dataframes
    normalized_dfs = [df.div(df.sum(axis=1), axis=0) for df in dfs] # normalize
    concatenated_df = pd.concat(normalized_dfs) # concatenate
    result_df = concatenated_df.groupby(concatenated_df.index).sum() # group
    result_df = result_df[states] # sort columns
    result_df = result_df.loc[states] # sort rows
    result_df_normal = result_df.div(result_df.sum(axis=1), axis=0) # normalize result
    result_df_normal = result_df_normal.fillna(0)
    result_df_normal.drop(['END'],inplace=True) # drop row "END" because there are no outgoing transitions
    return result_df_normal


In [40]:
frobenius = []
jsd = []
ks = []

for i in range(1000):
    A = session_df[session_df['user'].isin(random_splits[0][i])] # "original"
    B = session_df[session_df['user'].isin(random_splits[1][i])] # "reproduction"
    lmdf1 = create_transition_df(A)
    lmdf2 = create_transition_df(B)
    M1 = lmdf1.to_numpy()
    M2 = lmdf2.to_numpy()

    # Frobenius Norm
    frobenius_norm = np.linalg.norm(lmdf1.values - lmdf2.values, 'fro')
    frobenius.append(frobenius_norm)

    # Jensen-Shannon Divergence for each row
    js_divergences = [jensenshannon(M1[j], M2[j]) for j in range(len(M1))]
    jsd.append(js_divergences)

    # Kolmogorov-Smirnov Test for each row
    ks_tests = [ks_2samp(M1[j], M2[j])[1] for j in range(len(M1))]
    ks.append(ks_tests)

print('mean, min, and max values')
print('Frobenius:',np.mean(frobenius),np.min(frobenius),np.max(frobenius))
states_tab4 = ['START', 'TASK', 'SERP', 'SNIPPET', 'DOC', 'MARK', 'REVIEW'] # no query, no end (query has only one successor, end has no outgoing transitions)
for i in range(len(states_tab4)):
    print(states_tab4[i])
    jsdlist = []
    for j in range(len(jsd)):
        jsdlist.append(jsd[j][i])
    print('JSD', np.mean(jsdlist),np.min(jsdlist),np.max(jsdlist))
    kslist = []
    for j in range(len(ks)):
        kslist.append(ks[j][i])
    print('KS', np.mean(kslist),np.min(kslist),np.max(kslist))

mean, min, and max values
Frobenius: 0.19109773006791797 0.06313278471739471 0.4297194585236102
START
JSD 0.05887067120335691 0.0 0.18838697532721632
KS 1.0 1.0 1.0
TASK
JSD 0.0 0.0 0.0
KS 1.0 1.0 1.0
SERP
JSD 0.10325207920724257 0.028728063534325627 0.19774025698421036
KS 0.9817514603044014 0.7301110654051831 1.0
SNIPPET
JSD 0.03981737524904248 0.010598969488078883 0.08991784435221083
KS 0.996925051419169 0.9894693541752365 1.0
DOC
JSD 0.026365058513525334 0.0043200689355435765 0.08550334379830614
KS 0.9953072398190045 0.7301110654051831 1.0
MARK
JSD 0.03290230123258255 0.005151596182395145 0.09784337202534515
KS 0.9993997531879885 0.9894693541752365 1.0
REVIEW
JSD 0.03505799805656771 0.004507063925110064 0.09028910116629964
KS 0.9998315096668039 0.9894693541752365 1.0


## Stratification

In [41]:
characteristics = ['pscat','cfcat','agecat','bfcat','sfcat','ofcat']
all_combinations = [list(comb) for r in (1,2,3) for comb in combinations(characteristics, r)]
all_combinations.remove(['pscat', 'bfcat', 'ofcat']) # least populated class has only 1 member

#for strat in all_combinations:
#    stratified_splits = split_users(userdata, 100 ,strat)
#    with open('_'.join(strat)+'.json', 'w') as file:
#        json.dump(data/splits/stratified_splits, file)

### Level 1

In [42]:
# selection of characteristics and combinations of characteristics to display
strats_l1 = [['pscat'],['cfcat'],['agecat'],['bfcat'],['sfcat'],['ofcat'],['pscat','cfcat'],['pscat','cfcat','agecat'],['pscat','cfcat','ofcat'],['pscat','cfcat','sfcat'],['pscat','cfcat','bfcat']]

# show level 1 results for selected stratified splits (only average number of effects and average number of successfully/unsuccessfully reproduced effects)
for strat in strats_l1:
    with open('data/splits/'+'_'.join(strat)+'.json', 'r') as file:
        stratified_splits = json.load(file)
        print(strat)
        level1_findings(session_df,tab_dtd,conditions,stratified_splits)
        print()


['pscat']
average number of significant effects per study: 5.67
average number of successfully reproduced effects: 1.29
average number of effects only present in one study: 8.76

['cfcat']
average number of significant effects per study: 5.72
average number of successfully reproduced effects: 1.25
average number of effects only present in one study: 8.94

['agecat']
average number of significant effects per study: 5.565
average number of successfully reproduced effects: 1.21
average number of effects only present in one study: 8.71

['bfcat']
average number of significant effects per study: 5.695
average number of successfully reproduced effects: 1.22
average number of effects only present in one study: 8.95

['sfcat']
average number of significant effects per study: 5.55
average number of successfully reproduced effects: 1.19
average number of effects only present in one study: 8.72

['ofcat']
average number of significant effects per study: 5.48
average number of successfully reprodu

### Level 2 & Level 3

In [43]:
# show results for other levels only for PS+CF (which was the most successful at level 1)

#strats_l2l3 = [['pscat'],['cfcat'],['pscat','cfcat']]
strats_l2l3 = [['pscat','cfcat']]
for strat in strats_l2l3:
    with open('data/splits/'+'_'.join(strat)+'.json', 'r') as file:
        stratified_splits = json.load(file)
        print(strat)
        print('Level 2:')
        level2_measurements(session_df,tab_dtd,stratified_splits)
        print('\nLevel 3 -- times:')
        level3_times(session_df,states,stratified_splits)
        print('\nLevel 3 -- transitions:')
        level3_transitions(session_df,states,stratified_splits)
        print()

['pscat', 'cfcat']
Level 2:
Significant differences for 'queries': 4
Significant differences for 'clicked_per_query': 5
Significant differences for 'snippets_per_query': 10
Significant differences for 'clicked': 8
Significant differences for 'saved': 11
Significant differences for 'saved_rel': 5
Significant differences for 'accuracy': 1
Significant differences for 'sessiontime': 12
Significant differences for 'query time mean': 6
Significant differences for 'doc time mean': 9
Significant differences for 'snippet time mean': 10
Significant differences for 'first click': 2
Significant differences for 'first save': 5

Level 3 -- times:
Significant differences for 'START': 10
Significant differences for 'QUERY': 6
Significant differences for 'TASK': 3
Significant differences for 'SERP': 16
Significant differences for 'SNIPPET': 12
Significant differences for 'DOC': 10
Significant differences for 'MARK': 13
Significant differences for 'REVIEW': 7
Significant differences for 'END': 6

Level 