In [7]:
from collections import Counter, defaultdict
from math import sqrt, pow, ceil
from decimal import Decimal
from scipy import stats
import pandas as pd
import numpy as np
import itertools
import random
import copy 

## Global parameters

In [8]:
binary_relevance = [0,1]
cut_off = 3

## Step 1: Simulate Rankings of Relevance for E and P

We make no assumptions regarding the identities of the documents in the rankings of E and P. This implies that the documents in the two rankings can be distinct or that they can overlap. To incorporate this into our simulation, we generate not only all possible pairs of relevance rankings, but for each pair we also generate all the possible ways that their documents could overlap. We implement this by assigning ids to the documents in each ranking, with overlap occurring if two documents in different rankings have the same id.


In [9]:
def identify_docs(docs1, docs2):  
  """
  Take two lists of document relevance scores and generate all possible combinations of overlap
  between the two lists.
  :param docs1: ranking 1 relevance scores
  :param docs2: ranking 2 relevance scores
  :return: list of lists of tuples with all possible overlapping schemes
  """
  
  # Generate all possible document identifiers for irrelevant documents
  zero_ids1 = list(range(docs1.count(0))) 
  if len(zero_ids1) == 0:
    zero_ids2 = [[-1] * docs2.count(0)]  
  else:
    zero_ids2 = list(itertools.permutations(list(range(6)), docs2.count(0)))
    zero_ids2 = sorted([[-1 if x>=docs1.count(0) else x for x in ids] for ids in zero_ids2])
    zero_ids2 = list(ids for ids,_ in itertools.groupby(zero_ids2))  
  
  # Generate all possible document identifiers for relevant documents
  one_ids1 = list(range(docs1.count(1))) 
  if len(one_ids1) == 0:
    one_ids2 = [[-1] * docs2.count(1)]
  else:
    one_ids2 = list(itertools.permutations(list(range(6)), docs2.count(1)))
    one_ids2 = sorted([[-1 if x>=docs1.count(1) else x for x in ids] for ids in one_ids2])
    one_ids2 = list(ids for ids, _ in itertools.groupby(one_ids2))

  # Label the documents of ranking 1
  ranking1 = []
  zero_count = one_count = 0
  for doc in docs1:
    if doc == 0:
      ranking1.append((doc, zero_ids1[zero_count]))
      zero_count += 1
    else:
      ranking1.append((doc, one_ids1[one_count]))
      one_count += 1
      
  # Label the documents of ranking 2
  labelled_rankings = []
  for zero_ids in zero_ids2:
    for one_ids in one_ids2:
      ranking2 = []
      zero_count = one_count = 0
      for doc in docs2:
        if doc == 0:
          ranking2.append((doc, zero_ids[zero_count]))
          zero_count += 1
        else:
          ranking2.append((doc, one_ids[one_count]))
          one_count += 1        
      labelled_rankings.append([ranking1, ranking2])

  return labelled_rankings

In [10]:
# Generate all possible pairs of relevance rankings
system_e = list(map(list, itertools.product(binary_relevance, repeat=cut_off)))
system_p = list(map(list, itertools.product(binary_relevance, repeat=cut_off)))

# Expand the set of relevance rankings pairs to all possible overlapping configurations 
ranking_pairs = [list(ranking) for ranking in list(itertools.product(system_e, system_p))]
labelled_rankings = [ranking for docs in ranking_pairs for ranking in identify_docs(docs[0], docs[1])]
labelled_rankings[:5]

## Step 2: Defining Expected Reciprocal Rank @ Cut-off (ERR@-)

In [11]:
mapping_relevance_to_probability = lambda pos_g, max_g: ((2**pos_g) - 1 ) / (2**max_g)

In [12]:
def ERR(ranking, mapping=mapping_relevance_to_probability, n=cut_off):
    p, err = 1, 0
  
    for r in range(0, n):
        R = mapping_relevance_to_probability(ranking[r], 1)
        err += (p * (R / (r + 1)))
        p *= (1 - R)
        
    return err

## Step 3: Interleaving

The rankings produced by interleaving are cut-off at 3, since this is the maximum number of results a user can see. Using unique id labels, we make sure that there are no two identical documents in the interleaved rankings, as a part of both team draft and probabilistic interleaving algorithms.   



In [13]:
def team_draft_interleaving(ranking_input):
    """
    Generates the interleaved ranking based on semi-stochastical choice of a "team" that drafts and on deterministical 
    choice of the draft pick.
    Args: 
        ranking_input (list): two rankings proposed by each system with corresponding relevance and ids as tuples.
    
    returns a dict with ranking position as keys and list with team name and relevance as values
    """
    #sorting by relevance:
    ranking=copy.deepcopy(ranking_input)
    ranking_a, ranking_b = ranking[0], ranking[1]
    team_a, team_b = 0, 0
    # I_dict has rankings number as keys and triplets of [ str "Team_Name", int relevance, int id]
    I_dict = {"1": [],"2": [], "3": []} 
    for iteration in range(3):  # not the same stopping condition as in the paper, since there is a limit on the size of I
        
        #check to determine which team is drafting
        if (team_a < team_b) or ((team_a == team_b) and np.random.randint(2) == 1):
            #assignment 
            rank=ranking_a[0][0]
            id_a=ranking_a[0][1]
            I_dict[str(iteration+1)] = ["A",rank]
            team_a += 1
            #deletes duplicates
            for i,(rank_b,id_b) in enumerate(ranking_b):
                if id_a==id_b:
                    del ranking_b[i]
            #delete used ranking
            del ranking_a[0]
                    
                        
        else:
            #assignment 
            rank=ranking_b[0][0]
            id_b=ranking_b[0][1]
            I_dict[str(iteration+1)] = ["B",rank]
            team_b += 1
            #deletes duplicates
            for i,(rank_a,id_a) in enumerate(ranking_a):
                if id_b==id_a:
                    del ranking_a[i]
            #delete used ranking
            del ranking_b[0]
                        
    return I_dict
team_draft_interleaving([[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, 2), (0, -1)]])

{'1': ['A', 0], '2': ['B', 0], '3': ['B', 0]}

In [14]:
def softmax(d):
    """
    Generates a softmax probability distribution based on the number of ranks and on their ranks.
    Args:
        d (list): a list with ranks
    returns a dict with ranking position as keys and probability of adding a rank to the interleaving as values
    """
    denominator=0
    
    length=len(d)
    for i in range(length):
        denominator+=1/((i+1)**tau)
    soft={}
    for i in range(length):
        soft[i]=(1/((i+1)**tau)) / denominator
    return soft

def probabilistic_interleaving(ranking_input):
    """
    Generates the interleaved ranking based on semi-stochastical choice of a "team" that drafts and on stochastic 
    choice of the draft pick.
    Args: 
        ranking_input (list): two rankings proposed by each system with corresponding relevance and ids as tuples.
    
    returns a dict with ranking position as keys and list with team name and relevance as values
    """
    ranking=copy.deepcopy(ranking_input)
    #sorting by relevance:
    ranking_a, ranking_b = ranking[0], ranking[1]
    team_a, team_b = 0, 0
    # I_dict has rankings number as keys and triplets of [ str "Team_Name", int relevance, int id]
    I_dict = {"1": [],"2": [], "3": []} 
    for iteration in range(3):  # not the same stopping condition as in the paper, since there is a limit on the size of I
        
        #check to determine which team is drafting
        if (team_a < team_b) or ((team_a == team_b) and np.random.randint(2) == 1):
            #generating distribution and choosing
            distribution=softmax(ranking_a)
            choice=np.random.choice(list(distribution.keys()), p=list(distribution.values()))
            #assignment
            id_a=ranking_a[choice][1]
            I_dict[str(iteration+1)] = ["A",ranking_a[choice][0]]
            team_a += 1
            #deleting duplicates
            for i,(rank_b,id_b) in enumerate(ranking_b):
                if id_a==id_b:
                    del ranking_b[i]
            
            del ranking_a[choice]
            
                        
        else:
            #generating distribution and choosing
            distribution=softmax(ranking_b)
            choice=np.random.choice(list(distribution.keys()), p=list(distribution.values()))
            #assignment
            id_b=ranking_b[choice][1]
            I_dict[str(iteration+1)] = ["B",ranking_b[choice][0]]
            team_b += 1
            #deleting duplicates
            for i,(rank_a,id_a) in enumerate(ranking_a):
                if id_b==id_a:
                    del ranking_a[i]
            
            del ranking_b[choice]
                        
    return I_dict
probabilistic_interleaving([[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, 2), (0, -1)]])

{'1': ['B', 0], '2': ['A', 0], '3': ['B', 0]}

## Step 4: Simulate User Clicks

We implement a position-based model for clicking and train it with 100 iterations of the EM algorithm on the Yandex click log. We assume that documents clicked in a session can refer back to any of the queries in the same session, not just the most recent query (this assumption was communicated to us by our TA). After training the model we use the trained examination parameters for rank 1 to 3 and our own attractiveness parameters (0.1 for irrelevant documents and 0.9 for relevant documents) to simulate position-based clickes. We also implement the option of a random click as a sanity test.

In [15]:
class PositionBasedModel:

    def __init__(self, stored_gammas=False):
        self.gamma = defaultdict(float) if not stored_gammas else stored_gammas
        self.alpha = defaultdict(lambda: defaultdict(float))

    def train(self, training_file = "YandexRelPredChallenge.txt", iterations=10):

        # Read data into dataframe
        columns = ["SessionID", "TimePassed", "TypeOfAction", "TargetID", "RegionID", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        df = pd.read_csv(training_file, sep='\t', header=None, names=columns)
        print("Training with EM...")

        for i in range(iterations):

             # Initialise sums and counts
            gamma_count = defaultdict(lambda: 2)
            alpha_count = defaultdict(lambda: defaultdict(lambda: 2))

            gamma_sum = defaultdict(lambda: 1)
            alpha_sum = defaultdict(lambda: defaultdict(lambda: 1))


            # Iterate sessions
            grouped = df.groupby("SessionID")
            for session_id, session_df in grouped:

                # Extract session clicks
                session_clicks = session_df[session_df["TypeOfAction"] == "C"]["TargetID"].tolist()

                # Iterate session queries
                query_df = session_df[session_df["TypeOfAction"] == "Q"]
                for j, (index, row) in enumerate(query_df.iterrows()):

                    query_id = row["TargetID"]
                    for rank in range(1, 11):  # from rank 1 to 10
                        document_id = row[rank]

                        # Determine what values should be added to the EM formula sums
                        if document_id in session_clicks:
                            gamma_value = alpha_value = 1
                        else:
                            gamma_value = (self.gamma[rank] * (1 - self.alpha[document_id][query_id])) / \
                                          (1 - self.gamma[rank] * self.alpha[document_id][query_id])
                            alpha_value = ((1 - self.gamma[rank]) * self.alpha[document_id][query_id]) / \
                                          (1 - self.gamma[rank] * self.alpha[document_id][query_id])

                        gamma_sum[rank] += gamma_value
                        alpha_sum[document_id][query_id] += alpha_value

                        gamma_count[rank] += 1
                        alpha_count[document_id][query_id] += 1

            # Update variables
            for rank, param in self.gamma.items():
                self.gamma[rank] = gamma_sum[rank] / gamma_count[rank]

            for document_id, document_params in self.alpha.items():
                for query_id, param in document_params.items():
                    self.alpha[document_id][query_id] = alpha_sum[document_id][query_id] / alpha_count[document_id][query_id]

            print("Completed iteration", i+1)
        print("Training complete")

    def click_prob(self, epsilon, rank, relevance):
        # Calculate the probability of clicking on a document
        attract = epsilon if relevance == 0 else 1-epsilon
        click_prob = float(self.gamma[rank]) * attract
        return click_prob

    def click_doc(self, epsilon, rank, relevance):
        # Decide whether a document is clicked on
        random_number = random.uniform(0, 1)
        return True if random_number < self.click_prob(epsilon, rank, relevance) else False
    
    def click_random(self, p=1./3.):
        # Decide whether a document is clicked based on a random click model where each
        # document is clicked with probability p
        random_number = random.uniform(0, 1)
        return True if random_number < p else False

In [16]:
# trained gammas for 100 iterations
gamma_100iterations = defaultdict(float, {1: 0.9998616618119641, 
                                          2: 0.6898705149478165, 
                                          3: 0.4737488629423336, 
                                          4: 0.35795929106830376, 
                                          5: 0.27471953217002687, 
                                          6: 0.22844461594225668, 
                                          7: 0.20124843776845872, 
                                          8: 0.17975265660898382, 
                                          9: 0.16342837688027292, 
                                          10: 0.16113079695259283})
model = PositionBasedModel(stored_gammas=gamma_100iterations)
# model.train(iterations=100)

In [17]:
model.gamma

defaultdict(float,
            {1: 0.9998616618119641,
             2: 0.6898705149478165,
             3: 0.4737488629423336,
             4: 0.35795929106830376,
             5: 0.27471953217002687,
             6: 0.22844461594225668,
             7: 0.20124843776845872,
             8: 0.17975265660898382,
             9: 0.16342837688027292,
             10: 0.16113079695259283})

In [18]:
model.click_prob(epsilon=0.1, rank=1, relevance=1)

0.8998754956307677

In [19]:
model.click_prob(epsilon=0.1, rank=10, relevance=0)

0.016113079695259283

In [20]:
model.click_doc(epsilon=0.1, rank=1, relevance=1)

True

In [21]:
model.click_doc(epsilon=0.1, rank=2, relevance=1)

True

In [22]:
z1_alpha = stats.norm.ppf(0.95) # alpha = 0.05
z1_beta = stats.norm.ppf(0.9) # beta = 0.1
def compute_impressions(p1, p0=0.5, z1_alpha=z1_alpha, z1_beta=z1_beta):

    """"
    Compute the amount of impressions needed according to the z-test.
    
    Args: 
        p1: proportion of wins to be tested for
        p0: standard value to test against
        z1_alpha: signifiance value
        z1_beta: significance value for a power of 1-beta
    
    returns the amount of impressions n to needed to prove significance.
    """
    
    null = z1_alpha * (sqrt(p0 * (1 - p0)))
    alternative = z1_beta * (sqrt(p1 * (1 - p1)))
    n = ceil(pow(((null + alternative) / (p1 - p0)), 2))
    
    return n

In [35]:
def power_analysis(interleaving_method, click_model, ranking, n_simulations, random=False, include_ties=False):
    wins_e, wins_p, ties = 0, 0, 0
    
    for i in range(n_simulations):
        click_e, click_p = 0, 0
        interleaving = interleaving_method(ranking)
        generated_list = [(value[0], value[1]) for value in interleaving.values()]
        
        for rank, (team, relevance) in enumerate(generated_list):
            if not random:
                click = click_model.click_doc(epsilon=0.1, rank=rank+1, relevance=relevance)
            else:
                click = click_model.click_random()
            
            if click:
                if 'A' in team:
                    click_e += 1
                else:
                    click_p += 1
        
        if click_e > click_p:
            
            wins_e += 1
            
        elif click_p > click_e:
            
            wins_p +=1
            
        else:
            
            wins_p += 1
            wins_e += 1
                    
    if include_ties:
        proportion_e = wins_e / (wins_e + wins_p  + ties)
    else:
        proportion_e = wins_e / (wins_e + wins_p)
        
    impressions = compute_impressions(proportion_e)
            
    return impressions


In [33]:
def calculate_significance(interleaving_method, table, labelled_rankings=labelled_rankings, metric=ERR, click_model=model, n_simulations=10**3, random=False, include_ties=False):
    
    """"
       
    Construct a table for intervals of the ERR metric. For each of these intervals, computes the mean, min and max. 
    
    Args: 
        interleaving_method: interleaving function of interest
        table: dict to bin and store results in
        labelled_rankings: dataset that contains all the possible ranking pairs
        metric: metric to compute the accuracy of a given ranking for
        click_model: model for generating user clicks
        n_simulations: amount of simulations desired
        random: whether to generate random clicks or not
        include_ties: specify how to compute the proportion of wins
    
    returns a binned pandas dataframe for given intervals of ERR and statistics from computed impressions
    """
    
    
    x = len(labelled_rankings)
    
    for i, ranking in enumerate(labelled_rankings):
        if i % 100 == 0:
            print('processing %s/%s'%(i,x))
        system_e = ranking[0]
        system_p = ranking[1]
        
        relevances_e = [e[0] for e in system_e]
        relevances_p = [p[0] for p in system_p]
        
        metric_e = metric(relevances_e)
        metric_p = metric(relevances_p)
        
        delta_metric = metric_e - metric_p
        
        if delta_metric >= 0:
            
            try:
                impressions = power_analysis(interleaving_method, click_model, ranking, n_simulations, random=random, include_ties=include_ties)
                table[delta_metric].append(impressions)
            except:
                continue
                
    statistics_table = dict()
    table = pd.DataFrame.from_dict(table, orient='index')   
    bins =  np.insert(np.insert(np.arange(0.1, 1, 0.1), 0, 0.05, axis=0), 10, 0.95, axis=0)
    groups = table.groupby(pd.cut(table.index, bins, right=False), axis=0) #.agg(['min', 'mean', 'max'])
    for interval, data in groups:
        values_in_bin = [value for value in data.values.flatten().tolist() if not np.isnan(value)]
        if values_in_bin:
            statistics = dict(min=np.min(values_in_bin), median=np.median(values_in_bin), max=np.max(values_in_bin))
            statistics_table[interval] = statistics
        else:
            statistics = dict(min=0, median=0, max=0)
            statistics_table[interval] = statistics
        
    
    statistics_table = pd.DataFrame.from_dict(statistics_table, orient='index')
        
    return statistics_table

### Random comparison

In [25]:
team_draft_table = defaultdict(lambda: list())
probabilistic_table = defaultdict(lambda: list())

team_draft_table = calculate_significance(team_draft_interleaving, team_draft_table, n_simulations=10**4, random=True)

print(team_draft_table)
print(team_draft_table.to_latex())

processing 0/688
processing 100/688
processing 200/688
processing 300/688
processing 400/688
processing 500/688


  import sys


processing 600/688
                  min    median          max
[0.05, 0.1)   17176.0  125270.5   16893215.0
[0.1, 0.2)     5744.0   53493.5   29914894.0
[0.2, 0.3)     4880.0  136251.0   67524768.0
[0.3, 0.4)    10140.0  113657.5    7348578.0
[0.4, 0.5)    15120.0  524383.0    4202279.0
[0.5, 0.6)    12508.0  151049.0  270580243.0
[0.6, 0.7)   100140.0  168528.5     423732.0
[0.7, 0.8)        0.0       0.0          0.0
[0.8, 0.9)        0.0       0.0          0.0
[0.9, 0.95)       0.0       0.0          0.0
\begin{tabular}{lrrr}
\toprule
{} &       min &    median &          max \\
\midrule
[0.05, 0.1) &   17176.0 &  125270.5 &   16893215.0 \\
[0.1, 0.2)  &    5744.0 &   53493.5 &   29914894.0 \\
[0.2, 0.3)  &    4880.0 &  136251.0 &   67524768.0 \\
[0.3, 0.4)  &   10140.0 &  113657.5 &    7348578.0 \\
[0.4, 0.5)  &   15120.0 &  524383.0 &    4202279.0 \\
[0.5, 0.6)  &   12508.0 &  151049.0 &  270580243.0 \\
[0.6, 0.7)  &  100140.0 &  168528.5 &     423732.0 \\
[0.7, 0.8)  &       0.0

In [26]:
probabilistic_table = calculate_significance(probabilistic_interleaving, probabilistic_table, n_simulations=10**4, random=True)

processing 0/688
processing 100/688
processing 200/688
processing 300/688
processing 400/688


  import sys


processing 500/688
processing 600/688


In [27]:
print(probabilistic_table)
print(probabilistic_table.to_latex())

                  min    median          max
[0.05, 0.1)   16788.0  100101.5    5296479.0
[0.1, 0.2)    12610.0  336289.5   64669924.0
[0.2, 0.3)     4790.0  122447.5  261227563.0
[0.3, 0.4)     6561.0  137252.0   64481794.0
[0.4, 0.5)     9745.0  212379.0   29426062.0
[0.5, 0.6)    16910.0  170378.0  265597112.0
[0.6, 0.7)   157769.0  173208.5     188648.0
[0.7, 0.8)        0.0       0.0          0.0
[0.8, 0.9)        0.0       0.0          0.0
[0.9, 0.95)       0.0       0.0          0.0
\begin{tabular}{lrrr}
\toprule
{} &       min &    median &          max \\
\midrule
[0.05, 0.1) &   16788.0 &  100101.5 &    5296479.0 \\
[0.1, 0.2)  &   12610.0 &  336289.5 &   64669924.0 \\
[0.2, 0.3)  &    4790.0 &  122447.5 &  261227563.0 \\
[0.3, 0.4)  &    6561.0 &  137252.0 &   64481794.0 \\
[0.4, 0.5)  &    9745.0 &  212379.0 &   29426062.0 \\
[0.5, 0.6)  &   16910.0 &  170378.0 &  265597112.0 \\
[0.6, 0.7)  &  157769.0 &  173208.5 &     188648.0 \\
[0.7, 0.8)  &       0.0 &       0.0 &     

### PBM Comparison

In [28]:
team_draft_table = defaultdict(lambda: list())
probabilistic_table = defaultdict(lambda: list())

# if not 'random=True' is passed, the PBM model is used
team_draft_table = calculate_significance(team_draft_interleaving, team_draft_table, n_simulations=10**4)

print(team_draft_table)
print(team_draft_table.to_latex())

processing 0/688
processing 100/688
processing 200/688
processing 300/688
processing 400/688
processing 500/688
processing 600/688
              min  median        max
[0.05, 0.1)  26.0    97.0  8007956.0
[0.1, 0.2)   11.0    92.5  9567947.0
[0.2, 0.3)    6.0    12.0     2263.0
[0.3, 0.4)    5.0    11.0      118.0
[0.4, 0.5)    5.0    10.0       12.0
[0.5, 0.6)    5.0     6.0       11.0
[0.6, 0.7)    5.0     5.0        6.0
[0.7, 0.8)    0.0     0.0        0.0
[0.8, 0.9)    0.0     0.0        0.0
[0.9, 0.95)   0.0     0.0        0.0
\begin{tabular}{lrrr}
\toprule
{} &   min &  median &        max \\
\midrule
[0.05, 0.1) &  26.0 &    97.0 &  8007956.0 \\
[0.1, 0.2)  &  11.0 &    92.5 &  9567947.0 \\
[0.2, 0.3)  &   6.0 &    12.0 &     2263.0 \\
[0.3, 0.4)  &   5.0 &    11.0 &      118.0 \\
[0.4, 0.5)  &   5.0 &    10.0 &       12.0 \\
[0.5, 0.6)  &   5.0 &     6.0 &       11.0 \\
[0.6, 0.7)  &   5.0 &     5.0 &        6.0 \\
[0.7, 0.8)  &   0.0 &     0.0 &        0.0 \\
[0.8, 0.9)  &   0

In [30]:
# if not 'random=True' is passed, the PBM model is used
probabilistic_table = calculate_significance(probabilistic_interleaving, probabilistic_table, n_simulations=10**4)

processing 0/688
processing 100/688
processing 200/688
processing 300/688
processing 400/688
processing 500/688
processing 600/688


  import sys


In [31]:
print(probabilistic_table)
print(probabilistic_table.to_latex())

              min  median       max
[0.05, 0.1)  22.0    78.0   25962.0
[0.1, 0.2)   20.0    38.5     493.0
[0.2, 0.3)   11.0    21.0  742296.0
[0.3, 0.4)    8.0    13.0     119.0
[0.4, 0.5)    7.0    10.5      17.0
[0.5, 0.6)    6.0     7.0      12.0
[0.6, 0.7)    6.0     6.0       6.0
[0.7, 0.8)    0.0     0.0       0.0
[0.8, 0.9)    0.0     0.0       0.0
[0.9, 0.95)   0.0     0.0       0.0
\begin{tabular}{lrrr}
\toprule
{} &   min &  median &       max \\
\midrule
[0.05, 0.1) &  22.0 &    78.0 &   25962.0 \\
[0.1, 0.2)  &  20.0 &    38.5 &     493.0 \\
[0.2, 0.3)  &  11.0 &    21.0 &  742296.0 \\
[0.3, 0.4)  &   8.0 &    13.0 &     119.0 \\
[0.4, 0.5)  &   7.0 &    10.5 &      17.0 \\
[0.5, 0.6)  &   6.0 &     7.0 &      12.0 \\
[0.6, 0.7)  &   6.0 &     6.0 &       6.0 \\
[0.7, 0.8)  &   0.0 &     0.0 &       0.0 \\
[0.8, 0.9)  &   0.0 &     0.0 &       0.0 \\
[0.9, 0.95) &   0.0 &     0.0 &       0.0 \\
\bottomrule
\end{tabular}



## Analysis

#### Results

##### Team-Draft Interleaving, Random click model


| $$\Delta ERR@3$$ 	|      min 	|   median 	|         max 	|
|------------------	|---------:	|---------:	|------------:	|
| [0.05, 0.1)      	|  17176.0 	| 125270.5 	|  16893215.0 	|
| [0.1, 0.2)       	|   5744.0 	|  53493.5 	|  29914894.0 	|
| [0.2, 0.3)       	|   4880.0 	| 136251.0 	|  67524768.0 	|
| [0.3, 0.4)       	|  10140.0 	| 113657.5 	|   7348578.0 	|
| [0.4, 0.5)       	|  15120.0 	| 524383.0 	|   4202279.0 	|
| [0.5, 0.6)       	|  12508.0 	| 151049.0 	| 270580243.0 	|
| [0.6, 0.7)       	| 100140.0 	| 168528.5 	|    423732.0 	|
| [0.7, 0.8)       	|      0.0 	|      0.0 	|         0.0 	|
| [0.8, 0.9)       	|      0.0 	|      0.0 	|         0.0 	|
| [0.9, 0.95)      	|      0.0 	|      0.0 	|         0.0 	|

##### Probabilistic Interleaving, Random click model

| $$\Delta ERR@3$$ 	|      min 	|   median 	|         max 	|
|------------------	|---------:	|---------:	|------------:	|
| [0.05, 0.1)      	|  16788.0 	| 100101.5 	|   5296479.0 	|
| [0.1, 0.2)       	|  12610.0 	| 336289.5 	|  64669924.0 	|
| [0.2, 0.3)       	|   4790.0 	| 122447.5 	| 261227563.0 	|
| [0.3, 0.4)       	|   6561.0 	| 137252.0 	|  64481794.0 	|
| [0.4, 0.5)       	|   9745.0 	| 212379.0 	|  29426062.0 	|
| [0.5, 0.6)       	|  16910.0 	| 170378.0 	| 265597112.0 	|
| [0.6, 0.7)       	| 157769.0 	| 173208.5 	|    188648.0 	|
| [0.7, 0.8)       	|      0.0 	|      0.0 	|         0.0 	|
| [0.8, 0.9)       	|      0.0 	|      0.0 	|         0.0 	|
| [0.9, 0.95)      	|      0.0 	|      0.0 	|         0.0 	|



##### Team-Draft Interleaving, PBM

| $$\Delta ERR@3$$  	|  min 	| median 	|       max 	|
|-------------	|-----:	|-------:	|----------:	|
| [0.05, 0.1) 	| 26.0 	|   97.0 	| 8007956.0 	|
| [0.1, 0.2)  	| 11.0 	|   92.5 	| 9567947.0 	|
| [0.2, 0.3)  	|  6.0 	|   12.0 	|    2263.0 	|
| [0.3, 0.4)  	|  5.0 	|   11.0 	|     118.0 	|
| [0.4, 0.5)  	|  5.0 	|   10.0 	|      12.0 	|
| [0.5, 0.6)  	|  5.0 	|    6.0 	|      11.0 	|
| [0.6, 0.7)  	|  5.0 	|    5.0 	|       6.0 	|
| [0.7, 0.8)  	|  0.0 	|    0.0 	|       0.0 	|
| [0.8, 0.9)  	|  0.0 	|    0.0 	|       0.0 	|
| [0.9, 0.95) 	|  0.0 	|    0.0 	|       0.0 	|



##### Probabilistic Interleaving, PBM

|  $$\Delta ERR@3 $$ 	|  min 	| median 	|      max 	|
|---------------------------------	|-----:	|-------:	|---------:	|
| [0.05, 0.1)                     	| 22.0 	|   78.0 	|  25962.0 	|
| [0.1, 0.2)                      	| 20.0 	|   38.5 	|    493.0 	|
| [0.2, 0.3)                      	| 11.0 	|   21.0 	| 742296.0 	|
| [0.3, 0.4)                      	|  8.0 	|   13.0 	|    119.0 	|
| [0.4, 0.5)                      	|  7.0 	|   10.5 	|     17.0 	|
| [0.5, 0.6)                      	|  6.0 	|    7.0 	|     12.0 	|
| [0.6, 0.7)                      	|  6.0 	|    6.0 	|      6.0 	|
| [0.7, 0.8)                      	|  0.0 	|    0.0 	|      0.0 	|
| [0.8, 0.9)                      	|  0.0 	|    0.0 	|      0.0 	|
| [0.9, 0.95)                     	|  0.0 	|    0.0 	|      0.0 	|



#### Observations

From the tables we notice that the smaller the difference is in the computed ERR, the more impressions are need for this bin to prove significance for this difference. For bins that have higher margins, we indeed observe an (exponential) decreasing fashion of the impressions needed. Intuitively this makes sense, due to the smaller that this difference is, the more impressions are needed to show significance.


#### Improvements

Out of several possible models for simulation of user click behaviour, in our experiments, we have only implemented two models - i.e. a random click model and a position based model (PBM). The underlying assumption to the latter is that a user may generate several clicks per query, and for a click to occur, several parameters (attractiveness) are to be estimated to compute the probability of the click. Other methods to estimate these parameters, or to even include other parameters that model for additional assumptions about the accuracy of the user behaviour, are perhaps possible improvements to our experimental design. Yet another model, that discounts for documents shown after documents of high relevance, the so called cascaded models, provide yet another assumption about the accuracy of the user behaviour. Considering these cascaded models may also provide an imporivement to our experimental design.


Different metrics are in circulation to compute the quality for, for a given ranking in offline evaluation. Whereas we have now only considered the metric of choice only for a rankings up until a cut-off at the third position, a more realistic approach would be to increase the cut-off position. Whereas this is not done now due to lack of computational and time resources, actually doing so, ensures to model more accurately a real world setting.

We have only considered a binary relevance judgements. By incorporating a finer grained judgement for relevances of documents, we expect to model for more accurate user feedback in assessing the users' opinion on a shown ranking. Hence, this would be an expected improvement in our experimental design.


More advanced statistic test.

