In [None]:
import numpy as np
import pandas as pd
import itertools
from collections import defaultdict
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from decimal import Decimal
import random

## Global parameters

In [2]:
binary_relevance = [0,1]
cut_off = 3

## Step 1: Simulate Rankings of Relevance for E and P

In [3]:
def identify_docs(docs1, docs2):  
  """
  Take two lists of document relevance scores and generate all possible combinations of overlap
  between the two lists.
  :param docs1: ranking 1 relevance scores
  :param docs2: ranking 2 relevance scores
  :return: list of lists of tuples with all possible overlapping schemes
  """
  
  # Generate all possible document identifiers for irrelevant documents
  zero_ids1 = list(range(docs1.count(0))) 
  if len(zero_ids1) == 0:
    zero_ids2 = [[-1] * docs2.count(0)]  
  else:
    zero_ids2 = list(itertools.permutations(list(range(6)), docs2.count(0)))
    zero_ids2 = sorted([[-1 if x>=docs1.count(0) else x for x in ids] for ids in zero_ids2])
    zero_ids2 = list(ids for ids,_ in itertools.groupby(zero_ids2))  
  
  # Generate all possible document identifiers for relevant documents
  one_ids1 = list(range(docs1.count(1))) 
  if len(one_ids1) == 0:
    one_ids2 = [[-1] * docs2.count(1)]
  else:
    one_ids2 = list(itertools.permutations(list(range(6)), docs2.count(1)))
    one_ids2 = sorted([[-1 if x>=docs1.count(1) else x for x in ids] for ids in one_ids2])
    one_ids2 = list(ids for ids, _ in itertools.groupby(one_ids2))

  # Label the documents of ranking 1
  ranking1 = []
  zero_count = one_count = 0
  for doc in docs1:
    if doc == 0:
      ranking1.append((doc, zero_ids1[zero_count]))
      zero_count += 1
    else:
      ranking1.append((doc, one_ids1[one_count]))
      one_count += 1
      
  # Label the documents of ranking 2
  labelled_rankings = []
  for zero_ids in zero_ids2:
    for one_ids in one_ids2:
      ranking2 = []
      zero_count = one_count = 0
      for doc in docs2:
        if doc == 0:
          ranking2.append((doc, zero_ids[zero_count]))
          zero_count += 1
        else:
          ranking2.append((doc, one_ids[one_count]))
          one_count += 1        
      labelled_rankings.append([ranking1, ranking2])

  return labelled_rankings

In [4]:
system_e = list(map(list, itertools.product(binary_relevance, repeat=cut_off)))
system_p = list(map(list, itertools.product(binary_relevance, repeat=cut_off)))

ranking_pairs = [list(ranking) for ranking in list(itertools.product(system_e, system_p))]
labelled_rankings = [ranking for docs in ranking_pairs for ranking in identify_docs(docs[0], docs[1])]
labelled_rankings

[[[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, -1), (0, -1)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, -1), (0, 0)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, -1), (0, 1)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, -1), (0, 2)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, 0), (0, -1)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, 0), (0, 1)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, 0), (0, 2)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, 1), (0, -1)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, 1), (0, 0)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, 1), (0, 2)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, 2), (0, -1)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, 2), (0, 0)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, -1), (0, 2), (0, 1)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, 0), (0, -1), (0, -1)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, 0), (0, -1), (0, 1)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, 0), (0, -1), (0, 2)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, 0), (0, 1), (0, -1)]],
 [[(0, 0), (0, 1), (0, 2)], [(0, 0), (0

## Defining Expected Reciprocal Rank @ Cut-off (ERR@-)

In [5]:
mapping_relevance_to_probability = lambda pos_g, max_g: ((2**pos_g) - 1 ) / (2**max_g)

In [6]:
def ERR(ranking, mapping=mapping_relevance_to_probability, n=cut_off):
    p, err = 1, 0
    
    for r in range(0, n):
        R = mapping_relevance_to_probability(ranking[r], 1)
        err += (p * (R/(r+1)))
        p *= (1-R)
        
    return err

## Construct the table

In [7]:
table = defaultdict(lambda:list())
for pair in ranking_pairs:
    
    err_e = ERR(pair[0])
    err_p = ERR(pair[1])
    delta_err = err_e - err_p
    if delta_err >= 0: # only consider cases where system e outperforms system p
        table[delta_err].append(pair)
    
table
# table = {k:sum(v) for k,v in table.items()}
    
# table = pd.DataFrame(table, index=[0]).T
# table.reset_index(level=0, inplace=True)

defaultdict(<function __main__.<lambda>>,
            {0.0: [[[0, 0, 0], [0, 0, 0]],
              [[0, 0, 1], [0, 0, 1]],
              [[0, 1, 0], [0, 1, 0]],
              [[0, 1, 1], [0, 1, 1]],
              [[1, 0, 0], [1, 0, 0]],
              [[1, 0, 1], [1, 0, 1]],
              [[1, 1, 0], [1, 1, 0]],
              [[1, 1, 1], [1, 1, 1]]],
             0.04166666666666663: [[[1, 1, 0], [1, 0, 1]],
              [[1, 1, 1], [1, 1, 0]]],
             0.08333333333333326: [[[1, 1, 1], [1, 0, 1]]],
             0.08333333333333331: [[[0, 1, 1], [0, 1, 0]]],
             0.08333333333333334: [[[0, 1, 0], [0, 0, 1]]],
             0.08333333333333337: [[[1, 0, 1], [1, 0, 0]]],
             0.125: [[[1, 1, 0], [1, 0, 0]]],
             0.16666666666666663: [[[1, 1, 1], [1, 0, 0]]],
             0.16666666666666666: [[[0, 0, 1], [0, 0, 0]],
              [[0, 1, 1], [0, 0, 1]]],
             0.16666666666666669: [[[1, 0, 0], [0, 1, 1]]],
             0.25: [[[0, 1, 0], [0, 0, 0]], [[

In [8]:
bins = np.linspace(0.05, 0.95 , 10)
groups = table.groupby(pandas.cut(table['index'], bins))
# groups.count()[0]

AttributeError: 'collections.defaultdict' object has no attribute 'groupby'

In [9]:
def team_draft_interleaving(ranking_a, ranking_b):
    I, team_a, team_b = list(), set(), set()
    

## Step 4: Simulate User Clicks

In [None]:
class PositionBasedModel:

    def __init__(self):
        self.gamma = defaultdict(float)
        self.alpha = defaultdict(lambda: defaultdict(float))

    def train(self, training_file = "YandexRelPredChallenge.txt", iterations=10):

        # Read data into dataframe
        columns = ["SessionID", "TimePassed", "TypeOfAction", "TargetID", "RegionID", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        df = pd.read_csv(training_file, sep='\t', header=None, names=columns)
        print("Training with EM...")

        for i in range(iterations):

            # Initialise sums and counts
            gamma_count = defaultdict(float)
            alpha_count = defaultdict(lambda: defaultdict(lambda: 2)) # This can be changed

            gamma_sum = defaultdict(float)
            alpha_sum = defaultdict(lambda: defaultdict(lambda: 1))

            # Iterate sessions
            grouped = df.groupby("SessionID")
            for session_id, session_df in grouped:

                # Extract session clicks
                session_clicks = session_df[session_df["TypeOfAction"] == "C"]["TargetID"].tolist()

                # Iterate session queries
                query_df = session_df[session_df["TypeOfAction"] == "Q"]
                for j, (index, row) in enumerate(query_df.iterrows()):

                    query_id = row["TargetID"]
                    for rank in range(1, 11):  # from rank 1 to 10
                        document_id = row[rank]

                        # Determine what values should be added to the EM formula sums
                        if j == len(query_df) - 1 and document_id in session_clicks:
                            gamma_value = alpha_value = 1
                        else:
                            gamma_value = (self.gamma[rank] * (1 - self.alpha[document_id][query_id])) / \
                                          (1 - self.gamma[rank] * self.alpha[document_id][query_id])
                            alpha_value = ((1 - self.gamma[rank]) * self.alpha[document_id][query_id]) / \
                                          (1 - self.gamma[rank] * self.alpha[document_id][query_id])

                        gamma_sum[rank] += gamma_value
                        alpha_sum[document_id][query_id] += alpha_value

                        gamma_count[rank] += 1
                        alpha_count[document_id][query_id] += 1

            # Update variables
            for rank, param in self.gamma.items():
                self.gamma[rank] = gamma_sum[rank] / gamma_count[rank]

            for document_id, document_params in self.alpha.items():
                for query_id, param in document_params.items():
                    self.alpha[document_id][query_id] = alpha_sum[document_id][query_id] / alpha_count[document_id][query_id]

            print("Completed iteration", i+1)
        print("Training complete")

    def click_prob(self, epsilon, rank, relevance):
        # Calculate the probability of clicking on a document
        attract = epsilon if relevance == 0 else 1-epsilon
        click_prob = float(self.gamma[rank]) * attract
        return click_prob

    def click_doc(self, epsilon, rank, relevance):
        # Decide whether a document is clicked on
        random_number = random.uniform(0, 1)
        return True if random_number < self.click_prob(epsilon, rank, relevance) else False

In [None]:
model = PositionBasedModel()
model.train(iterations=100)

Training with EM...


In [11]:
model.gamma

defaultdict(float,
            {1: 1.0,
             2: 0.7295799265009462,
             3: 0.5090874362772909,
             4: 0.3914117872033601,
             5: 0.28520356561587273,
             6: 0.23256466901771844,
             7: 0.20238199770630713,
             8: 0.19012437228055004,
             9: 0.17025041344452047,
             10: 0.17370354035567492})

In [None]:
model.click_prob(epsilon=0.1, rank=1, relevance=1)

In [None]:
model.click_prob(epsilon=0.1, rank=10, relevance=0)

In [None]:
model.click_doc(epsilon=0.1, rank=1, relevance=1)

In [None]:
model.click_doc(epsilon=0.1, rank=3, relevance=0)