In [2]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from decimal import Decimal

# Path of the tab-separated log file that the training cells load with pd.read_csv
training_file = "YandexRelPredChallenge.txt"

In [17]:
# Read data into dataframe
columns = ["SessionID", "TimePassed", "TypeOfAction", "TargetID", "RegionID", 1,2,3,4,5,6,7,8,9,10]
df = pd.read_csv(training_file, sep='\t', header=None, names=columns)

# Flatten the log ONCE into (rank, document_id, query_id, clicked) observations.
# This only depends on df, not on the current parameter estimates, so doing it
# inside the EM loop would repeat the expensive groupby/iterrows pass every iteration.
observations = []
for session_id, session_df in df.groupby("SessionID"):

  # Extract session clicks (set for O(1) membership tests)
  session_clicks = set(session_df[session_df["TypeOfAction"] == "C"]["TargetID"].tolist())

  # Iterate session queries
  query_df = session_df[session_df["TypeOfAction"] == "Q"]
  last_query = len(query_df) - 1
  for j, (index, row) in enumerate(query_df.iterrows()):
    query_id = row["TargetID"]
    for rank in range(1, 11): # from rank 1 to 10
      document_id = row[rank]
      # A result counts as clicked only when it belongs to the last query of the
      # session and its id appears among the session's click actions
      clicked = j == last_query and document_id in session_clicks
      observations.append((rank, document_id, query_id, clicked))

# Parameters: gamma is indexed by rank, alpha by [document_id][query_id]; both start at 0.
# NOTE(review): without smoothing, a pair that is never clicked keeps alpha == 0 forever,
# because the non-click update below evaluates to 0 whenever alpha is 0.
gamma = defaultdict(float)
alpha = defaultdict(lambda: defaultdict(float))

print("Training with EM...")

for i in range(50):

  # Initialise sums and counts
  gamma_count = defaultdict(float)
  alpha_count = defaultdict(lambda: defaultdict(float))

  gamma_sum = defaultdict(float)
  alpha_sum = defaultdict(lambda: defaultdict(float))

  # Accumulate the per-observation contributions under the current parameters
  for rank, document_id, query_id, clicked in observations:
    if clicked:
      gamma_value = alpha_value = 1
    else:
      rank_param = gamma[rank]
      pair_param = alpha[document_id][query_id]
      no_click = 1 - rank_param * pair_param
      gamma_value = (rank_param * (1 - pair_param)) / no_click
      alpha_value = ((1 - rank_param) * pair_param) / no_click

    gamma_sum[rank] += gamma_value
    alpha_sum[document_id][query_id] += alpha_value

    gamma_count[rank] += 1
    alpha_count[document_id][query_id] += 1

  # Update variables.
  # Iterate over the accumulated sums rather than over gamma/alpha themselves: the
  # parameter dicts only contain keys that were read in the non-click branch, so a
  # pair that was only ever observed as clicked would otherwise never be updated.
  for rank, total in gamma_sum.items():
    gamma[rank] = total / gamma_count[rank]

  for document_id, query_sums in alpha_sum.items():
    for query_id, total in query_sums.items():
      alpha[document_id][query_id] = total / alpha_count[document_id][query_id]

  print("Completed EM iteration", i)
  print(gamma, "\n")

Training with EM...
Completed EM iteration 0
defaultdict(<class 'float'>, {1: 0.1802729063115446, 2: 0.06874238019319141, 3: 0.04958735815436556, 4: 0.039083747538216265, 5: 0.028861483634999532, 6: 0.024500609584544687, 7: 0.021288567945231172, 8: 0.02025696333114508, 9: 0.01835787301885023, 10: 0.018850229766482228}) 

Completed EM iteration 1
defaultdict(<class 'float'>, {1: 0.3169490291467086, 2: 0.13062185971984377, 3: 0.0956184406357258, 4: 0.0759286782433521, 5: 0.05650482343611251, 6: 0.04810963236990753, 7: 0.04191632670807823, 8: 0.039906431674339965, 9: 0.03622098248165567, 10: 0.03717434307268702}) 

Completed EM iteration 2
defaultdict(<class 'float'>, {1: 0.41179848233975536, 2: 0.18361180772688757, 3: 0.13683613974670636, 4: 0.10962768101092404, 5: 0.08239864602579823, 6: 0.0704070354633887, 7: 0.06157419310652035, 8: 0.05865440329751951, 9: 0.05334857072480224, 10: 0.05471338215298999}) 

Completed EM iteration 3
defaultdict(<class 'float'>, {1: 0.47639312798774713, 2: 

In [18]:
# Read data into dataframe
columns = ["SessionID", "TimePassed", "TypeOfAction", "TargetID", "RegionID", 1,2,3,4,5,6,7,8,9,10]
df = pd.read_csv(training_file, sep='\t', header=None, names=columns)

# Flatten the log ONCE into (rank, document_id, query_id, clicked) observations.
# This only depends on df, not on the current parameter estimates, so doing it
# inside the EM loop would repeat the expensive groupby/iterrows pass every iteration.
observations = []
for session_id, session_df in df.groupby("SessionID"):

  # Extract session clicks (set for O(1) membership tests)
  session_clicks = set(session_df[session_df["TypeOfAction"] == "C"]["TargetID"].tolist())

  # Iterate session queries
  query_df = session_df[session_df["TypeOfAction"] == "Q"]
  last_query = len(query_df) - 1
  for j, (index, row) in enumerate(query_df.iterrows()):
    query_id = row["TargetID"]
    for rank in range(1, 11): # from rank 1 to 10
      document_id = row[rank]
      # A result counts as clicked only when it belongs to the last query of the
      # session and its id appears among the session's click actions
      clicked = j == last_query and document_id in session_clicks
      observations.append((rank, document_id, query_id, clicked))

# Smoothing prior for alpha: every (document, query) pair starts each iteration
# with a sum of 1 over a count of 2, i.e. a pseudo-estimate of 0.5
alpha_prior_sum = 1
alpha_prior_count = 2

# Parameters: gamma is indexed by rank, alpha by [document_id][query_id]; both start at 0
gamma = defaultdict(float)
alpha = defaultdict(lambda: defaultdict(float))

print("Training with EM...")

for i in range(50):

  # Initialise sums and counts (alpha accumulators start from the prior)
  gamma_count = defaultdict(float)
  alpha_count = defaultdict(lambda: defaultdict(lambda: alpha_prior_count))

  gamma_sum = defaultdict(float)
  alpha_sum = defaultdict(lambda: defaultdict(lambda: alpha_prior_sum))

  # Accumulate the per-observation contributions under the current parameters
  for rank, document_id, query_id, clicked in observations:
    if clicked:
      gamma_value = alpha_value = 1
    else:
      rank_param = gamma[rank]
      pair_param = alpha[document_id][query_id]
      no_click = 1 - rank_param * pair_param
      gamma_value = (rank_param * (1 - pair_param)) / no_click
      alpha_value = ((1 - rank_param) * pair_param) / no_click

    gamma_sum[rank] += gamma_value
    alpha_sum[document_id][query_id] += alpha_value

    gamma_count[rank] += 1
    alpha_count[document_id][query_id] += 1

  # Update variables.
  # Iterate over the accumulated sums rather than over gamma/alpha themselves: the
  # parameter dicts only contain keys that were read in the non-click branch, so a
  # pair that was only ever observed as clicked would otherwise never be updated.
  for rank, total in gamma_sum.items():
    gamma[rank] = total / gamma_count[rank]

  for document_id, query_sums in alpha_sum.items():
    for query_id, total in query_sums.items():
      alpha[document_id][query_id] = total / alpha_count[document_id][query_id]

  print("Completed EM iteration", i)
  print(gamma, "\n")

Training with EM...
Completed EM iteration 0
defaultdict(<class 'float'>, {1: 0.1802729063115446, 2: 0.06874238019319141, 3: 0.04958735815436556, 4: 0.039083747538216265, 5: 0.028861483634999532, 6: 0.024500609584544687, 7: 0.021288567945231172, 8: 0.02025696333114508, 9: 0.01835787301885023, 10: 0.018850229766482228}) 

Completed EM iteration 1
defaultdict(<class 'float'>, {1: 0.2879324961149324, 2: 0.11751237609570969, 3: 0.08576906679999548, 4: 0.06801139584469315, 5: 0.050532927588336524, 6: 0.04298010947813839, 7: 0.037424271858497664, 8: 0.035612688000053155, 9: 0.03230370187059883, 10: 0.03313803975360679}) 

Completed EM iteration 2
defaultdict(<class 'float'>, {1: 0.333994011207157, 2: 0.1423937614149068, 3: 0.1049425252313509, 4: 0.08361558255590253, 5: 0.06246604120999655, 6: 0.05318697802794656, 7: 0.046411648372549984, 8: 0.04415095674230409, 9: 0.040081412741837234, 10: 0.041062945569561436}) 

Completed EM iteration 3
defaultdict(<class 'float'>, {1: 0.35283328323721824,

In [19]:
# Read data into dataframe
columns = ["SessionID", "TimePassed", "TypeOfAction", "TargetID", "RegionID", 1,2,3,4,5,6,7,8,9,10]
df = pd.read_csv(training_file, sep='\t', header=None, names=columns)

# Flatten the log ONCE into (rank, document_id, query_id, clicked) observations.
# This only depends on df, not on the current parameter estimates, so doing it
# inside the EM loop would repeat the expensive groupby/iterrows pass every iteration.
observations = []
for session_id, session_df in df.groupby("SessionID"):

  # Extract session clicks (set for O(1) membership tests)
  session_clicks = set(session_df[session_df["TypeOfAction"] == "C"]["TargetID"].tolist())

  # Iterate session queries
  query_df = session_df[session_df["TypeOfAction"] == "Q"]
  last_query = len(query_df) - 1
  for j, (index, row) in enumerate(query_df.iterrows()):
    query_id = row["TargetID"]
    for rank in range(1, 11): # from rank 1 to 10
      document_id = row[rank]
      # A result counts as clicked only when it belongs to the last query of the
      # session and its id appears among the session's click actions
      clicked = j == last_query and document_id in session_clicks
      observations.append((rank, document_id, query_id, clicked))

# Smoothing prior for alpha: every (document, query) pair starts each iteration
# with a sum of 1 over a count of 10, i.e. a pseudo-estimate of 0.1
alpha_prior_sum = 1
alpha_prior_count = 10

# Parameters: gamma is indexed by rank, alpha by [document_id][query_id]; both start at 0
gamma = defaultdict(float)
alpha = defaultdict(lambda: defaultdict(float))

print("Training with EM...")

for i in range(50):

  # Initialise sums and counts (alpha accumulators start from the prior)
  gamma_count = defaultdict(float)
  alpha_count = defaultdict(lambda: defaultdict(lambda: alpha_prior_count))

  gamma_sum = defaultdict(float)
  alpha_sum = defaultdict(lambda: defaultdict(lambda: alpha_prior_sum))

  # Accumulate the per-observation contributions under the current parameters
  for rank, document_id, query_id, clicked in observations:
    if clicked:
      gamma_value = alpha_value = 1
    else:
      rank_param = gamma[rank]
      pair_param = alpha[document_id][query_id]
      no_click = 1 - rank_param * pair_param
      gamma_value = (rank_param * (1 - pair_param)) / no_click
      alpha_value = ((1 - rank_param) * pair_param) / no_click

    gamma_sum[rank] += gamma_value
    alpha_sum[document_id][query_id] += alpha_value

    gamma_count[rank] += 1
    alpha_count[document_id][query_id] += 1

  # Update variables.
  # Iterate over the accumulated sums rather than over gamma/alpha themselves: the
  # parameter dicts only contain keys that were read in the non-click branch, so a
  # pair that was only ever observed as clicked would otherwise never be updated.
  for rank, total in gamma_sum.items():
    gamma[rank] = total / gamma_count[rank]

  for document_id, query_sums in alpha_sum.items():
    for query_id, total in query_sums.items():
      alpha[document_id][query_id] = total / alpha_count[document_id][query_id]

  print("Completed EM iteration", i)
  print(gamma, "\n")

Training with EM...
Completed EM iteration 0
defaultdict(<class 'float'>, {1: 0.1802729063115446, 2: 0.06874238019319141, 3: 0.04958735815436556, 4: 0.039083747538216265, 5: 0.028861483634999532, 6: 0.024500609584544687, 7: 0.021288567945231172, 8: 0.02025696333114508, 9: 0.01835787301885023, 10: 0.018850229766482228}) 

Completed EM iteration 1
defaultdict(<class 'float'>, {1: 0.3113377777661081, 2: 0.12723017163675693, 3: 0.09292296356208396, 4: 0.07372174670739996, 5: 0.05479333831220556, 6: 0.046624969201277844, 7: 0.040603322473462856, 8: 0.038652116939068004, 9: 0.03507054830947158, 10: 0.035991558325094056}) 

Completed EM iteration 2
defaultdict(<class 'float'>, {1: 0.4027828947733362, 2: 0.1748319687546444, 3: 0.12931933050665018, 4: 0.10329601755061286, 5: 0.07730598291210417, 6: 0.06593363519272816, 7: 0.05757070205391697, 8: 0.054825105870099755, 9: 0.04981248278356237, 10: 0.051088235209981236}) 

Completed EM iteration 3
defaultdict(<class 'float'>, {1: 0.4691612598991045

In [None]:
# NOTE(review): `model` is not defined in any cell visible in this notebook, and this
# cell has no execution count - it will presumably raise NameError on a fresh
# Restart & Run All unless `model` is created elsewhere. TODO confirm or remove.
model.click_prob(0.1, 1, 0)

In [8]:
# Show the alpha estimates; the training cells index them as alpha[document_id][query_id].
# NOTE(review): this dumps the whole nested dict, and the cell's execution count (8) is
# lower than the training cells above, so the output may come from an earlier run.
alpha

defaultdict(<function __main__.<lambda>()>,
            {7.0: defaultdict(float, {8: 0.4475055145766243}),
             103.0: defaultdict(float, {8: 0.47877592789031514}),
             51.0: defaultdict(float, {8: 0.4843442597605541}),
             92.0: defaultdict(float, {8: 0.4874767190697134}),
             43.0: defaultdict(float, {8: 0.4905214046971167}),
             12.0: defaultdict(float, {8: 0.4918644796905163}),
             73.0: defaultdict(float, {8: 0.4928195325239988}),
             69.0: defaultdict(float, {8: 0.4931479344509808}),
             27.0: defaultdict(float, {8: 0.4937229835462545}),
             105.0: defaultdict(float, {8: 0.49359343377546283}),
             1625.0: defaultdict(float,
                         {174: 0.3660015002719183,
                          29382: 0.4843442597605541,
                          11009: 0.4905214046971167,
                          17231: 0.47877592789031514}),
             1627.0: defaultdict(float,
                    