# Data leakage solution - Expedia Kaggle challenge

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline

import time
import pdb

In [13]:
load = time.time()

# Read the Data
train = pd.read_csv("./train.csv")#, nrows = 10000)
target = train['hotel_cluster']
test = pd.read_csv("./test.csv")#, nrows = 1000)

feat_names = test.columns.values

dest = pd.read_csv("./destinations.csv")
print(train.shape)
print(test.shape)
train.head()

num_guesses = 5



grp_feat_list = [['user_location_city', 'orig_destination_distance'],
                 ['srch_destination_id', 'hotel_country', 'hotel_market'],
                 ['srch_destination_id'],
                 ['hotel_country']]
grp_scoring = [[1, 1], # These are the scores for non-bookings vs bookings
               [3, 20],
               [3, 20],
               [1, 6]]

# Create a new score column... makes some things easier later...
train['score'] = 1

start = time.time()

# We need to include the base aggregating case:
import copy
grps = []
#for fl in grp_feat_list:
for i in range(len(grp_feat_list)):
    fl = grp_feat_list[i]
    fl_hc = copy.copy(fl)
    fl_hc.append('hotel_cluster')
    
    # Assign appropriate scores to bookings vs non-bookings
    train.loc[train['is_booking'] == 0, 'score'] = grp_scoring[i][0]
    train.loc[train['is_booking'] == 1, 'score'] = grp_scoring[i][1]
    
    # Sum up the score for each unique combo in the feature_list + hotel_cluster,
    # then sort the values, then sort by the values, and regroup in to feature clusters
    score = train.groupby(fl_hc).sum()['score'].sort_values(ascending=False).sortlevel(level=range(len(fl)), sort_remaining=False)
    
    # Now the rest of the magic: group and get the top 5 scores for each combo,
    # then groupby and make that in to a list (that's why there's two groupby calls...)
    # NB the .apply(list) call takes the majority of the computing time, it's a python inner loop...
    #top_scoring = score.reset_index().groupby(fl_hc).head(num_guesses).groupby(fl)['hotel_cluster'].apply(list)
    
    # oh wait nevermind, top 5 (using groupby(...).head(num_guesses)) doesn't work... just list:
    top_scoring = score.reset_index().groupby(fl)['hotel_cluster'].apply(list)
   
    grps.append(top_scoring.to_dict())
    
    print("grouped " + str(fl))

# Now need to just do the top 5 hotel_clusters...

# Assign appropriate scores to bookings vs non-bookings
train.loc[train['is_booking'] == 0, 'score'] = 1
train.loc[train['is_booking'] == 1, 'score'] = 1

top_h_c = train.groupby(['hotel_cluster'])['score'].sum().sort_values(ascending=False).index

mid = time.time()
# Now the tricky bit to do fast: selecting the appropriate combinations of predictors and hotel_clusters...
#
# First a slow row by row solution
#

# This is all the guesses for the test data set
guesses = np.zeros([test.shape[0], num_guesses], dtype=np.int32)

for r_i, r in test.iterrows():
    num_filled = 0
    
    # Go through each group combo to see if it exists in the training set
    for i in range(len(grp_feat_list)):

        # This is a tuple of the relevant features, e.g.
        # (user_location_city, orig_destination_distance) = (3, 5539.06)
        # THIS IS A TUPLE to be able to select using the .loc() 
        # function of the grouped data frames.
        
        chooser = tuple(r[grp_feat_list[i]].values)
        try:
            candidates = grps[i][chooser]#.sort_values(ascending = False)
            
            # Remove duplicates...
            
            # Select only as many as we can fill, and at most 5 (YES, we do it this
            # way around and potentially exhaust the candidates when we look at uniqueness)
            fillable = min(len(candidates), num_guesses-num_filled)
            
            guesses[r_i, num_filled:num_filled+fillable] = candidates[:fillable]
            num_filled = fillable + num_filled
            
            if num_filled == num_guesses:
                break
            
        except KeyError:
            # We are here because the "chooser" tuple was not found in the combo from 
            # the training set, so we merely go on to the next tuple of features in grp_feat_list
            pass

    # Now we fill in any remaining guesses with just the top guesses:
    fillable = min(len(top_h_c), num_guesses-num_filled)
    guesses[r_i, num_filled:num_filled+fillable] = top_h_c[:fillable]
    
print("That took " + str(time.time() - load) + " s:\n" + str(start - load) + " s for load,\n" + str(mid-start) \
      + "s for dictionary build part,\n" + str(time.time() - mid) + " for applying to test set ")
guesses

(37670293, 24)
(2528243, 22)
grouped ['user_location_city', 'orig_destination_distance']
grouped ['srch_destination_id', 'hotel_country', 'hotel_market']
grouped ['srch_destination_id']
grouped ['hotel_country']
That took 3082.7677631378174 s:
173.98987698554993 s for load,
874.5405080318451s for dictionary build part,
2034.2373881340027 for applying to test set 


array([[ 5, 37, 55, 11, 22],
       [ 5, 91, 41, 48, 64],
       [91,  0, 31, 96, 91],
       ..., 
       [54,  1, 45, 79, 24],
       [50, 47, 43, 15, 32],
       [12, 36, 81, 57, 62]], dtype=int32)

In [14]:
guesses[:10,:]

array([[ 5, 37, 55, 11, 22],
       [ 5, 91, 41, 48, 64],
       [91,  0, 31, 96, 91],
       [ 1,  1, 45, 79, 24],
       [50, 51, 91,  2, 42],
       [91, 42, 28, 95, 48],
       [95, 21,  2, 33, 98],
       [95, 91, 18, 98, 68],
       [88,  1, 45, 79, 24],
       [55, 32, 10, 34, 50]], dtype=int32)

In [None]:
# Now the tricky bit to do fast: selecting the appropriate combinations of predictors and hotel_clusters...
#
# First a slow row by row solution
#
import time

# This is all the guesses for the test data set
guesses = np.zeros([test.shape[0], num_guesses], dtype=np.int32)

t = time.time()

for r_i, r in test.iterrows():
    if r_i % 100 == 0:
        print("row: " + str(r_i) + " time: " + str(time.time() - t))
        t = time.time()
    num_filled = 0
    
    # Go through each group combo to see if it exists in the training set
    for i in range(len(grp_feat_list)):

        # This is a tuple of the relevant features, e.g.
        # (user_location_city, orig_destination_distance) = (3, 5539.06)
        # THIS IS A TUPLE to be able to select using the .loc() 
        # function of the grouped data frames.
        
        chooser = tuple(r[grp_feat_list[i]].values)
        try:
            # Selects the scores matching the features of our current row
            # Also sorts by search score. THIS IS THE SLOWEST PART OF THE ROUTINE
            # PROBABLY DUE TO SORT...
            if chooser: # True if the chooser exists
                candidates = grps[i][chooser]#.sort_values(ascending = False)
            else:
                candidates = grps[i]#.sort_values(ascending = False)
            
            # This gets the top 5 (max) hotel_cluster candidates from the sorted series
            h_c_candidates = candidates[:num_guesses].index.get_level_values('hotel_cluster')
            
            # Remove duplicates (for some reason can't to inplace=True)
            h_c_candidates = h_c_candidates.drop(guesses[r_i, :num_filled], errors='ignore')
            
            # Select only as many as we can fill, and at most 5 (YES, we do it this
            # way around and potentially exhaust the candidates when we look at uniqueness)
            fillable = min(len(h_c_candidates), num_guesses-num_filled)
            
            guesses[r_i, num_filled:num_filled+fillable] = h_c_candidates[:fillable].values
            num_filled = fillable + num_filled
            
            if num_filled == num_guesses:
                break
            
        except KeyError:
            # We are here because the "chooser" tuple was not found in the combo from 
            # the training set, so we merely go on to the next tuple of features in grp_feat_list
            pass

    if num_filled < num_guesses:
        # This only occures if there's an error..
        print(r_i)
        pdb.set_trace()

row: 0 time: 0.0010449886322021484
0
> <ipython-input-4-8bb3f5024fad>(12)<module>()
-> for r_i, r in test.iterrows():
(Pdb) candidates
*** NameError: name 'candidates' is not defined
(Pdb) chooser
(204,)
(Pdb) p i
3
(Pdb) h_c_candidates
*** NameError: name 'h_c_candidates' is not defined
(Pdb) 1
1
(Pdb) q


In [39]:
guesses

# Now some half baked attempt at a "fast" solution?


array([[ 5, 37, 55, 11, 22],
       [ 5, 25, 64, 11,  8],
       [91,  0, 31, 96, 77],
       ..., 
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0]], dtype=int32)

In [124]:
grp = train.groupby(['user_location_city', 'orig_destination_distance']).count()

print(train[(train['user_location_city'] == grp.index[1][0]).values & (train['orig_destination_distance'] == grp.index[1][1]).values])
print(grp.iloc[1])
#train['user_location_city'] == grp.index[1][0]
#print(train['orig_destination_distance'] == grp.index[1][1])
#train[train['user_location_city'] == 3]

                date_time  site_name  posa_continent  user_location_country  \
8279  2014-10-02 13:28:49         10               0                    182   
8281  2014-10-24 19:35:53         10               0                    182   

      user_location_region  user_location_city  orig_destination_distance  \
8279                   199                   3                   577.1299   
8281                   199                   3                   577.1299   

      user_id  is_mobile  is_package      ...        srch_children_cnt  \
8279    49731          0           0      ...                        0   
8281    49731          0           0      ...                        0   

     srch_rm_cnt srch_destination_id  srch_destination_type_id  is_booking  \
8279           1               41452                         1           0   
8281           1               41452                         1           0   

      cnt  hotel_continent  hotel_country  hotel_market  hotel_cluster  

$\int_{-\infty}^{\infty} f(x) \, \mathrm{d}x$