# Data exploration - Expedia Kaggle challenge

In [1]:
# imports
import copy
import pdb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing

%matplotlib inline

In [33]:
# Load the competition CSVs from the working directory.
# (Pass e.g. `nrows=10000` / `nrows=1000` to the train / test reads for a
# quick run on a subset.)
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
dest = pd.read_csv("./destinations.csv")

# The column we are asked to predict, taken from the training frame
target = train['hotel_cluster']

# Column names available in the test frame
feat_names = test.columns.values

# Report the dimensions of both frames, then preview the training data
for frame in (train, test):
    print(frame.shape)
train.head()

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12,2,3,66,348,48862,2234.2641,12,0,1,...,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33,2,3,66,348,48862,2234.2641,12,0,0,...,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16,2,3,66,442,35390,913.1932,93,0,0,...,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18,2,3,66,442,35390,913.6259,93,0,0,...,0,1,14984,1,0,1,2,50,1457,21


In [34]:
# Each entry pairs a set of grouping features with the scores given to a
# matching training row: [score for non-bookings, score for bookings].
# The final entry has no features at all, i.e. the base aggregating case.
_grp_spec = [
    (['user_location_city', 'orig_destination_distance'], [1, 1]),
    (['srch_destination_id', 'hotel_country', 'hotel_market'], [3, 20]),
    (['srch_destination_id'], [3, 20]),
    (['hotel_country'], [6, 1]),
    ([], [1, 1]),
]

# Split the spec into the two parallel lists used by the rest of the notebook
grp_feat_list = [list(feats) for feats, _ in _grp_spec]
grp_scoring = [list(weights) for _, weights in _grp_spec]

# Create a new score column... makes some things easier later...
train['score'] = 1

# The feature sets and their scores are parallel lists; refuse to continue if
# they have drifted apart instead of silently mis-pairing or truncating them.
if len(grp_feat_list) != len(grp_scoring):
    raise ValueError(
        'grp_feat_list and grp_scoring must have the same length '
        '(got {} and {})'.format(len(grp_feat_list), len(grp_scoring)))

# For every feature set (including the empty one, which is the base
# aggregating case) build a Series of summed scores indexed by
# (*features, hotel_cluster). `grps[i]` corresponds to `grp_feat_list[i]`.
grps = []
for feats, (non_booking_score, booking_score) in zip(grp_feat_list, grp_scoring):
    # Concatenation builds a fresh list, so grp_feat_list itself is not modified
    group_keys = feats + ['hotel_cluster']

    # Assign appropriate scores to bookings vs non-bookings.
    # NOTE: this overwrites train['score'] on every pass; after the loop the
    # column holds the scores of the last feature set only.
    train.loc[train['is_booking'] == 0, 'score'] = non_booking_score
    train.loc[train['is_booking'] == 1, 'score'] = booking_score

    # Sum the scores of all training rows sharing the same key combination
    grps.append(train.groupby(group_keys)['score'].sum())
    

In [32]:
# Now the tricky bit to do fast: selecting the appropriate combinations of
# predictors and hotel_clusters...
#
# First a slow row-by-row solution.
#

# Number of hotel_cluster guesses produced for every test row
num_guesses = 5
# All the guesses for the test data set: one row per test row. The zeros are
# only placeholders; just the first `num_filled` entries of a row are real.
guesses = np.zeros([test.shape[0], num_guesses], dtype=np.int32)

# A group with an empty feature list does not depend on the test row at all,
# so its ranking is identical for every row: sort it once here instead of once
# per test row.
row_independent_top = {
    grp_i: grps[grp_i].sort_values(ascending=False).iloc[:num_guesses]
    for grp_i, feats in enumerate(grp_feat_list)
    if not feats
}

# `guesses` is a plain array and must be indexed by row *position*. The label
# yielded by iterrows() equals the position only for a default RangeIndex, so
# take the position from enumerate() instead.
for row_pos, (row_label, row) in enumerate(test.iterrows()):

    num_filled = 0

    # Go through each group combo to see if it exists in the training set
    for grp_i, feats in enumerate(grp_feat_list):

        if feats:
            # This is a tuple of the relevant features, e.g.
            # (user_location_city, orig_destination_distance) = (3, 5539.06)
            # It has to be a tuple to select from the grouped Series.
            chooser = tuple(row[feats].values)
            try:
                # Scores matching the features of the current row
                matched = grps[grp_i][chooser]
            except KeyError:
                # The "chooser" tuple was not found in this combo of the
                # training set, so go on to the next set of features.
                # Only the lookup is guarded, so that an unrelated KeyError
                # further down is not silently swallowed.
                continue
            # Best scores first, keep the top `num_guesses` (by position)
            candidates = matched.sort_values(ascending=False).iloc[:num_guesses]
        else:
            candidates = row_independent_top[grp_i]

        # hotel_cluster labels of the retained candidates
        h_c_candidates = candidates.index.get_level_values('hotel_cluster')

        # Remove clusters already guessed for this row (drop() returns a new
        # index; there is no in-place variant)
        h_c_candidates = h_c_candidates.drop(guesses[row_pos, :num_filled],
                                             errors='ignore')

        # Select only as many as we can fill, and at most num_guesses (YES, we
        # take the top candidates first and de-duplicate afterwards, which can
        # exhaust the candidates of this group)
        fillable = min(len(h_c_candidates), num_guesses - num_filled)

        guesses[row_pos, num_filled:num_filled + fillable] = h_c_candidates[:fillable].values
        num_filled += fillable

        if num_filled == num_guesses:
            break

    if num_filled < num_guesses:
        # This only occurs if there's an error: no group combination could
        # supply enough distinct clusters. Fail loudly rather than dropping
        # into an interactive debugger, which hangs a non-interactive run.
        raise RuntimeError(
            'Only {} of {} guesses could be filled for test row {!r}'.format(
                num_filled, num_guesses, row_label))

In [21]:
# Inspect the guess matrix filled in by the row-by-row loop.
guesses

# Now some half-baked attempt at a "fast" solution


array([[25, 64, 41,  5, 58],
       [25, 64, 41,  5, 58],
       [21, 91, 84, 59, 48],
       ..., 
       [25, 64, 41,  5, 58],
       [83, 70, 48, 16,  0],
       [82, 76, 67, 60, 81]], dtype=int32)

In [124]:
# Sanity check of the grouping: count the training rows for every
# (user_location_city, orig_destination_distance) combination ...
grp = train.groupby(['user_location_city', 'orig_destination_distance']).count()

# ... then take the second combination of the grouped index and select the
# matching rows directly from the training frame.
sample_city, sample_distance = grp.index[1]
city_match = (train['user_location_city'] == sample_city).values
distance_match = (train['orig_destination_distance'] == sample_distance).values

# The number of rows printed first should agree with the counts printed second
print(train[city_match & distance_match])
print(grp.iloc[1])

                date_time  site_name  posa_continent  user_location_country  \
8279  2014-10-02 13:28:49         10               0                    182   
8281  2014-10-24 19:35:53         10               0                    182   

      user_location_region  user_location_city  orig_destination_distance  \
8279                   199                   3                   577.1299   
8281                   199                   3                   577.1299   

      user_id  is_mobile  is_package      ...        srch_children_cnt  \
8279    49731          0           0      ...                        0   
8281    49731          0           0      ...                        0   

     srch_rm_cnt srch_destination_id  srch_destination_type_id  is_booking  \
8279           1               41452                         1           0   
8281           1               41452                         1           0   

      cnt  hotel_continent  hotel_country  hotel_market  hotel_cluster  

$\int_{-\infty}^{\infty} f(x) \, \mathrm{d}x$