In [1]:
#Importing the needed packages
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline 

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


import random
import time
import os
import math

# Null Handling
gross_bookings_usd: 97% <br/>
srch_query_affinity_score: 94%  <br/>
orig_destination_distance: 32% # Juul will fix this one <br/>
prop_location_score2: 22% #replaced by mean <br/>
prop_location_score1: 0% <br/>
visitor_hist_adr_usd: 95% <br/>
visitor_hist_starrating: 95% full corr. <br/>


In [2]:
# Load the training and test sets; the relative paths are resolved against the current working directory
expedia_df = pd.read_csv('training_set_VU_DM_2014.csv')
test_df = pd.read_csv('test_set_VU_DM_2014.csv')

In [95]:
# Short alias for the training frame; this binds the same object, no copy is made
d = expedia_df

In [98]:
# Review score and star rating of every row that belongs to property 893
d.loc[d['prop_id'] == 893, ['prop_review_score', 'prop_starrating']]

Unnamed: 0,prop_review_score,prop_starrating
0,3.5,3
6938,3.5,3
12605,3.5,3
21958,3.5,3
30512,3.5,3
30572,3.5,3
32045,3.5,3
37487,3.5,3
43935,3.5,3
44063,3.5,3


In [10]:
def bin_cont(data, variable_name, nbins):
    """Bin a continuous column into ``nbins`` bins and one-hot encode the result.

    data: DataFrame holding the column.
    variable_name: name of the continuous column to bin.
    nbins: number of bins passed to ``pd.cut``.

    The column ``variable_name`` of ``data`` is overwritten in place with its
    bin label ('0' ... str(nbins - 1)). The returned object is the frame
    produced by ``pd.get_dummies``, in which that column is replaced by dummy
    columns named ``<variable_name>_<label>``.
    """
    bin_labels = list(map(str, np.arange(nbins)))
    binned = pd.cut(data[variable_name], nbins, labels=bin_labels)
    # This assignment also changes the frame the caller passed in.
    data[variable_name] = binned
    return pd.get_dummies(data, columns=[variable_name])
    
def normalize(data, variable):
    """Rescale a column to its relative deviation from the column mean.

    Every value ``x`` of ``data[variable]`` is replaced by
    ``(x - mean) / mean``, where ``mean`` is the mean of that column.
    Note that the divisor is the mean, not the standard deviation, so a
    column whose mean is 0 ends up with inf/NaN values.

    data: DataFrame holding the column; it is modified in place.
    variable: name of the column to rescale.
    returns: the same ``data`` object.
    """
    column = data[variable]
    mean = np.mean(column)
    # Vectorised arithmetic instead of a Python-level loop over every row.
    data[variable] = (column - mean) / mean
    return data

def pp_time(data):
    """Parse ``date_time`` and derive ``year`` and ``month`` columns from it.

    data: DataFrame with a ``date_time`` column; it is modified in place.
    returns: the same ``data`` object. ``date_time`` is converted with
        ``pd.to_datetime`` and kept in the frame.
    """
    timestamps = pd.to_datetime(data["date_time"])
    data["date_time"] = timestamps
    data["year"] = timestamps.dt.year
    data["month"] = timestamps.dt.month
    return data

def binning(data):
    """Replace ``price_usd`` by the price interval it falls into.

    The intervals have the edges 0, 100, 175, 250, 500 and 1000, with the
    lowest edge included. ``data`` is modified in place and returned.
    """
    price_edges = [0, 100, 175, 250, 500, 1000]
    price_bins = pd.cut(data['price_usd'], bins=price_edges, include_lowest=True)
    data['price_usd'] = price_bins
    return data

def join_comps(data):
    """Collapse the per-competitor columns into three aggregate columns.

    For the competitors comp1 .. comp8 the frame holds ``compX_rate``,
    ``compX_inv`` and ``compX_rate_percent_diff``. This function

    1. blanks out high-end outliers of every ``compX_rate_percent_diff``
       column, i.e. values above ``median + 1.5 * (90% quantile - median)``
       of that column,
    2. computes ``comp_rate`` and ``comp_inv`` as the row-wise mean of the
       non-null rate / inv values,
    3. computes ``comp_diff`` as the row-wise mean of the non-null products
       ``compX_rate_percent_diff * compX_rate`` (the rate is used as sign),
    4. drops the 24 per-competitor columns.

    A row without any non-null value for an aggregate gets NaN there.

    data: DataFrame containing all 24 competitor columns.
    returns: a new DataFrame; ``data`` itself is left untouched.
    """
    competitors = range(1, 9)
    rate_vars = ['comp{}_rate'.format(i) for i in competitors]
    inv_vars = ['comp{}_inv'.format(i) for i in competitors]
    percent_vars = ['comp{}_rate_percent_diff'.format(i) for i in competitors]
    comp_vars = rate_vars + inv_vars + percent_vars

    # The helpers must be defined before they are called: a nested ``def``
    # is an ordinary local assignment, so calling it earlier raises
    # UnboundLocalError.
    def remove_comp_outliers(percent):
        # Removes outliers from the percent_diff set. Only high-end outliers are removed
        median = percent.median()
        thresh = median + 1.5 * (percent.quantile(0.9) - median)
        # Keep values <= their column threshold; everything else (including
        # values that were already null) becomes NaN.
        return percent.where(percent.le(thresh, axis='columns'))

    def combine_comps(rates, invs, percents):
        # Positional product: competitor k's percent diff times competitor k's rate.
        signed_diff = pd.DataFrame(percents.values * rates.values, index=rates.index)
        # mean(axis=1) skips NaN and yields NaN for rows that are entirely null.
        return rates.mean(axis=1), invs.mean(axis=1), signed_diff.mean(axis=1)

    percents = remove_comp_outliers(data[percent_vars])
    comp_rate, comp_inv, comp_diff = combine_comps(data[rate_vars], data[inv_vars], percents)

    # ``drop`` returns a new frame, so its result has to be kept.
    return data.drop(comp_vars, axis=1).assign(
        comp_rate=comp_rate,
        comp_inv=comp_inv,
        comp_diff=comp_diff,
    )
    
def normalize_mult(data, *variables):
    """Apply ``normalize`` to each of the given columns, one after another.

    data: DataFrame to normalize.
    variables: names of the columns to pass to ``normalize``.
    returns: the frame returned by the last ``normalize`` call, or ``data``
        unchanged when no column names are given.
    """
    result = data
    for column in variables:
        result = normalize(result, column)
    return result

# TODO:
# 1. Julian: Fix price (can be per night or per stay): differs per COUNTRY (either prop or src sure)
# 2. Julian: Do orig_destination_distance & prop_location_score2 (correlation with loc1 and starrating)
# 3. Julian: (after 1) Add price difference over history price_usd / e^prop_log_hist_price
# 4. Normalize prop_starrating * prop_review_score over properties in query
# 5. Normalize prop_starrating * prop_review_score over 
# 6. Replace null values with the median (not sure for which variables); method change_null is not implemented yet
# Maybe: balance training set positive & negative

In [None]:
def preprocess(data):
    """Run the preprocessing steps in order and return the processed frame.

    data: raw DataFrame as loaded from the CSV files.
    returns: the frame returned by the last step.
    """
    data = pp_time(data) # parses date_time (the column is kept) and adds year and month
    data = binning(data) # bins price_usd
    data = join_comps(data) # drops compx vars adds joined comp vars (and outliers)
    # Without this return every caller would receive None.
    return data
    

In [85]:
def create_train_file(data, name, features):
    """Emit ``data`` as LambdaMART training lines following the LEMUR file format.

    Every row becomes one line ``<target> qid:<srch_id> 1:<v1> 2:<v2> ...``.
    The target is ``max(5 * booking_bool, click_bool)``, i.e. 5 for a booked
    row, 1 for a row that was only clicked and 0 otherwise (assuming both
    columns hold 0/1 flags).

    data: dataset with the columns ``srch_id``, ``booking_bool``,
        ``click_bool`` and every column listed in ``features``.
    name: name of the train file. Currently unused: nothing is written to
        disk, each line is printed to stdout instead.
    features: list of feature names; the feature id in the output is the
        1-based position of the name in this list.

    Raises AssertionError when a feature value is not numerical.
    """
    def is_numerical(val):
        # Values read from a DataFrame row are numpy scalars; np.int64 and
        # np.float32 are not plain int/float, so check the numpy
        # abstract types as well instead of comparing ``type(val)``.
        if isinstance(val, (int, float, np.integer, np.floating)):
            return True
        # Anything else (e.g. a numeric string) counts if float() can parse it.
        try:
            float(val)
        except (TypeError, ValueError):
            return False
        return True

    def feat_to_line(target, srch_id, feat_dict):
        line = '{} qid:{} '.format(target, srch_id)
        for k, v in feat_dict.items():
            assert is_numerical(v), 'non numerical value detected: {}'.format(v)
            line += '{}:{} '.format(k, v)
        return line

    for _, row in data.iterrows():
        # iterrows upcasts a row to one common dtype, so integer columns can
        # come back as floats; cast so the label and qid are written as ints.
        srch_id = int(row['srch_id'])
        target = int(max(row['booking_bool'] * 5, row['click_bool']))
        feat_dict = {k + 1: row[feat] for k, feat in enumerate(features)}
        print(feat_to_line(target, srch_id, feat_dict))
        
def sample_by_query(data, N, random_state=None):
    """Sample ``N`` distinct search queries and return all of their rows.

    data: DataFrame with a ``srch_id`` column.
    N: number of distinct ``srch_id`` values to draw without replacement.
        numpy raises ValueError when N exceeds the number of distinct ids.
    random_state: optional seed for a reproducible sample. With the default
        ``None`` the global numpy random state is used.
    returns: the rows of ``data`` whose ``srch_id`` was drawn.

    Prints the sampled fraction of rows and queries and the elapsed time.
    """
    s = time.time()
    ids = data.srch_id.unique()
    # A dedicated RandomState keeps a seeded draw independent of (and without
    # touching) the global numpy random state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    selection = rng.choice(ids, N, replace=False)
    result = data.loc[data['srch_id'].isin(selection)]
    print('{0:.2f}% of total data sampled (n = {1:})'.format(result.shape[0]*100/data.shape[0], result.shape[0]))
    print('{0:.2f}% of queries sampled'.format(N*100/len(ids)))
    print('Sampling took {} seconds'.format(time.time()-s))
    return result

# Draw 20000 random search queries from the training set and keep all of their rows
sampled_data = sample_by_query(expedia_df, 20000)


9.97% of total data sampled (n = 494304)
10.01% of queries sampled
Sampling took 0.48395419120788574 seconds
