The first thing we want to do is read in the data, explore it, take a sample, and create new variables.

In [1]:
# initialisation and read the data
# https://www.dataquest.io/blog/kaggle-tutorial/

import os 
import pandas as pd
import random
from sklearn.decomposition import PCA
# Directory holding destinations.csv, test.csv and train.csv. The original
# machine-specific path stays the default; set the KAGGLE_EXPEDIA_DIR environment
# variable to run the notebook somewhere else without editing this cell.
DATA_DIR = os.environ.get('KAGGLE_EXPEDIA_DIR', '/Users/jpmallette/Desktop/kaggle_expedia/')

# The working directory is still changed on purpose: the reads below and the
# later 'preparation.csv' export all use relative paths.
os.chdir(DATA_DIR)

destinations = pd.read_csv("destinations.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

In [2]:
# Parse the event timestamp once, then derive the calendar fields that
# draw_sample() uses to split the data by period.
event_time = pd.to_datetime(train["date_time"])
train["date_time"] = event_time
train["year"] = event_time.dt.year
train["month"] = event_time.dt.month

In [8]:
# quick validation: (number of rows, number of columns) of the training frame
train.shape

(37670293, 24)

In [3]:
# Draw a sample based on a number of distinct users
def draw_sample(df, row, seed=None):
    """Sample users from ``df`` and split their events into train and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``user_id``, ``year``, ``month`` and ``is_booking``
        columns.
    row : int
        Number of distinct users to draw (not a number of rows).
    seed : int, optional
        Seed for a private random generator, which makes the draw
        reproducible. When omitted, the module-level ``random`` state is used.

    Returns
    -------
    (sample_train, sample_test) : tuple of pandas.DataFrame
        ``sample_train`` holds the sampled users' events from 2013 and from
        months 1-7 of 2014. ``sample_test`` holds their events from month 8 of
        2014 onwards, restricted to bookings. Both are independent copies.

    Raises
    ------
    ValueError
        If ``row`` is larger than the number of distinct users in ``df``.
    """
    # Use the frame that was passed in; the previous version read the global
    # ``train`` here and ignored ``df`` entirely.
    unique_users = df.user_id.unique()

    rng = random if seed is None else random.Random(seed)
    picked = rng.sample(range(len(unique_users)), row)
    sel_train = df[df.user_id.isin(unique_users[picked])]

    in_2014 = sel_train.year == 2014
    is_train = (sel_train.year == 2013) | (in_2014 & (sel_train.month < 8))
    is_test = in_2014 & (sel_train.month >= 8)

    # remove click events from the test period: keep bookings only
    is_booked = sel_train.is_booking == 1

    # Return copies so that adding columns to the samples later does not raise
    # SettingWithCopyWarning.
    sample_train = sel_train[is_train].copy()
    sample_test = sel_train[is_test & is_booked].copy()
    return sample_train, sample_test

# draw the sample: 10000 is the number of distinct users to pick, not a row count
sample_train, sample_test = draw_sample(train,10000)

In [None]:
Create new variables 

In [4]:
# number of people in the trip
sample_train["nb_person"] = sample_train['srch_adults_cnt'] + sample_train['srch_children_cnt']

# family variable: every row starts as 'group', then the more specific
# categories below overwrite it.
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in 1.0.
sample_train['family'] = 'group'

sample_train.loc[(sample_train.srch_adults_cnt == 1) &
                 (sample_train.srch_children_cnt == 0), 'family'] = 'single'

sample_train.loc[(sample_train.srch_adults_cnt == 1) &
                 (sample_train.srch_children_cnt > 0), 'family'] = 'monoparental_with_kids'

sample_train.loc[(sample_train.srch_adults_cnt == 2) &
                 (sample_train.srch_children_cnt == 0), 'family'] = 'couple'

sample_train.loc[(sample_train.srch_adults_cnt == 2) &
                 (sample_train.srch_children_cnt > 0), 'family'] = 'family'

# With 'group' as the default this assignment changes nothing; it is kept so the
# rule stays visible.
sample_train.loc[(sample_train.srch_adults_cnt > 2) &
                 (sample_train.srch_children_cnt > 4), 'family'] = 'group'

# We can see the categorisation is not working
# NOTE(review): the output saved under this cell lists an 'other' category that
# this code never assigns, so it appears to come from an earlier version of the
# cell (presumably one whose default label was 'other'). Re-run before relying on
# the counts.
sample_train.groupby(['family'])['family'].agg(['count'])


Unnamed: 0_level_0,count
family,Unnamed: 1_level_1
couple,98235
family,33646
group,153
monoparental_with_kids,7083
other,25867
single,37771


In [5]:

# PCA: compress the destination columns d1..d149 into 3 components and keep
# the destination id next to them.
latent_cols = ["d{0}".format(n) for n in range(1, 150)]
pca = PCA(n_components=3)
dest_small = pd.DataFrame(pca.fit_transform(destinations[latent_cols]))
dest_small["srch_destination_id"] = destinations["srch_destination_id"]

# date variables and destination features
def calc_fast_features(df, dest_features=None):
    """Build a feature frame from ``df``: date parts, stay length, destination features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``date_time``, ``srch_ci``, ``srch_co`` and
        ``srch_destination_id``. It is not modified.
    dest_features : pandas.DataFrame, optional
        One row per destination with a ``srch_destination_id`` column plus the
        feature columns to attach. Defaults to the notebook-level
        ``dest_small`` frame.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` except the three date columns, plus:
        ``month``/``day``/``hour``/``minute``/``dayofweek``/``quarter`` of
        ``date_time`` (a column of the same name already present in ``df``
        takes precedence), ``ci_*``/``co_*`` date parts of check-in and
        check-out, ``stay_span`` in hours, and the columns of
        ``dest_features`` matched on ``srch_destination_id``.
    """
    if dest_features is None:
        dest_features = dest_small

    # Parse into local series instead of writing the parsed columns back into
    # the caller's frame. Check-in/out values that do not match the format
    # become NaT.
    date_time = pd.to_datetime(df["date_time"])
    srch_ci = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    srch_co = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")

    props = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        props[prop] = getattr(date_time.dt, prop)

    # Carried over after the date parts so that existing columns win on a clash.
    carryover = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for prop in carryover:
        props[prop] = df[prop]

    for prop in ["month", "day", "dayofweek", "quarter"]:
        props["ci_{0}".format(prop)] = getattr(srch_ci.dt, prop)
        props["co_{0}".format(prop)] = getattr(srch_co.dt, prop)
    # Hours between check-in and check-out. astype('timedelta64[h]') is rejected
    # by pandas >= 2.0; both inputs are whole dates, so this division yields the
    # same values.
    props["stay_span"] = (srch_co - srch_ci).dt.total_seconds() / 3600

    ret = pd.DataFrame(props)

    # join(on=...) matches against the *index* of the other frame. Indexing the
    # destination features by id makes the match happen on the id itself rather
    # than on the row position, which is what the previous version compared with.
    dest_by_id = dest_features.set_index("srch_destination_id")
    return ret.join(dest_by_id, on="srch_destination_id", how='left', rsuffix="dest")

# Not idempotent: the returned frame no longer contains date_time, srch_ci or
# srch_co, so running this cell a second time raises a KeyError.
sample_train = calc_fast_features(sample_train)

In [12]:
# Write the file
sample_train.to_csv('preparation.csv')

# Summary statistics. describe must be called, and must be the last expression
# of the cell, for the notebook to display it.
sample_train.describe()

Handle missing values

In [6]:
# first attempt: just fill every missing value with -1 (modifies sample_train in place)
sample_train.fillna(-1, inplace=True)

In [None]:
Machine Learning predictions

In [11]:
# evaluation for algorithm: mean average precision at 5 against the true
# hotel_cluster of each test booking (one-element list per row).
# NOTE(review): `predictions` is not assigned anywhere before this line, which
# is the NameError saved in this cell's output. Build the predictions first or
# remove this call.
import ml_metrics as metrics
target = [[l] for l in sample_test["hotel_cluster"]]
metrics.mapk(target, predictions, k=5)

# Random Forest to find the most important features
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from itertools import chain

# One binary random forest per hotel cluster: for every row, collect the
# probability of belonging to that cluster, predicted by a model fitted on the
# other fold.
# NOTE(review): `df` is not assigned anywhere in the visible notebook,
# presumably the prepared training sample (sample_train). Confirm before running.
# NOTE(review): KFold(n, n_folds=...) is the sklearn.cross_validation API, which
# was removed in scikit-learn 0.20; sklearn.model_selection.KFold has a different
# signature (n_splits, .split(X)).
unique_clusters = df["hotel_cluster"].unique()

# Same feature list for every cluster: all columns except the label and the
# per-cluster binary target.
predictors = [col for col in df if col not in ['hotel_cluster', "target"]]

all_probs = []
for cluster in unique_clusters:
    # Build the 0/1 target in a single assignment. The previous chained form,
    # df["target"][mask] = 0, writes through an intermediate object and is not
    # guaranteed to update df (it never does under pandas Copy-on-Write).
    df["target"] = (df["hotel_cluster"] == cluster).astype(int)
    probs = []
    cv = KFold(len(df["target"]), n_folds=2)
    clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)
    for tr, te in cv:
        clf.fit(df[predictors].iloc[tr], df["target"].iloc[tr])
        fold_probs = clf.predict_proba(df[predictors].iloc[te])
        # second column = probability of the positive class
        probs.append([p[1] for p in fold_probs])
    all_probs.append(list(chain.from_iterable(probs)))

# One row per sample, one column per cluster
prediction_frame = pd.DataFrame(all_probs).T
prediction_frame.columns = unique_clusters

def find_top_5(row):
    """Return the index labels of the five largest values of ``row``, largest first."""
    top_five = row.nlargest(5)
    return list(top_five.index)

# Five most probable clusters for every row of the probability table
preds = [find_top_5(row) for _, row in prediction_frame.iterrows()]

# `.iloc` only accepts integer positions, so the column is selected by label.
# The result is now stored in `scores`, which the cell displayed without ever
# assigning it.
# NOTE(review): `t2` is not assigned anywhere in the visible notebook,
# presumably the held-out sample (sample_test). Confirm, and check that its rows
# line up with the rows of prediction_frame.
scores = metrics.mapk([[l] for l in t2["hotel_cluster"]], preds, k=5)
scores

NameError: name 'predictions' is not defined

In [8]:
#  Annexe
# properly install ipython package
import pip

def install(package):
    """Install ``package`` with pip into the environment of the running interpreter.

    Parameters
    ----------
    package : str
        Requirement to install, e.g. ``'ml_metrics'``.

    Raises
    ------
    subprocess.CalledProcessError
        If pip exits with a non-zero status.
    """
    # Imported here because the notebook's import cell does not provide them.
    import subprocess
    import sys

    # pip.main() was removed in pip 10. Running pip as a module of the current
    # interpreter is the supported way to call it from code, and it targets the
    # same environment as the kernel.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

# ml_metrics is imported by the evaluation cell, so this has to run before it.
install('ml_metrics') 

Collecting ml-metrics
Installing collected packages: ml-metrics
Successfully installed ml-metrics-0.1.4


You are using pip version 8.0.3, however version 8.1.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
