Couple of things this script cover 
 - read the data
 - down sampling
 - create new variables
 - basic missing value preparations
 - find meaningful features with random Forest

Read the data

In [1]:
# initialisation and read the data
# https://www.dataquest.io/blog/kaggle-tutorial/
import os 
import pandas as pd
import random
from sklearn.decomposition import PCA
os.chdir('/Users/jpmallette/Desktop/kaggle_expedia/')

destinations = pd.read_csv("destinations.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

In [3]:
# create some variables useful for downsampling 
train["date_time"] = pd.to_datetime(train["date_time"])
train["year"] = train["date_time"].dt.year
train["month"] = train["date_time"].dt.month

# draw sampling base on the number of user
def draw_sample(df,row):

    unique_users = train.user_id.unique()

    sel_user_ids = [unique_users[i] for i in sorted(random.sample(range(len(unique_users)), row)) ]
    sel_train = train[train.user_id.isin(sel_user_ids)]

    sample_train = sel_train[((sel_train.year == 2013) | ((sel_train.year == 2014) & (sel_train.month < 8)))]
    sample_test = sel_train[((sel_train.year == 2014) & (sel_train.month >= 8))]

    # remove click events
    sample_test = sample_test[sample_test.is_booking == True]
    
    # return the data
    return sample_train,sample_test

# draw the sample
sample_train, sample_test = draw_sample(train,10000)

Create new variables 
  - related to the number of people
  - PCA reduction with destination 
  - adding new date variables

In [4]:
# number of people in person in the bookin
sample_train["nb_person"] =  sample_train['srch_adults_cnt'] + sample_train['srch_children_cnt']

# family variables
sample_train['family'] = 'group'

sample_train.ix[(sample_train.srch_adults_cnt == 1) & 
                (sample_train.srch_children_cnt == 0),'family'] = 'single'

sample_train.ix[(sample_train.srch_adults_cnt == 1) & 
                (sample_train.srch_children_cnt > 0),'family'] = 'monoparental_with_kids'

sample_train.ix[(sample_train.srch_adults_cnt == 2) & 
                (sample_train.srch_children_cnt == 0),'family'] = 'couple'

sample_train.ix[(sample_train.srch_adults_cnt == 2) & 
                (sample_train.srch_children_cnt > 0),'family'] = 'family'

sample_train.ix[(sample_train.srch_adults_cnt > 2) & 
                (sample_train.srch_children_cnt > 4),'family'] = 'group'

# Explore categorisation
sample_train.groupby(['family'])['family'].agg(['count'])

Unnamed: 0_level_0,count
family,Unnamed: 1_level_1
couple,96298
family,32450
group,24245
monoparental_with_kids,6560
single,34183


In [5]:
# PCA and date variables
pca = PCA(n_components=3)
dest_small = pca.fit_transform(destinations[["d{0}".format(i + 1) for i in range(149)]])
dest_small = pd.DataFrame(dest_small)
dest_small["srch_destination_id"] = destinations["srch_destination_id"]

# date variables + PCA features
def calc_fast_features(df):
    
# for preventions purposes and making sure the code will work
    df["date_time"] = pd.to_datetime(df["date_time"])
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    
# .dt only extract property of object not the data 
# getattr enable to extract the property (ex : minutes) from the date format 
# props will result in a dataframe with 6 column time data related of the date_time column.
# props act like the base dataframe
    props = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        props[prop] = getattr(df["date_time"].dt, prop)
    
# add all the columns in the props dataframe except timestamp column define in not in
    carryover = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for prop in carryover:
        props[prop] = df[prop]
    
# ad time columns related to srch_ci and srch_co
    date_props = ["month", "day", "dayofweek", "quarter"]
    for prop in date_props:
        props["ci_{0}".format(prop)] = getattr(df["srch_ci"].dt, prop)
        props["co_{0}".format(prop)] = getattr(df["srch_co"].dt, prop)

# ad some more time variables
    props["stay_span_hours"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[h]')
    props["stay_span_days"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[D]')
        
    ret = pd.DataFrame(props)

# ad the PCA variables on the ret previously props dataframe
    ret = ret.join(dest_small, on="srch_destination_id", how='left', rsuffix="dest")
    ret = ret.drop("srch_destination_iddest", axis=1)
    return ret

# prepare the features
sample_train = calc_fast_features(sample_train)  
sample_train.columns

In [55]:
# Write the file
#sample_train['stay_span_hours'].head
#sample_train.to_csv('preparation.csv')

<bound method Series.head of 2835        168
2836         96
2837         96
2838         96
2839        336
2840        336
3829         72
3830         72
3831         72
3832         72
3833         72
3834         72
3835         72
3836         96
3837         96
3838         96
3839         96
3840         72
3841         72
3842         48
3843         96
3844         72
3845         72
3846         96
3847         72
3848         72
3849         72
3850         72
3851         72
3852         72
           ... 
37664614     96
37664615     96
37664616     96
37664617     96
37664618     48
37664619     96
37664620     72
37664621     48
37664622     72
37664623     96
37664624     72
37664625     48
37664626     48
37664627     48
37664628     72
37664629     72
37664630     72
37664631     72
37666462     72
37666463     72
37666464     72
37666465     72
37666466     24
37666467     24
37668467    120
37668468    240
37668469    240
37668470    144
37668471    144
37668472   

Basic Handling of missing value

In [6]:
# first attemp just fill missing value with -1
sample_train.fillna(-1, inplace=True)

Check the most important features with randomForest

In [21]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

# initialize the forest
forest = RandomForestClassifier(n_estimators=50,min_weight_fraction_leaf=0.1,n_jobs = -1)


# consider the useful variables.Exclude family for the moment need to make it categorical
features_column = [c for c in sample_train.columns if c not in ["hotel_cluster","user_id","family"]]
X_train = sample_train[features_column]
Y_train = sample_train["hotel_cluster"]

# fit the forest
forest.fit(X_train,Y_train)

# analyse important features
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, 
                            features_column[indices[f]], 
                            importances[indices[f]]))

 1) hotel_continent                0.292153
 2) hotel_market                   0.161655
 3) hotel_country                  0.142039
 4) 0                              0.076820
 5) is_package                     0.061221
 6) srch_destination_id            0.058125
 7) stay_span_hours                0.047409
 8) stay_span_days                 0.037203
 9) 2                              0.031167
10) orig_destination_distance      0.024796
11) site_name                      0.013378
12) user_location_country          0.011775
13) posa_continent                 0.011198
14) 1                              0.010763
15) user_location_region           0.008694
16) srch_destination_type_id       0.006436
17) nb_person                      0.001258
18) ci_month                       0.000657
19) co_dayofweek                   0.000638
20) year                           0.000593
21) srch_adults_cnt                0.000520
22) co_quarter                     0.000495
23) user_location_city          

In [8]:
#  Annexe
# properly install ipython package
import pip

def install(package):
   pip.main(['install', package])

install('ml_metrics') 

Collecting ml-metrics
Installing collected packages: ml-metrics
Successfully installed ml-metrics-0.1.4


You are using pip version 8.0.3, however version 8.1.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
