# 0.885 script for Airbnb

Single XGB Classifier

In [1]:
# Basic ML modules
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.cross_validation import cross_val_score, KFold

from time import time
from datetime import datetime, timedelta, date

# Calendar modules - needed for holidays
from workalendar.usa import UnitedStates
from workalendar.europe import France, Germany, Italy, UnitedKingdom, Portugal
from workalendar.oceania import Australia

# Modules for plots
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Command so that plots appear in the iPython Notebook
%matplotlib inline

# File management modules
from os import makedirs
from os.path import exists, join

np.random.seed(0)

In [2]:
# Directory holding pre-computed inputs (e.g. the session feature table read later)
intermediate_path = './intermediate/'
# Directory the submission CSV is written to
output_path = './output/'

In [3]:
# Read the raw train and test user tables from the input directory
train_csv = './input/train_users_2.csv'
test_csv = './input/test_users.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)


In [4]:
# Split the target off the training frame, keep the test ids for the
# submission, and remember how many rows belong to the training set.
target_column = 'country_destination'
labels = df_train[target_column].values
df_train = df_train.drop([target_column], axis=1)
id_test = df_test['id']
piv_train = len(df_train)

In [5]:
df_train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome


In [6]:
df_test.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


In [7]:
df_train.shape

(213451, 15)

In [8]:
# Encode the destination labels as integers; le_y is kept so predictions
# can be mapped back to country codes later.
le_y = LabelEncoder().fit(labels)
y = le_y.transform(labels)

In [9]:
# Stack train and test rows into one frame (train rows first, fresh index)
# and drop date_first_booking, which is not available in test data.
df_all = (
    pd.concat([df_train, df_test], axis=0, ignore_index=True)
      .drop('date_first_booking', axis=1)
)

In [10]:
df_all.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,gxn3p5htnn,2010-06-28,20090319043255,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
1,820tgsjxq7,2011-05-25,20090523174809,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
2,4ft3gnwmtx,2010-09-28,20090609231247,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
3,bjjt8pjhuk,2011-12-05,20091031060129,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
4,87mebub9p4,2010-09-14,20091208061105,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome


In [11]:
# Reformatting columns with time data and some feature engineering

# timestamp_first_active is converted to its string form and split into a
# date part (first 8 characters, YYYYMMDD) and a time part (the rest, HHMMSS).
df_all['datehour_timestamp_first_active'] = df_all['timestamp_first_active'].astype(str)
first_active_str = df_all['datehour_timestamp_first_active']

# Vectorised parsing instead of one datetime.strptime call per row.
df_all['date_timestamp_first_active'] = pd.to_datetime(first_active_str.str[:8], format='%Y%m%d')
# The time part carries no date, so the parsed values land on 1900-01-01;
# only the hour is used below.
df_all['hour_timestamp_first_active'] = pd.to_datetime(first_active_str.str[8:], format='%H%M%S')

df_all['date_account_created'] = pd.to_datetime(df_all['date_account_created'], format='%Y-%m-%d')

# create d_ac-fa: number of days between account creation and first access.
# .dt.days reads the day count directly from the timedelta instead of casting
# nanoseconds to a platform-dependent 'int' and dividing by 8.64e13.
df_all['d_ac-fa'] = (df_all['date_account_created']
                     - df_all['date_timestamp_first_active']).dt.days.astype(float)

# create mo_first_active: month of the first active (0 when the date is missing)
df_all['mo_first_active'] = [ d.month if not pd.isnull(d) else 0 for d in df_all['date_timestamp_first_active'] ]
# create ho_first_active: hour of the first active (0 when the time is missing)
df_all['ho_first_active'] = [ d.hour if not pd.isnull(d) else 0 for d in df_all['hour_timestamp_first_active'] ]

# create ye_first_active: year of the first active
# NOT USED
#df_all['ye_first_active'] = [ d.year if not pd.isnull(d) else 0 for d in df_all['date_timestamp_first_active'] ]

# create weekday_first_active: day of week (categorical variable);
# missing dates are passed through unchanged
df_all['weekday_first_active'] = [ datetime.strftime(d,'%a') if not pd.isnull(d) else d for d in df_all['date_timestamp_first_active']]

In [12]:
# create features
# d_ac : day of the month account creation
# d_fa : day of the month first access
# (0 is used when the source date is missing)
for feature, source in (('d_ac', 'date_account_created'),
                        ('d_fa', 'date_timestamp_first_active')):
    df_all[feature] = [ 0 if pd.isnull(d) else d.day for d in df_all[source] ]

In [13]:
# Transform timestamp into date format
# Each element is replaced by the result of its .date() call, so the column can
# later be used to look up entries in a dictionary keyed by datetime.date.
df_all['date_timestamp_first_active'] = [ d.date() for d in df_all.date_timestamp_first_active]

In [14]:
df_all.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,...,first_browser,datehour_timestamp_first_active,date_timestamp_first_active,hour_timestamp_first_active,d_ac-fa,mo_first_active,ho_first_active,weekday_first_active,d_ac,d_fa
0,gxn3p5htnn,2010-06-28,20090319043255,-unknown-,,facebook,0,en,direct,direct,...,Chrome,20090319043255,2009-03-19,1900-01-01 04:32:55,466,3,4,Thu,28,19
1,820tgsjxq7,2011-05-25,20090523174809,MALE,38.0,facebook,0,en,seo,google,...,Chrome,20090523174809,2009-05-23,1900-01-01 17:48:09,732,5,17,Sat,25,23
2,4ft3gnwmtx,2010-09-28,20090609231247,FEMALE,56.0,basic,3,en,direct,direct,...,IE,20090609231247,2009-06-09,1900-01-01 23:12:47,476,6,23,Tue,28,9
3,bjjt8pjhuk,2011-12-05,20091031060129,FEMALE,42.0,facebook,0,en,direct,direct,...,Firefox,20091031060129,2009-10-31,1900-01-01 06:01:29,765,10,6,Sat,5,31
4,87mebub9p4,2010-09-14,20091208061105,-unknown-,41.0,basic,0,en,direct,direct,...,Chrome,20091208061105,2009-12-08,1900-01-01 06:11:05,280,12,6,Tue,14,8


In [15]:
# checkpoint 0
df_all_ck0 = df_all.copy()

In [16]:
# revert to checkpoint 0
#df_all = df_all_ck0.copy()

### Feature Engineering: Including Holidays

In [17]:
def daterange(start_date, end_date):
    """Yield consecutive days from start_date up to, but excluding, end_date.

    The number of yielded values is the whole-day difference between the two
    arguments; nothing is yielded when that difference is zero or negative.
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date + timedelta(offset)
        offset += 1

In [18]:
# create days_to_next_CC_hol
# CC is the country code
#
# Only holidays in the USA end up being used. The other imported calendars
# (France, Germany, Italy, UnitedKingdom, Portugal, Australia) can be enabled
# by replacing the 'None' placeholder of the matching country code with a
# calendar instance, e.g. 'FR': France().
country_dict = { 'NDF': 'None', 'US': UnitedStates(), 'other': 'None', 'FR': 'None',
 'IT': 'None', 'GB': 'None', 'ES': 'None', 'CA': 'None', 'DE': 'None',
 'NL' : 'None', 'AU' : 'None', 'PT' : 'None' }

for c in country_dict:
    if country_dict[c] == 'None':
        continue

    # Gather the holidays of 2008-2015 into a fresh list, so the list object
    # returned by the calendar is never extended in place.
    hol = []
    for year in range(2008, 2016, 1):
        hol += country_dict[c].holidays(year)

    # each holiday entry is indexed as h[0] for its date
    hol = [h[0] for h in hol]

    # hol_dict maps each day to (days until the next holiday,
    #                            days until the holiday after that one)
    hol_dict = {}
    for d in daterange( date(2009, 1, 1), date(2014, 12, 31) ):
        delta_list = [(h - d).days for h in hol]
        next_holiday_delta = min([n for n in delta_list if n >= 0])
        nex2_holiday_delta = min([n for n in delta_list if n > next_holiday_delta])
        hol_dict[d] = (next_holiday_delta, nex2_holiday_delta)

    # the column name depends only on the country, so it is built once per country
    column_name = 'days_to_next_' + c + '_hol'
    # requires date_timestamp_first_active to already hold datetime.date values
    df_all[column_name] = [ hol_dict[d][0] for d in df_all['date_timestamp_first_active'] ]

In [19]:
df_all.shape

(275547, 24)

### Feature Engineering: ranking countries by matching age and gender

Using preprocessed data derived from `age_gender_bkts.csv`.

In [20]:
df_all.gender.value_counts()

-unknown-    129480
FEMALE        77524
MALE          68209
OTHER           334
Name: gender, dtype: int64

In [21]:
# Importing a pre-processed dictionary that ranks countries by relative
# population for each age and gender.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts produced by a trusted source.

import pickle
# the context manager closes the file handle once the dictionary is loaded
with open('./intermediate/dst_dict_f750.pkl', 'rb') as pkl_file:
    c_am = pickle.load(pkl_file)

In [22]:
# All encoded destinations except the two non-country labels
country_list = list(le_y.classes_)
for non_country in ('NDF', 'other'):
    country_list.remove(non_country)

In [23]:
country_list

['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NL', 'PT', 'US']

In [24]:
# creating country ranking columns
# One '<country>_rank' column per destination, looked up in c_am by
# (age, gender); rows with a missing age or an age of 105 and above get -999.
age_gender_pairs = list(df_all[['age', 'gender']].itertuples(index=False))

for c in country_list:
    ranks = []
    for (a, g) in age_gender_pairs:
        if (not pd.isnull(a)) and (a < 105):
            ranks.append(c_am[a, g][c])
        else:
            ranks.append(-999)
    df_all[c + '_rank'] = ranks

In [25]:
df_all.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,...,AU_rank,CA_rank,DE_rank,ES_rank,FR_rank,GB_rank,IT_rank,NL_rank,PT_rank,US_rank
0,gxn3p5htnn,2010-06-28,20090319043255,-unknown-,,facebook,0,en,direct,direct,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,820tgsjxq7,2011-05-25,20090523174809,MALE,38.0,facebook,0,en,seo,google,...,50,50,45,68,44,47,53,44,61,48
2,4ft3gnwmtx,2010-09-28,20090609231247,FEMALE,56.0,basic,3,en,direct,direct,...,46,54,54,48,48,46,49,51,50,51
3,bjjt8pjhuk,2011-12-05,20091031060129,FEMALE,42.0,facebook,0,en,direct,direct,...,52,49,46,60,50,50,58,52,57,47
4,87mebub9p4,2010-09-14,20091208061105,-unknown-,41.0,basic,0,en,direct,direct,...,52,50,49,66,53,50,61,52,61,48


In [26]:
# Drop the raw and intermediate date/time columns; only the features derived
# from them are kept.
remove = ['date_account_created',
 'timestamp_first_active',
 'datehour_timestamp_first_active',
 'date_timestamp_first_active',
 'hour_timestamp_first_active']

features = df_all.columns.tolist()
for unused in remove:
    features.remove(unused)

df_all = df_all.loc[:, features]

In [27]:
df_all.head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,...,AU_rank,CA_rank,DE_rank,ES_rank,FR_rank,GB_rank,IT_rank,NL_rank,PT_rank,US_rank
0,gxn3p5htnn,-unknown-,,facebook,0,en,direct,direct,untracked,Web,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,820tgsjxq7,MALE,38.0,facebook,0,en,seo,google,untracked,Web,...,50,50,45,68,44,47,53,44,61,48
2,4ft3gnwmtx,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,...,46,54,54,48,48,46,49,51,50,51
3,bjjt8pjhuk,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,...,52,49,46,60,50,50,58,52,57,47
4,87mebub9p4,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,...,52,50,49,66,53,50,61,52,61,48


In [28]:
df_all.shape

(275547, 29)

In [80]:
# checking age distribution
# The plot is disabled by the constant-False guard; change it to True to draw
# the distribution of the non-missing ages restricted to the 0-100 range.
if False:
    sns.set_context("talk")
    box = sns.distplot(df_all.age.dropna(), hist_kws={"range" : (0, 100)}, bins = 50)
    box.set(xlim=(0, 100))

In [30]:
# checkpoint 1
df_all_ck1 = df_all.copy()

In [31]:
# revert to checkpoint
#df_all = df_all_ck1.copy()

In [32]:
# Note age has a lot of NaN
# and first_affiliate_tracked has few
df_all.count()

id                         275547
gender                     275547
age                        158681
signup_method              275547
signup_flow                275547
language                   275547
affiliate_channel          275547
affiliate_provider         275547
first_affiliate_tracked    269462
signup_app                 275547
first_device_type          275547
first_browser              275547
d_ac-fa                    275547
mo_first_active            275547
ho_first_active            275547
weekday_first_active       275547
d_ac                       275547
d_fa                       275547
days_to_next_US_hol        275547
AU_rank                    275547
CA_rank                    275547
DE_rank                    275547
ES_rank                    275547
FR_rank                    275547
GB_rank                    275547
IT_rank                    275547
NL_rank                    275547
PT_rank                    275547
US_rank                    275547
dtype: int64

In [33]:
# Impute missing ages with the sentinel value -999.
# Assigning the result back (instead of inplace=True on a selected column)
# guarantees that df_all itself is updated.
df_all['age'] = df_all['age'].fillna(-999)

# Impute first_affiliate_tracked with its most frequent value
# (value_counts() sorts by descending frequency, so index[0] is the mode).
most_frequent_tracked = df_all['first_affiliate_tracked'].value_counts().index[0]
df_all['first_affiliate_tracked'] = df_all['first_affiliate_tracked'].fillna(most_frequent_tracked)

In [34]:
# checkpoint 2
df_all_ck2 = df_all.copy()

In [35]:
# revert to checkpoint
#df_all = df_all_ck2.copy()

In [36]:
df_all.head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,...,AU_rank,CA_rank,DE_rank,ES_rank,FR_rank,GB_rank,IT_rank,NL_rank,PT_rank,US_rank
0,gxn3p5htnn,-unknown-,-999,facebook,0,en,direct,direct,untracked,Web,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,820tgsjxq7,MALE,38,facebook,0,en,seo,google,untracked,Web,...,50,50,45,68,44,47,53,44,61,48
2,4ft3gnwmtx,FEMALE,56,basic,3,en,direct,direct,untracked,Web,...,46,54,54,48,48,46,49,51,50,51
3,bjjt8pjhuk,FEMALE,42,facebook,0,en,direct,direct,untracked,Web,...,52,49,46,60,50,50,58,52,57,47
4,87mebub9p4,-unknown-,41,basic,0,en,direct,direct,untracked,Web,...,52,50,49,66,53,50,61,52,61,48


In [38]:
# Remove age outliers: ages below 14 or above 104 are replaced by -999
av = df_all.age.values
out_of_range = (av < 14) | (av > 104)
df_all['age'] = np.where(out_of_range, -999, av)

# reduce cardinality of feature: halve the day count and round up
df_all['days_to_next_US_hol'] = np.ceil(df_all['days_to_next_US_hol'] / 2)

In [39]:
df_all.head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,...,AU_rank,CA_rank,DE_rank,ES_rank,FR_rank,GB_rank,IT_rank,NL_rank,PT_rank,US_rank
0,gxn3p5htnn,-unknown-,-999,facebook,0,en,direct,direct,untracked,Web,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,820tgsjxq7,MALE,38,facebook,0,en,seo,google,untracked,Web,...,50,50,45,68,44,47,53,44,61,48
2,4ft3gnwmtx,FEMALE,56,basic,3,en,direct,direct,untracked,Web,...,46,54,54,48,48,46,49,51,50,51
3,bjjt8pjhuk,FEMALE,42,facebook,0,en,direct,direct,untracked,Web,...,52,49,46,60,50,50,58,52,57,47
4,87mebub9p4,-unknown-,41,basic,0,en,direct,direct,untracked,Web,...,52,50,49,66,53,50,61,52,61,48


In [41]:
# removing features based on feature_importances_
remove = ['d_ac-fa','signup_app', 'd_ac']

features = df_all.columns.tolist()
for unused in remove:
    features.remove(unused)

df_all = df_all.loc[:, features]

In [81]:
# checking any NaN value left
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275547 entries, 0 to 275546
Columns: 1028 entries, gender to 999
dtypes: float64(1005), int64(23)
memory usage: 2.1 GB


In [43]:
# Collect the object-dtype (categorical) columns that must be LabelEncoded;
# 'id' is an identifier rather than a feature, so it is excluded.
to_le = df_all.select_dtypes(include=['object']).columns.tolist()
to_le.remove('id')

In [44]:
# LabelEncode variables
# Each categorical column is replaced by integer codes; the classes and their
# codes are printed so the mapping can be read from the cell output.

le = LabelEncoder()
for ri in to_le: # ri: replaced by integer
    df_all[ri] = le.fit_transform(df_all[ri])
    print(le.classes_)
    print(le.transform(le.classes_))
    print("------")
    print("")

['-unknown-' 'FEMALE' 'MALE' 'OTHER']
[0 1 2 3]
------

['basic' 'facebook' 'google' 'weibo']
[0 1 2 3]
------

['-unknown-' 'ca' 'cs' 'da' 'de' 'el' 'en' 'es' 'fi' 'fr' 'hr' 'hu' 'id'
 'is' 'it' 'ja' 'ko' 'nl' 'no' 'pl' 'pt' 'ru' 'sv' 'th' 'tr' 'zh']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25]
------

['api' 'content' 'direct' 'other' 'remarketing' 'sem-brand' 'sem-non-brand'
 'seo']
[0 1 2 3 4 5 6 7]
------

['baidu' 'bing' 'craigslist' 'daum' 'direct' 'email-marketing' 'facebook'
 'facebook-open-graph' 'google' 'gsp' 'meetup' 'naver' 'other' 'padmapper'
 'vast' 'wayn' 'yahoo' 'yandex']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
------

['linked' 'local ops' 'marketing' 'omg' 'product' 'tracked-other'
 'untracked']
[0 1 2 3 4 5 6]
------

['Android Phone' 'Android Tablet' 'Desktop (Other)' 'Mac Desktop'
 'Other/Unknown' 'SmartPhone (Other)' 'Windows Desktop' 'iPad' 'iPhone']
[0 1 2 3 4 5 6 7 8]
------

['-unknown-' 'AOL Explorer' 'And

In [45]:
df_all.head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,first_device_type,...,AU_rank,CA_rank,DE_rank,ES_rank,FR_rank,GB_rank,IT_rank,NL_rank,PT_rank,US_rank
0,gxn3p5htnn,0,-999,1,0,6,2,4,6,3,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,820tgsjxq7,2,38,1,0,6,7,8,6,3,...,50,50,45,68,44,47,53,44,61,48
2,4ft3gnwmtx,1,56,0,3,6,2,4,6,6,...,46,54,54,48,48,46,49,51,50,51
3,bjjt8pjhuk,1,42,1,0,6,2,4,6,3,...,52,49,46,60,50,50,58,52,57,47
4,87mebub9p4,0,41,0,0,6,2,4,6,3,...,52,50,49,66,53,50,61,52,61,48


In [46]:
# Number of training labels vs. shape of the combined train+test frame
print(y.shape)
print(df_all.shape)

(213451,)
(275547, 26)


In [47]:
# checkpoint 3
df_all_ck3 = df_all.copy()

In [66]:
# revert to checkpoint 3
df_all = df_all_ck3.copy()

### Merge with session data

In [67]:
# Load the per-user session features that were pre-computed from sessions.csv
# and rename the key column so it matches the 'id' column of df_all.
session_features_csv = join(intermediate_path, 'session_features_1000.csv')
df_s = pd.read_csv(session_features_csv).rename(columns={'user_id': 'id'})

In [68]:
df_s.head()

Unnamed: 0,id,unique_devices,sum,session_id_30,0,1,2,3,4,5,...,990,991,992,993,994,995,996,997,998,999
0,d1mm9tcy42,2,3427529,48,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,yo8nz8bqcq,1,207842,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4grx6yxeby,2,1135444,10,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ncf87guaf0,3,3755100,72,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4rvqpxoh3h,1,2555,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
# Left-join the processed session data (df_s) onto the users data (df_all)
# by 'id'; users without a matching session row get NaN in the session columns.
df_all = df_all.merge(df_s, how='left', on='id')

In [71]:
df_all.head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,first_device_type,...,990,991,992,993,994,995,996,997,998,999
0,gxn3p5htnn,0,-999,1,0,6,2,4,6,3,...,,,,,,,,,,
1,820tgsjxq7,2,38,1,0,6,7,8,6,3,...,,,,,,,,,,
2,4ft3gnwmtx,1,56,0,3,6,2,4,6,6,...,,,,,,,,,,
3,bjjt8pjhuk,1,42,1,0,6,2,4,6,3,...,,,,,,,,,,
4,87mebub9p4,0,41,0,0,6,2,4,6,3,...,,,,,,,,,,


In [72]:
df_all = df_all.drop('id', axis=1)

In [73]:
df_all.shape

(275547, 1028)

In [74]:
# filling every remaining NaN with the sentinel value -999
df_all.fillna(-999, inplace = True)

In [75]:
df_all.head()

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,first_device_type,first_browser,...,990,991,992,993,994,995,996,997,998,999
0,0,-999,1,0,6,2,4,6,3,8,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,2,38,1,0,6,7,8,6,3,8,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,1,56,0,3,6,2,4,6,6,22,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,1,42,1,0,6,2,4,6,3,17,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
4,0,41,0,0,6,2,4,6,3,8,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


In [76]:
# Splitting train and test: the first piv_train rows are the training users,
# the remaining rows are the test users.
vals = df_all.values
X, X_test = vals[:piv_train], vals[piv_train:]

In [77]:
# scaling features
# The scaler is fitted on the training rows only and then applied to both sets.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#scaler = MinMaxScaler()
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

## Learning and predicting, and generating submission file



In [79]:
# Learning and predicting
# Generating submission file

clf = XGBClassifier(max_depth=6, learning_rate=0.2, n_estimators=25, #reg_lambda=3.33,
                    objective='multi:softprob', subsample=0.75, colsample_bytree=0.5, seed=0)

# Fit on the training rows and get class probabilities for the test rows
clf.fit(X, y)
y_pred = clf.predict_proba(X_test)

# Building submission files

# For every test user keep the 5 classes with the highest probabilities,
# most probable first; each id is repeated once per kept class.
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    ranked_classes = np.argsort(y_pred[i])[::-1]
    top_countries = le_y.inverse_transform(ranked_classes)[:5]
    ids.extend([id_test[i]] * 5)
    cts.extend(top_countries.tolist())

#Generate submission
if not exists(output_path):
    makedirs(output_path)

sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
# the file name carries a day-month-year_hour-minute-second timestamp
f = 'sub.' + datetime.now().strftime("%d%m%y_%H%M%S") + '.csv'
sub.to_csv(join(output_path, f), index=False)

In [64]:
ls -l output

total 206448
-rw-r--r--  1 hitoshinagano  503  4595049 18 Mar 00:23 sub.180316_002338.csv
-rw-r--r--  1 hitoshinagano  503  4594973 19 Mar 02:20 sub.190316_022000.csv
-rw-r--r--  1 hitoshinagano  503  4595024 19 Mar 18:06 sub.190316_180657.csv
-rw-r--r--  1 hitoshinagano  503  4595087 19 Mar 18:57 sub.190316_185754.csv
-rw-r--r--  1 hitoshinagano  503  4595026 19 Mar 20:43 sub.190316_204325.csv
-rw-r--r--  1 hitoshinagano  503  4595033 20 Mar 09:50 sub.200316_095014.csv
-rw-r--r--  1 hitoshinagano  503  4595075 20 Mar 15:35 sub.200316_153540.csv
-rw-r--r--  1 hitoshinagano  503  4595023 20 Mar 16:46 sub.200316_164600.csv
-rw-r--r--  1 hitoshinagano  503  4595020 20 Mar 19:07 sub.200316_190704.csv
-rw-r--r--  1 hitoshinagano  503  4595106 20 Mar 20:18 sub.200316_201807.csv
-rw-r--r--  1 hitoshinagano  503  4595023 20 Mar 21:23 sub.200316_212335.csv
-rw-r--r--  1 hitoshinagano  503  4595014 20 Mar 22:16 sub.200316_221654.csv
-rw-r--r--  1 hitoshinagano  503  4595011 20 Mar 2

### Cross-validation

In [62]:
def scorer(clf, X_trn, y_trn, X_val, y_val):
    """Fit clf on the training split and score its top-5 predictions.

    clf is fitted on (X_trn, y_trn); for each validation row the five class
    indices with the highest predicted probability are kept, most probable
    first, and compared with y_val by score_predictions.

    Returns whatever score_predictions returns for those rankings.
    """
    clf.fit(X_trn, y_trn)
    probabilities = clf.predict_proba(X_val)

    # indices of the 5 most probable classes per row, in descending order
    top5 = [np.argsort(probabilities[row])[::-1][:5].tolist()
            for row in range(len(y_val))]

    return score_predictions(pd.DataFrame(top5), pd.Series(y_val))

In [63]:
# Building on Wendy Kan's ndcg_at_k example
# https://www.kaggle.com/wendykan/airbnb-recruiting-new-user-bookings/ndcg-example
#
# you can use this script for cross-validation

def dcg_at_k(r, k, method=1):
    """Discounted cumulative gain of the first k relevance scores.

    r: sequence of relevance scores, in ranked order.
    k: number of leading scores taken into account.
    method: 0 leaves the first two positions undiscounted
            (weights 1, 1, 1/log2(3), ...);
            1 discounts from the second position on
            (weights 1, 1/log2(3), 1/log2(4), ...).

    Returns 0. when no scores are left after truncation.
    Raises ValueError when scores are present and method is neither 0 nor 1.
    """
    # np.asarray(..., dtype=float) replaces np.asfarray, which newer NumPy
    # releases no longer provide.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k=5, method=1):
    """Normalised DCG of the first k relevance scores in r.

    The DCG of r is divided by the DCG of the same scores sorted in
    descending order; 0. is returned when that ideal DCG is zero.
    """
    ideal_dcg = dcg_at_k(sorted(r, reverse=True), k, method)
    return dcg_at_k(r, k, method) / ideal_dcg if ideal_dcg else 0.


def score_predictions(preds, truth, n_modes=5):
    """
    Compute the NDCG of the ranked predictions of every observation.

    preds: pd.DataFrame
      one row for each observation, one column for each prediction.
      Columns are sorted from left to right descending in order of likelihood.
    truth: pd.Series
      one row for each observation; compared element-wise with every
      column of preds.
    n_modes: int
      number of leading predictions taken into account (the k of NDCG@k).

    Returns a pd.Series named 'score' with one value per row of preds.
    """
    assert(len(preds)==len(truth))
    # relevance matrix: 1.0 where a prediction equals the truth, else 0.0
    r = pd.DataFrame(0, index=preds.index, columns=preds.columns, dtype=np.float64)
    for col in preds.columns:
        r[col] = (preds[col] == truth) * 1.0

    # Row-wise scoring without DataFrame.apply(reduce=True), an argument that
    # newer pandas releases no longer accept; n_modes is forwarded as k so the
    # parameter is no longer ignored.
    scores = [ndcg_at_k(row, k=n_modes) for row in r.values]
    return pd.Series(scores, index=r.index, name='score', dtype=np.float64)
    
    
    
# Small worked example: three observations with two ranked predictions each
preds = pd.DataFrame([['US','FR'],['FR','US'],['FR','FR']])
truth = pd.Series(['US','US','FR'])
print('predictions:')
print(preds)
print('truth:')
print(truth)
print('scores:')
print(score_predictions(preds, truth))

predictions:
    0   1
0  US  FR
1  FR  US
2  FR  FR
truth:
0    US
1    US
2    FR
dtype: object
scores:
0    1.00000
1    0.63093
2    1.00000
Name: score, dtype: float64


In [64]:
# Row offsets into X that delimit the hand-built validation folds:
# rows [i3, i4) are the test part of fold 1, rows [i5, i6) the test part of fold 2.
i3 = 87628
i4 = 113088
i5 = 170158
i6 = len(X)  # one past the last row of X

In [65]:
# fold1: same quarter one year before
# fold2: one quarter before the test data
# Each fold is a pair of integer row-index arrays into X / y.

fold1_test  = np.arange(i3, i4)
fold1_train = np.concatenate((np.arange(0, i3), np.arange(i4, i6)))
fold2_test  = np.arange(i5, i6)
fold2_train = np.arange(0, i5)

In [66]:
# Cross-validation uses the two hand-built (train, test) index pairs
n_folds = 2
kf = [(fold1_train, fold1_test), (fold2_train, fold2_test)]

# Alternative kept for reference: a plain 5-fold split over all rows of X
#n_folds = 5
#kf = KFold(len(X), n_folds=n_folds)

In [80]:

# Grid search over n_estimators, subsample and learning_rate.
# means maps (n_estimators, subsample, learning_rate) to the list of per-fold
# mean NDCG scores.
means = {}

for n_estimators in [23]:
    for subsample in [0.75]:
        for learning_rate in [0.2]:
            # wall-clock start, used for the running time printed below
            t0 = time()
            
            clf = XGBClassifier(max_depth=6, learning_rate=learning_rate, n_estimators=n_estimators, reg_lambda=10,
                        objective='multi:softprob', subsample=subsample, colsample_bytree=0.5, seed=0)  
            
                    
            ndcg_scores = []
            
            # scorer refits clf on each fold's training rows and returns
            # per-row scores, which are averaged into one value per fold
            for train, test in kf:
                cv_pred = scorer(clf, X[train], y[train], X[test], y[test])
                ndcg_scores.append( np.mean(cv_pred) )
            
            print
            print "learning rate:", learning_rate
            print "n_estimators:", n_estimators
            print "subsample:", subsample
            print
            print n_folds, "folds NDCG scores: ", [ "%.4f" % s for s in ndcg_scores]
            print "mean:", np.mean(ndcg_scores)
            means[(n_estimators, subsample, learning_rate)] = ndcg_scores 
            print
            print "Classifier:"
            print clf
            print
            print "Date:", datetime.now().strftime("%A, %d %B %Y, %I:%M%p")
            print
            print "Running time:", "%.2f" % (time() - t0)
            print "******************************************************************************"


learning rate: 0.2
n_estimators: 23
subsample: 0.75

2 folds NDCG scores:  ['0.8336', '0.8509']
mean: 0.842244749997

Classifier:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.2, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=23, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=10,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.75)

Date: Wednesday, 23 March 2016, 09:18AM

Running time: 4492.24
******************************************************************************


In [None]:
# Persist the cross-validation scores; the context manager makes sure the file
# is flushed and closed once the dump is done.
with open("./intermediate/mean_score_cv.pkl", "wb") as pkl_file:
    pickle.dump(means, pkl_file)

In [149]:
# Parameter combination with the highest mean NDCG across folds.
# Using the raw score lists as the key would compare them element by element,
# so the first fold alone would decide the winner unless it is tied.
max(means, key = lambda params: np.mean(means[params]))

(20, 0.75, 0.2)

In [123]:
means

{(17, 0.75, 0.16): [0.83377493817166592, 0.85090935500605025],
 (17, 0.75, 0.2): [0.8335042303834892, 0.85062305350791889],
 (17, 0.75, 0.24): [0.83344848334104216, 0.85036823690486718],
 (20, 0.75, 0.16): [0.83371650880330062, 0.85131632434344051],
 (20, 0.75, 0.2): [0.833688144124065, 0.85086141386216418],
 (20, 0.75, 0.24): [0.83336274055005644, 0.85053709389292753],
 (23, 0.75, 0.16): [0.83378450358593914, 0.85159895061716673],
 (23, 0.75, 0.2): [0.83372656076927343, 0.85093847752662988],
 (23, 0.75, 0.24): [0.83343776135317649, 0.85070021889391045]}

In [129]:
# Build the score table from means in this cell, so it does not rely on a
# df_means left over from a cell that appears later in the notebook.
# One row per parameter combination, one column per fold.
df_means = pd.DataFrame(means).T
df_means['average_ndcg'] = df_means.mean(axis=1)
df_means

Unnamed: 0,Unnamed: 1,Unnamed: 2,0,1,average_ndcg
17,0.75,0.16,0.833775,0.850909,0.842342
17,0.75,0.2,0.833504,0.850623,0.842064
17,0.75,0.24,0.833448,0.850368,0.841908
20,0.75,0.16,0.833717,0.851316,0.842516
20,0.75,0.2,0.833688,0.850861,0.842275
20,0.75,0.24,0.833363,0.850537,0.84195
23,0.75,0.16,0.833785,0.851599,0.842692
23,0.75,0.2,0.833727,0.850938,0.842333
23,0.75,0.24,0.833438,0.8507,0.842069


In [62]:
# One row per parameter combination, one column per fold, plus the row-wise
# average. The former df_means.mean() assignment was dropped: it was indexed
# by fold rather than by row and was overwritten on the next line.
df_means = pd.DataFrame(means).T
df_means['average_ndcg'] = df_means.mean(axis=1)
df_means

Unnamed: 0,Unnamed: 1,Unnamed: 2,0,1,average_ndcg
20,0.75,0.16,0.833538,0.851061,0.842299
20,0.75,0.2,0.833529,0.850604,0.842066
23,0.75,0.16,0.833594,0.851394,0.842494
23,0.75,0.2,0.833703,0.85089,0.842297


In [229]:
# One row per parameter combination, one column per fold, plus the row-wise
# average. The former df_means.mean() assignment was dropped: it was indexed
# by fold rather than by row and was overwritten on the next line.
df_means = pd.DataFrame(means).T
df_means['average_ndcg'] = df_means.mean(axis=1)
df_means

Unnamed: 0,Unnamed: 1,Unnamed: 2,0,1,average_ndcg
23,0.75,0.16,0.833634,0.850666,0.84215
23,0.75,0.2,0.833678,0.851042,0.84236


In [232]:
# One row per parameter combination, one column per fold, plus the row-wise
# average. The former df_means.mean() assignment was dropped: it was indexed
# by fold rather than by row and was overwritten on the next line.
df_means = pd.DataFrame(means).T
df_means['average_ndcg'] = df_means.mean(axis=1)
df_means

Unnamed: 0,Unnamed: 1,Unnamed: 2,0,1,average_ndcg
50,0.75,0.15,0.833604,0.851393,0.842499


In [250]:
# One row per parameter combination, one column per fold, plus the row-wise
# average. The former df_means.mean() assignment was dropped: it was indexed
# by fold rather than by row and was overwritten on the next line.
df_means = pd.DataFrame(means).T
df_means['average_ndcg'] = df_means.mean(axis=1)
df_means

Unnamed: 0,Unnamed: 1,Unnamed: 2,0,1,average_ndcg
50,0.75,0.15,0.8341,0.852147,0.843124


In [68]:
# FS 1000
# lambda_reg = 10
# One row per parameter combination, one column per fold, plus the row-wise
# average. The former df_means.mean() assignment was dropped: it was indexed
# by fold rather than by row and was overwritten on the next line.
df_means = pd.DataFrame(means).T
df_means['average_ndcg'] = df_means.mean(axis=1)
df_means

Unnamed: 0,Unnamed: 1,Unnamed: 2,0,1,average_ndcg
23,0.75,0.2,0.833642,0.851399,0.842521


### Simple F1-score benchmark

In [178]:
# simple F1-score benchmark
# 3-fold cross-validated weighted F1 of a single XGB classifier; the
# classifier, a timestamp, the fold scores and the elapsed seconds are printed.

t0 = time()

cv = 3

clf = XGBClassifier(max_depth=6, learning_rate=0.20, n_estimators=20,
                    objective='multi:softprob', subsample=0.75, colsample_bytree=0.5, seed=0)
scores = cross_val_score(clf, X, y, cv = cv, scoring = 'f1_weighted')

print(clf)
print(datetime.now().strftime("%A, %d. %B %Y %I:%M%p"))
print('')
print(scores)
print(time() - t0)


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.2, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=20, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.75)
Monday, 18. January 2016 11:38PM

[ 0.47242493  0.50844404  0.44370861]
2665.43831778


In [181]:
df_all.head()

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,first_device_type,first_browser,...,982,983,984,985,986,987,988,989,session_count,secs_elapsed
0,0,-1,1,0,6,2,4,6,3,8,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,2,38,1,0,6,7,8,6,3,8,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,1,56,0,3,6,2,4,6,6,22,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,1,42,1,0,6,2,4,6,3,17,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,0,41,0,0,6,2,4,6,3,8,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
