In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime, date
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit

import argparse



In [2]:
print('Loading raw data...')
train_users_path='train_users_2.csv'
test_users_path='test_users.csv'
sessions_path='sessions.csv'

#Note: age_gender_bkts.csv and countries.csv files are not used.

#########Loading data#############
#train_users
df_train = pd.read_csv(train_users_path)
target = df_train['country_destination']
df_train = df_train.drop(['country_destination'], axis=1)

#test_users
df_test = pd.read_csv(test_users_path)    
id_test = df_test['id']

#sessions
df_sessions = pd.read_csv(sessions_path)
df_sessions['id'] = df_sessions['user_id']
df_sessions = df_sessions.drop(['user_id'],axis=1)

Loading raw data...


In [3]:
#df_train.date_account_created

In [4]:
df_train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome


In [5]:
df_train.gender.replace("-unknown-", np.nan, inplace=True)
#df_train.head()
df_test.gender.replace("-unknown-", np.nan, inplace=True)


In [6]:
pd.unique(df_train.gender.values)

array([nan, 'MALE', 'FEMALE', 'OTHER'], dtype=object)

In [7]:
#sessions grouping by user

#Group by user_id, aggregate by number of counts (counting device_type as it is never NA), 
#and total sum of elapsed time in seconds
group_sessions = df_sessions.groupby("id")
#.agg({'device_type':'count', 'secs_elapsed':'sum'})
#rename columns
# // group_sessions.columns = ['sum_secs_elapsed', 'counts']
#group by variable turns into index, I'm reseting the index and putting user_id back as a column
# //group_sessions.reset_index(level=0, inplace=True)

In [8]:
group_sessions.head()

Unnamed: 0,action,action_type,action_detail,device_type,secs_elapsed,id
0,lookup,,,Windows Desktop,319.0,d1mm9tcy42
1,search_results,click,view_search_results,Windows Desktop,67753.0,d1mm9tcy42
2,lookup,,,Windows Desktop,301.0,d1mm9tcy42
3,search_results,click,view_search_results,Windows Desktop,22141.0,d1mm9tcy42
4,lookup,,,Windows Desktop,435.0,d1mm9tcy42
127,dashboard,view,dashboard,Mac Desktop,2739.0,yo8nz8bqcq
128,create,submit,create_user,Mac Desktop,,yo8nz8bqcq
129,confirm_email,click,confirm_email_link,Mac Desktop,115983.0,yo8nz8bqcq
130,show,view,p3,Mac Desktop,20285.0,yo8nz8bqcq
131,show_personalize,data,user_profile_content_update,Mac Desktop,3255.0,yo8nz8bqcq


In [9]:
group_sessions = group_sessions.agg({'device_type':'count', 'secs_elapsed':'sum'})

In [10]:
group_sessions.head()

Unnamed: 0_level_0,secs_elapsed,device_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00023iyk9l,867896.0,40
0010k6l0om,586543.0,63
001wyh0pz8,282965.0,90
0028jgx1x1,297010.0,31
002qnbzfs5,6487080.0,789


In [11]:
group_sessions.columns = ['sum_secs_elapsed', 'counts']

In [12]:
group_sessions.head()

Unnamed: 0_level_0,sum_secs_elapsed,counts
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00023iyk9l,867896.0,40
0010k6l0om,586543.0,63
001wyh0pz8,282965.0,90
0028jgx1x1,297010.0,31
002qnbzfs5,6487080.0,789


In [13]:
group_sessions.reset_index(level=0, inplace=True)
group_sessions.head()

Unnamed: 0,id,sum_secs_elapsed,counts
0,00023iyk9l,867896.0,40
1,0010k6l0om,586543.0,63
2,001wyh0pz8,282965.0,90
3,0028jgx1x1,297010.0,31
4,002qnbzfs5,6487080.0,789


In [14]:
df_train.age[0:20]

0      NaN
1     38.0
2     56.0
3     42.0
4     41.0
5      NaN
6     46.0
7     47.0
8     50.0
9     46.0
10    36.0
11    47.0
12     NaN
13    37.0
14    36.0
15    33.0
16     NaN
17    31.0
18     NaN
19    29.0
Name: age, dtype: float64

In [15]:
df_train.age.replace(np.nan, 0, inplace=True)
df_train.age.astype(str, inplace=True)
df_train.head()
df_train["age_bucket"] = ""
#df_train.age_bucket

In [None]:
for i, row in df_train.iterrows():
    for i in range (15, 101, 5):
        if int(row.age) < i:
            #row.age = int(row.age).astype(str)
            df_train.loc[i, "age_bucket"] = (str(i) + " - " + str(i+4))
            #print (df_train.loc[i, "age_bucket"])
    


In [37]:
df_train.age_bucket

0          
1          
2          
3          
4          
5          
6          
7          
8          
9          
10         
11         
12         
13         
14         
15         
16         
17         
18         
19         
20         
21         
22         
23         
24         
25         
26         
27         
28         
29         
         ..
213421     
213422     
213423     
213424     
213425     
213426     
213427     
213428     
213429     
213430     
213431     
213432     
213433     
213434     
213435     
213436     
213437     
213438     
213439     
213440     
213441     
213442     
213443     
213444     
213445     
213446     
213447     
213448     
213449     
213450     
Name: age_bucket, dtype: object

In [None]:
df_test["age_bucket"] = ""
df_test.age.replace(np.nan, 0, inplace=True)

for i, row in df_test.iterrows():
    for i in range (16, 101, 5):
        if int(row.age) < i:
            df_test.loc[i, "age_bucket"] = (str(i) + " - " + str(i+4))
            #print (df_test.loc[i, "age_bucket"])

In [None]:
#adding time lag columns
df_train['first_book_lag'] = df_train.apply(lambda x: "before" if x.date_first_booking < x.date_account_created \
                                           else if x.date_first_booking == x.date_account_created "same day" \
                                           else "after")
df_train['account_active_lag'] = df_train.apply(lambda x: "before" if x.date_first_booking < x.timestamp_first_active \
                                           else if x.date_first_booking == x.timestamp_first_active "same day" \
                                           else "after")
df_train['account_created_lag'] = df_train.apply(lambda x: "before" if x.date_account_created < x.timestamp_first_active \
                                           else if x.date_first_booking == x.timestamp_first_active "same day" \
                                           else "after")
df_test['account_created_lag'] = df_test.apply(lambda x: "before" if x.date_account_created < x.timestamp_first_active \
                                           else if x.date_first_booking == x.timestamp_first_active "same day" \
                                           else "after")

In [None]:
def bookings(L1, L2, L3, L4):
    timediflist = []
    for i in range(len(L1)):
        if L1[i] == 'same day' or L2[i] == 'same day':
            timediflist.append('early')
        elif L1[i] == 'before' and L2[i] == 'before' and L3[i] == 'same day':
            timediflist.append('early')
        elif L1[i] == 'greater 1 day' and L2[i] == 'greater 1 day':
            timediflist.append('waited')
        elif L1[i] == 'greater 1 day' and L2[i] == 'before':
            timediflist.append('waited')
        elif L1[i] == 'before' and L2[i] == 'greater 1 day':
            timediflist.append('waited')
        elif L1[i] == 'before' and L2[i] == 'before' and L3[i] == 'greater 1 day':
            timediflist.append('waited')
        elif L4[i] == 'NDF':
            timediflist.append('NB')
        else:
            timediflist.append('NA')

            
    return timediflist