In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Render matplotlib figures inline, directly below the cell that produces them
%matplotlib inline

In [4]:
# Global plot styling: seaborn "white" style with longer major y-ticks,
# and the "poster" context with slightly enlarged fonts.
sns.set_style(style="white", rc={'ytick.major.size': 10.0})
sns.set_context(context="poster", font_scale=1.1)

# Data Exploration
### We must first explore the data before thinking about which features to pick or building a model

### This is what I want to ask:
1. Does the data have missing values?
2. Does the data have strange and unrealistic behaviors that need to be removed?
3. Does the data show some peculiar behavior?

In [10]:
# ---------- Load the raw CSV files ----------
# Training users: pop() removes the label column from the frame and returns it,
# leaving only the feature columns in train_users.
train_users = pd.read_csv('train_users_2.csv')
target = train_users.pop('country_destination')

# Test users: keep a separate handle on the id column.
test_users = pd.read_csv('test_users.csv')
id_test = test_users['id']

# Session logs
df_sessions = pd.read_csv('sessions.csv')


In [11]:
# Report how many users each table holds, and how many there are overall
train_count = len(train_users)
test_count = len(test_users)
print("We have", train_count, "users in the training set and",
      test_count, "in the test set.")
print("In total we have", train_count + test_count, "users.")

We have 213451 users in the training set and 62096 in the test set.
In total we have 275547 users.


In [37]:
# Count missing (NaN) values per column in the training users.
# NOTE: placeholder strings such as "-unknown-" are not NaN, so they are not counted here.
train_users.isnull().sum()

id                              0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                          0
age                         87990
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6065
signup_app                      0
first_device_type               0
first_browser                   0
dtype: int64

### Let's look at the whole data

In [16]:
# Stack the training and test users into one frame with a fresh 0..n-1 index
users = pd.concat([train_users, test_users], ignore_index=True)

In [17]:
# Remove the ids: they only matter when making predictions, not for exploration.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run; previously a second run raised KeyError because the 'id'
# column had already been dropped.
users = users.drop('id', axis=1, errors='ignore')

users.head()

Unnamed: 0,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
1,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
2,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
3,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome


### The data has lots of missing data represented by "-unknown-"

In [18]:
# Turn the "-unknown-" placeholder into a real missing value (NaN)
users = users.replace("-unknown-", np.nan)

In [34]:
# Re-check the first rows: entries that were "-unknown-" should now show up as missing
users.head()


Unnamed: 0,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,2010-06-28,20090319043255,,,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
1,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
2,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
3,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
4,2010-09-14,20091208061105,2010-02-18,,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome


In [28]:
# Share of missing values per column, as a percentage of all rows in `users`
users_nan_percentage = users.isnull().sum().div(users.shape[0]).mul(100)

In [33]:
# Preview the first rows of the raw test users ("-unknown-" placeholders are still present here)
test_users.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


In [36]:
# Count missing (NaN) values per column in the test users
test_users.isnull().sum()

id                             0
date_account_created           0
timestamp_first_active         0
date_first_booking         62096
gender                         0
age                        28876
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked       20
signup_app                     0
first_device_type              0
first_browser                  0
dtype: int64

### Since the number of test users and number of missing data in "date_first_booking" are the same, it is obvious the test data does not have this feature.

In [31]:
# Keep only the columns that have at least some missing values
users_nan_percentage.loc[users_nan_percentage.gt(0)]

date_first_booking         67.733998
gender                     46.990169
age                        42.412365
language                    0.000363
first_affiliate_tracked     2.208335
first_browser              16.111226
dtype: float64

In [39]:
# Percentage of NaN values (truncated to an int) for a few training columns.
# NOTE: train_users was never passed through the "-unknown-" -> NaN replacement
# (only `users` was), so placeholder values are not counted as missing here.
train_missing_pct = {
    column: int((train_users[column].isnull().sum() / train_users.shape[0]) * 100)
    for column in ("date_first_booking", "gender", "age")
}

print("Just for the sake of curiosity; we have",
      train_missing_pct["date_first_booking"],
      "% of missing values at date_first_booking in the training data")
print("we have",
      train_missing_pct["gender"],
      "% of missing values at gender in the training data")
print("we have",
      train_missing_pct["age"],
      "% of missing values at age in the training data")

Just for the sake of curiosity; we have 58 % of missing values at date_first_booking in the training data
we have 0 % of missing values at gender in the training data
we have 41 % of missing values at age in the training data


### As we can see, "date_first_booking" has a high missing rate of 67.73%, which comes from missing values in both the training data and the test data. I will not use it as a feature because of this high missing percentage. If I used it, I would have to drop more than half of the data, which is not acceptable.

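### In addition, both gender and age have high missing percentages in the combined data: 47% and 42.4% respectively. The 0% reported above for gender in the training data is an artifact: that check ran on the raw `train_users`, where a missing gender is still stored as the string "-unknown-" rather than NaN (the first rows shown earlier, which come from the training data, contain "-unknown-" genders). For age, about 41% of the training rows (87,990 of 213,451) and about 46.5% of the test rows (28,876 of 62,096) are missing, which together give the 42.4% overall.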

### Lets explore age