airbnb ver2
==========

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = [10, 5]

## Read train data

In [2]:
raw_train_user = pd.read_csv('./data/train_users_2.csv')
raw_train_user.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [3]:
categorical_cols = [
#     'gender', 
    'signup_method', 
    'language', 
    'affiliate_channel', 
    'affiliate_provider', 
    'first_affiliate_tracked', 
    'signup_app', 
    'first_device_type',
    'first_browser'
]

unused_cols = [
    'date_first_booking',
    'gender'
]

In [4]:
from datetime import  datetime
from sklearn.preprocessing import LabelEncoder
process_date_func = lambda x: datetime.strptime(x, '%Y-%m-%d')
process_datetime_func = lambda x: datetime.strptime(str(x), '%Y%m%d%H%M%S')
mean_age = raw_train_user[raw_train_user.age < 120].age.mean()
process_age_func = lambda x: 2015 - x if x > 150 else mean_age if x > 100 and x <= 150 else x
def preprocess_data(df):
    # drop unuseful column
    
    processed_df = df.drop(unused_cols, axis=1)
    processed_df = processed_df.fillna(-1)
    # bad ways to clean data -> dropna
#     processed_df = processed_df.dropna()
    # Process categories data
    processed_df = pd.get_dummies(processed_df, columns=categorical_cols)
    
    # process datetime column
    processed_df['date_account_created'] = processed_df['date_account_created'].apply(process_date_func)
    processed_df['timestamp_first_active'] = processed_df['timestamp_first_active'].apply(process_datetime_func)
    processed_df['dac_year'] = processed_df['date_account_created'].dt.year
    processed_df['dac_month'] = processed_df['date_account_created'].dt.month
    processed_df['dac_day'] = processed_df['date_account_created'].dt.day

    processed_df['tfa_year'] = processed_df['timestamp_first_active'].dt.year
    processed_df['tfa_month'] = processed_df['timestamp_first_active'].dt.month
    processed_df['tfa_day'] = processed_df['timestamp_first_active'].dt.day
    
    # process age
    processed_df['age'] = processed_df['age'].apply(process_age_func)
    
    # process country destination
    processed_df['country_destination'] = LabelEncoder().fit_transform(processed_df['country_destination'])
    
    return processed_df
processed_user = preprocess_data(raw_train_user)
processed_user.head()
# processed_user.shape

Unnamed: 0,id,date_account_created,timestamp_first_active,age,signup_flow,country_destination,signup_method_basic,signup_method_facebook,signup_method_google,language_ca,...,first_browser_TenFourFox,first_browser_TheWorld Browser,first_browser_Yandex.Browser,first_browser_wOSBrowser,dac_year,dac_month,dac_day,tfa_year,tfa_month,tfa_day
0,gxn3p5htnn,2010-06-28,2009-03-19 04:32:55,-1.0,0,7,0,1,0,0,...,0,0,0,0,2010,6,28,2009,3,19
1,820tgsjxq7,2011-05-25,2009-05-23 17:48:09,38.0,0,7,0,1,0,0,...,0,0,0,0,2011,5,25,2009,5,23
2,4ft3gnwmtx,2010-09-28,2009-06-09 23:12:47,56.0,3,10,1,0,0,0,...,0,0,0,0,2010,9,28,2009,6,9
3,bjjt8pjhuk,2011-12-05,2009-10-31 06:01:29,42.0,0,11,0,1,0,0,...,0,0,0,0,2011,12,5,2009,10,31
4,87mebub9p4,2010-09-14,2009-12-08 06:11:05,41.0,0,10,1,0,0,0,...,0,0,0,0,2010,9,14,2009,12,8


In [5]:
sns.distplot(processed_user.age)

<matplotlib.axes._subplots.AxesSubplot at 0x7f80632208d0>

## Session features

In [6]:
df_session = pd.read_csv('./data/sessions.csv')

In [7]:
from session_features import make_sessions_features
train_data = make_sessions_features(processed_user, df_session)

In [8]:
train_data.head(10)

Unnamed: 0,date_account_created,timestamp_first_active,age,signup_flow,country_destination,signup_method_basic,signup_method_facebook,signup_method_google,language_ca,language_cs,...,335,336,337,338,339,340,341,342,343,344
0,2010-06-28,2009-03-19 04:32:55,-1.0,0,7,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2011-05-25,2009-05-23 17:48:09,38.0,0,7,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2010-09-28,2009-06-09 23:12:47,56.0,3,10,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2011-12-05,2009-10-31 06:01:29,42.0,0,11,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2010-09-14,2009-12-08 06:11:05,41.0,0,10,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2010-01-01,2010-01-01 21:56:19,-1.0,0,10,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2010-01-02,2010-01-02 01:25:58,46.0,0,10,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2010-01-03,2010-01-03 19:19:05,47.0,0,10,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2010-01-04,2010-01-04 00:42:11,50.0,0,10,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2010-01-04,2010-01-04 02:37:58,46.0,0,10,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
drop_columns = [
    'country_destination', 
#     'id',
    'date_account_created',
    'timestamp_first_active'
]

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

X = train_data.drop(drop_columns, axis=1)
y = train_data.loc[:, 'country_destination']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [10]:
X_train.shape

(149415, 480)

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

def build_model():
    rfc = RandomForestClassifier()
    model = XGBClassifier(max_depth=6, learning_rate=0.2, n_estimators=43,
                    objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0, missing=-1, n_jobs=5)
    return model

In [22]:
airbnb_model = build_model()

## Train

In [15]:
params  =  {
    'objective': 'multi:softprob',
    'num_class': 12,
    'bst:eta' :  0.3,
    'bst:max_depth': 4,
    'eval_metric': 'mlogloss',
    'silent': 0,
    'nthread': 4,
    'lambda': 1.0
}
    
values, counts = np.unique(y, return_counts=True)
freqs = counts/float(counts.sum())
weights = 1/freqs

In [14]:
dtrain = xgb.DMatrix(X_train.values, label=y_train)
dtest = xgb.DMatrix(X_test.values, label=y_test)

In [19]:
gbm = xgb.train(params, dtrain, num_boost_round=500, verbose_eval=True, evals=[(dtest, 'test'), (dtrain, 'train')], early_stopping_rounds=5)

[0]	test-mlogloss:1.75166	train-mlogloss:1.74653
Multiple eval metrics have been passed: 'train-mlogloss' will be used for early stopping.

Will train until train-mlogloss hasn't improved in 5 rounds.
[1]	test-mlogloss:1.52299	train-mlogloss:1.51554
[2]	test-mlogloss:1.38303	train-mlogloss:1.37329
[3]	test-mlogloss:1.289	train-mlogloss:1.27745
[4]	test-mlogloss:1.22334	train-mlogloss:1.20982
[5]	test-mlogloss:1.17621	train-mlogloss:1.16098
[6]	test-mlogloss:1.14198	train-mlogloss:1.12475
[7]	test-mlogloss:1.11684	train-mlogloss:1.0977
[8]	test-mlogloss:1.0976	train-mlogloss:1.07674
[9]	test-mlogloss:1.08316	train-mlogloss:1.06049
[10]	test-mlogloss:1.07235	train-mlogloss:1.04782
[11]	test-mlogloss:1.06358	train-mlogloss:1.03713
[12]	test-mlogloss:1.05716	train-mlogloss:1.02879
[13]	test-mlogloss:1.05208	train-mlogloss:1.02198
[14]	test-mlogloss:1.04795	train-mlogloss:1.01598
[15]	test-mlogloss:1.04467	train-mlogloss:1.01113
[16]	test-mlogloss:1.04214	train-mlogloss:1.00702
[17]	test-ml

In [21]:
prediction = gbm.predict(xgb.DMatrix(X_test.values)).argmax(axis=1)

In [24]:
from sklearn.metrics import classification_report
print(classification_report(prediction, y_test))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         3
          1       0.00      0.00      0.00         4
          2       0.00      0.00      0.00         5
          3       0.00      0.00      0.00         6
          4       0.00      0.14      0.01        29
          5       0.00      0.00      0.00         4
          6       0.00      0.00      0.00        10
          7       0.85      0.71      0.77     44804
          8       0.00      0.00      0.00         2
          9       0.00      0.00      0.00         4
         10       0.52      0.51      0.52     19081
         11       0.00      0.17      0.01        84

avg / total       0.75      0.65      0.70     64036

