In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

# Seed NumPy's global RNG so anything drawing from it is repeatable across runs.
np.random.seed(0)

#Loading data
# Raw user tables; the training table additionally carries the target column.
df_train = pd.read_csv('../data/airbnb/train_users_2.csv')
df_test = pd.read_csv('../data/airbnb/test_users.csv')

# Detach the target from the training frame so both frames share one schema.
labels = df_train.pop('country_destination').values

# Test-set ids are kept aside for building the submission file later.
id_test = df_test.loc[:, 'id']

# Number of training rows: the pivot used later to split the stacked frame
# back into its train and test parts.
piv_train = len(df_train)

In [4]:
#Creating a DataFrame with train+test data
# Stack train rows on top of test rows with a fresh 0..n-1 index so that the
# same feature engineering is applied to both; the first `piv_train` rows are
# the training rows.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [5]:
#Filling nan
# Every missing value, in every column, is replaced by the sentinel -1.
# This includes date-like columns, which therefore contain -1 where no date
# was recorded.
df_all = df_all.fillna(value=-1)

In [6]:
#####Feature engineering#######
#date_account_created
# Split the dash-separated date string into three integer fields, stored as
# year, month and day.
dac = np.vstack(df_all.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
df_all['dac_year'] = dac[:,0]
df_all['dac_month'] = dac[:,1]
df_all['dac_day'] = dac[:,2]

#timestamp_first_active
# Slice the digit string into fixed-width fields (4+2+2+2+2+2); only the
# first three (year, month, day) are kept as features.
tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
df_all['tfa_year'] = tfa[:,0]
df_all['tfa_month'] = tfa[:,1]
df_all['tfa_day'] = tfa[:,2]

## dates
# Convert to real datetimes so the day-of-week can be read off below.
df_all['date_account_created'] = pd.to_datetime(df_all['date_account_created'])
# Integer-dividing by 10**6 removes the trailing six digits, leaving YYYYMMDD.
df_all['date_first_active'] = pd.to_datetime((df_all.timestamp_first_active // 1000000), format='%Y%m%d')
# NOTE(review): after the earlier fillna(-1), rows without a booking date reach
# to_datetime as -1 rather than NaT, so 'dfb_dow' below is a placeholder value
# for them. Also confirm that date_first_booking is actually available at
# prediction time -- if it is only populated for booked training users this
# feature leaks the target.
df_all['date_first_booking'] = pd.to_datetime(df_all['date_first_booking'])

## dow
df_all['dac_dow'] = df_all['date_account_created'].dt.dayofweek
df_all['dfa_dow'] = df_all['date_first_active'].dt.dayofweek
df_all['dfb_dow'] = df_all['date_first_booking'].dt.dayofweek

## drops
# The raw date columns and the id are not numeric features; remove them now
# that the derived columns exist.
df_all = df_all.drop(['date_account_created', 'timestamp_first_active',
                      'date_first_active', 'id', 'date_first_booking'], axis=1)

#Age
# Step 1: values in (1919, 1995) are treated as a birth year and converted to
#         an age relative to 2015.
# Step 2: anything still outside [14, 100] is replaced by the -1 sentinel.
# Both steps must operate on the same array. Previously step 2 re-read the raw
# values, which silently discarded step 1 and turned every birth-year entry
# into -1.
av = df_all.age.values
av = np.where(np.logical_and(av>1919, av<1995), 2015-av, av)
df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

In [7]:
#One-hot-encoding features
# Categorical columns to expand into indicator columns named '<feature>_<value>'.
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']

# Build every indicator block first, then do a single drop and a single concat.
# The resulting column order is unchanged: the remaining original columns,
# followed by the indicator blocks in `ohe_feats` order.
dummy_blocks = [pd.get_dummies(df_all[feat], prefix=feat) for feat in ohe_feats]
df_all = pd.concat([df_all.drop(ohe_feats, axis=1)] + dummy_blocks, axis=1)

# Quick sanity check of the final design matrix.
print(df_all.shape)
print(list(df_all.columns.values))
print(df_all.head(5))

(275547, 164)
['age', 'dac_year', 'dac_month', 'dac_day', 'tfa_year', 'tfa_month', 'tfa_day', 'dac_dow', 'dfa_dow', 'dfb_dow', 'gender_-unknown-', 'gender_FEMALE', 'gender_MALE', 'gender_OTHER', 'signup_method_basic', 'signup_method_facebook', 'signup_method_google', 'signup_method_weibo', 'signup_flow_0', 'signup_flow_1', 'signup_flow_2', 'signup_flow_3', 'signup_flow_4', 'signup_flow_5', 'signup_flow_6', 'signup_flow_8', 'signup_flow_10', 'signup_flow_12', 'signup_flow_14', 'signup_flow_15', 'signup_flow_16', 'signup_flow_20', 'signup_flow_21', 'signup_flow_23', 'signup_flow_24', 'signup_flow_25', 'language_-unknown-', 'language_ca', 'language_cs', 'language_da', 'language_de', 'language_el', 'language_en', 'language_es', 'language_fi', 'language_fr', 'language_hr', 'language_hu', 'language_id', 'language_is', 'language_it', 'language_ja', 'language_ko', 'language_nl', 'language_no', 'language_pl', 'language_pt', 'language_ru', 'language_sv', 'language_th', 'language_tr', 'language_z

In [8]:
#Splitting train and test
# The stacked frame holds the training rows first; `piv_train` marks the boundary.
vals = df_all.values
X, X_test = vals[:piv_train], vals[piv_train:]

# Map the target labels to integer class ids; `le` is kept so predictions
# can be mapped back to labels afterwards.
le = LabelEncoder().fit(labels)
y = le.transform(labels)

In [9]:
#Classifier
# Gradient-boosted trees producing one probability per class.
# NOTE(review): `silent` and `seed` are legacy parameter names in newer
# xgboost releases (superseded by `verbosity` / `random_state`) -- confirm
# against the installed version before upgrading.
xgb = XGBClassifier(
    max_depth=5,
    learning_rate=0.2,
    n_estimators=50,
    objective='multi:softprob',
    subsample=0.6,
    colsample_bytree=0.6,
    seed=0,
    silent=0,
)
xgb.fit(X, y)

# One row per test user, one column per encoded class.
y_pred = xgb.predict_proba(X_test)

In [10]:
#Taking the 5 classes with highest probabilities
# Rank the classes of every row at once: ascending argsort along the class
# axis, reversed to descending, truncated to the first five.
top5_idx = np.argsort(y_pred, axis=1)[:, ::-1][:, :5]

# Each id is repeated five times, lined up with the flattened
# row-by-row list of decoded labels.
ids = np.repeat(id_test.values, 5).tolist()  #list of ids
cts = le.inverse_transform(top5_idx.ravel()).tolist()  #list of countries

In [11]:
#Generate submission
# Two-column table in long format: one (id, country) row per prediction.
sub = pd.DataFrame({'id': ids, 'country': cts}, columns=['id', 'country'])
sub.to_csv('sub2.csv', index=False)