In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.set()
%matplotlib inline

In [2]:
from lightgbm import LGBMClassifier

In [34]:
df_train_origin = pd.read_csv("data/train_users_2.csv")
df_test_origin = pd.read_csv("data/test_users.csv")
df_session = pd.read_csv("data/sessions.csv")

### Create features about session

In [35]:
tmp = df_session.groupby(["user_id", "action_type"])["secs_elapsed"].count().unstack().fillna(0)
df_session_type = pd.DataFrame(tmp)
df_session_type.rename(columns = lambda x : "type_" + x, inplace = True)

In [6]:
tmp = df_session.groupby(["user_id", "action"])["secs_elapsed"].count().unstack().fillna(0)
df_session_action = pd.DataFrame(tmp)
df_session_action.rename(columns = lambda x : "action_" + x, inplace = True)

In [7]:
tmp = df_session.groupby(["user_id", "action_detail"])["secs_elapsed"].count().unstack().fillna(0)
df_session_action_detail = pd.DataFrame(tmp)
df_session_action_detail.rename(columns = lambda x : "detail_" + x, inplace = True)

In [8]:
df_session_info = df_session_type.merge(df_session_action, how = "left", left_index = True, right_index = True)
df_session_info = df_session_info.merge(df_session_action_detail, how = "left", left_index = True, right_index = True)

##### drop unknown data

In [9]:
df_session_info.drop(["type_-unknown-", "detail_-unknown-"], axis = 1, inplace = True)

In [10]:
df_session_info.shape

(135478, 522)

##### Merge session features (impute session)

In [11]:
from sklearn.preprocessing import Imputer

In [12]:
df_train = df_train_origin.merge(df_session_info, how = "left", left_on = "id", right_index = True)
df_test = df_test_origin.merge(df_session_info, how = "left", left_on = "id", right_index = True)

In [13]:
imp = Imputer(missing_values='NaN', strategy='median', axis=0)

In [14]:
df_train[df_session_info.columns.tolist()] = imp.fit_transform(df_train[df_session_info.columns.tolist()])
df_test[df_session_info.columns.tolist()] = imp.fit_transform(df_test[df_session_info.columns.tolist()])

### lag feature

In [15]:
df_train_origin["date_account_created"] = pd.to_datetime(df_train_origin["date_account_created"], format = "%Y-%m-%d")
df_train_origin["timestamp_first_active"] = pd.to_datetime(df_train_origin["timestamp_first_active"], format="%Y%m%d%H%M%S", errors='ignore')
df_test_origin["date_account_created"] = pd.to_datetime(df_test_origin["date_account_created"], format = "%Y-%m-%d")
df_test_origin["timestamp_first_active"] = pd.to_datetime(df_test_origin["timestamp_first_active"], format="%Y%m%d%H%M%S", errors='ignore')

In [16]:
s_train_lag = df_train_origin["timestamp_first_active"] - df_train_origin["date_account_created"]
s_test_lag = df_test_origin["timestamp_first_active"] - df_test_origin["date_account_created"]

In [17]:
df_train["lag_days"] = s_train_lag.apply(lambda x : -1 * x.days)
df_test["lag_days"] = s_test_lag.apply(lambda x : -1 * x.days)
df_train["lag_seconds"] = s_train_lag.apply(lambda x : x.seconds)
df_test["lag_seconds"] = s_test_lag.apply(lambda x : x.seconds)

### faithless_sign_in

In [18]:
s_all_input_train = np.logical_not(df_train_origin['age'] >= 120) | (df_train_origin['gender'] == '-unknown-')
s_all_input_test = np.logical_not(df_test_origin['age'] >= 120) | (df_test_origin['gender'] == '-unknown-')

In [19]:
df_train['faithless_sign'] = s_all_input_train.apply(lambda x : 1 if x == True else 0)
df_test['faithless_sign'] = s_all_input_test.apply(lambda x : 1 if x == True else 0)

In [20]:
df_train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,...,detail_view_user_real_names,detail_wishlist,detail_wishlist_content_update,detail_wishlist_note,detail_your_listings,detail_your_reservations,detail_your_trips,lag_days,lag_seconds,faithless_sign
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,466,16375,1
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,732,64089,1
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,476,83567,1
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,765,21689,1
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,280,22265,1


In [33]:
df_train.isnull().sum()

id                                         0
date_account_created                       0
timestamp_first_active                     0
date_first_booking                    124543
gender                                     0
age                                    87990
signup_method                              0
signup_flow                                0
language                                   0
affiliate_channel                          0
affiliate_provider                         0
first_affiliate_tracked                 6065
signup_app                                 0
first_device_type                          0
first_browser                              0
type_booking_request                       0
type_booking_response                      0
type_click                                 0
type_data                                  0
type_message_post                          0
type_modify                                0
type_partner_callback                      0
type_submi

In [22]:
df_train.shape, df_test.shape

((213451, 541), (62096, 540))

In [27]:
df_target = df_train["country_destination"]
df_train = df_train.drop(labels=['country_destination'], axis =1)

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
y_label = le.fit_transform(df_target) 
y_label

array([ 7,  7, 10, ...,  7,  7,  7])

In [32]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Columns: 540 entries, id to faithless_sign
dtypes: float64(523), int64(5), object(12)
memory usage: 879.4+ MB


In [29]:
df_train.shape, df_test.shape, df_target.shape

((213451, 540), (62096, 540), (213451,))

# lightGBM classifier

In [30]:
import lightgbm as lgb

In [31]:
model_lgb = lgb.LGBMClassifier(max_depth=10, objective = 'multiclass', random_state=0, n_jobs=4).fit(df_train, y_label)

ValueError: DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields id, date_account_created, date_first_booking, gender, signup_method, language, affiliate_channel, affiliate_provider, first_affiliate_tracked, signup_app, first_device_type, first_browser

In [None]:
#predict_proba
y_lgb = model_lgb.predict_proba(df_test)



ids = []  #list of ids
cts = []  #list of countries
for i in range(len(df_sample)):
    idx = df_sample.iloc[i, 0]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_lgb[i])[::-1])[:5].tolist()

df_sample1 = pd.DataFrame(columns=['id', 'country'])
df_sample1['id'] = ids
df_sample1['country'] = cts
df_sample1 = df_sample1.reset_index(drop=True)
df_sample1.to_csv('sub_lgb.csv', sep=',', na_rep='NaN', index = False)
#0.85932 w/o age
#0.86495 w/ age