## Exploratory Data Analysis

In [17]:
import gc
import pandas as pd
import numpy as np
import lightgbm as lgb
from operator import xor
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from ml_metrics import rmsle
%matplotlib inline

le = preprocessing.LabelEncoder()
lb = preprocessing.LabelBinarizer()

air = {
    "reserve": pd.read_csv("data/air/air_reserve.csv", parse_dates=["visit_datetime", "reserve_datetime"]),
    "store_info": pd.read_csv("data/air/air_store_info.csv"),
    "visit_data": pd.read_csv("data/air/air_visit_data.csv", parse_dates=["visit_date"])
}

hpg = {
    "reserve": pd.read_csv("data/hpg/hpg_reserve.csv", parse_dates=["visit_datetime", "reserve_datetime"]),
    "store_info": pd.read_csv("data/hpg/hpg_store_info.csv")
}

date_info = pd.read_csv("data/date_info.csv", parse_dates=["calendar_date"])
store_id_relation = pd.read_csv("data/store_id_relation.csv")

def remove_outliers(data):
    df_0 = data.loc[data.visitors == 0]   
    q1 = np.percentile(data.visitors, 25, axis=0)
    q3 = np.percentile(data.visitors, 75, axis=0)
    k = 2.8
    iqr = q3 - q1
    df_temp = data.loc[data.visitors > q1 - k*iqr]
    df_temp = data.loc[data.visitors < q3 + k*iqr]
    frames = [df_0, df_temp]
    result = pd.concat(frames)
    return result

air["visit_data"] = remove_outliers(air["visit_data"])

df_test = pd.read_csv('sample_submission.csv')
df_test['air_store_id'] = df_test['id'].apply(lambda x: '_'.join(x.split('_')[:2]))
df_test['visit_date'] = df_test['id'].apply(lambda x: x.split('_')[-1])
index_test = df_test['id']
del df_test['id'], df_test['visitors']

In [18]:
print('mapping and dropping useless information in df_hr ...')
s_1 = store_id_relation['air_store_id']
s_2 = store_id_relation['hpg_store_id']
a_h_map = dict(zip(s_2.values, s_1.values))
del s_1, s_2

hpg["reserve"]['air_store_id'] = hpg["reserve"]['hpg_store_id'].map(a_h_map)
hpg["reserve"] = hpg["reserve"].drop('hpg_store_id', axis=1).dropna()


print('mapping and dropping useless information in df_hr Done!')
print("-----------------------------------------------------------------------------------------")

print('mapping and dropping useless information in df_hr ...')

hpg["store_info"]['air_store_id'] = hpg["store_info"]['hpg_store_id'].map(a_h_map)
hpg["store_info"] = hpg["store_info"].drop('hpg_store_id', axis=1).dropna()
print('mapping and dropping useless information in df_hs Done!')
gc.collect()

mapping and dropping useless information in df_hr ...
mapping and dropping useless information in df_hr Done!
-----------------------------------------------------------------------------------------
mapping and dropping useless information in df_hr ...
mapping and dropping useless information in df_hs Done!


468

## Feature Analysis

###### Looking for correlations in data:

# Applying ML

label encoding (?):

In [19]:
le.fit(air["store_info"]['air_genre_name'])
air["store_info"]['air_genre_name'] = le.fit_transform(air["store_info"]['air_genre_name'])

le.fit(air["store_info"]['air_area_name'])
air["store_info"]['air_area_name'] = le.fit_transform(air["store_info"]['air_area_name'])

le.fit(hpg["store_info"]['hpg_genre_name'])
hpg["store_info"]['hpg_genre_name'] = le.fit_transform(hpg["store_info"]['hpg_genre_name'])

le.fit(hpg["store_info"]['hpg_area_name'])
hpg["store_info"]['hpg_area_name'] = le.fit_transform(hpg["store_info"]['hpg_area_name'])



le.fit(air["store_info"]['air_store_id'])


air["reserve"]['air_store_id'] = le.transform(air["reserve"]['air_store_id'])
air["store_info"]['air_store_id'] = le.transform(air["store_info"]['air_store_id'])
air["visit_data"]['air_store_id'] = le.transform(air["visit_data"]['air_store_id'])
hpg["reserve"]['air_store_id'] = le.transform(hpg["reserve"]['air_store_id'])
hpg["store_info"]['air_store_id'] = le.transform(hpg["store_info"]['air_store_id'])

df_test['air_store_id'] = le.transform(df_test['air_store_id'])

gc.collect()

0

In [20]:
time_format = "%Y-%m-%d %H:%M:%S"
def seperate_date(data):
    # split date feature in reservation datetime
    data_time = pd.to_datetime(data.reserve_datetime, format=time_format)
    data['Year_re']= data_time.dt.year
    data['Month_re'] = data_time.dt.month
    data['DayOfYear_re'] = data_time.dt.dayofyear
    data['DayOfWeek_re'] = data_time.dt.dayofweek
    data['Hour_re'] = data_time.dt.hour
    return data

seperate_date(air["reserve"])


def seperate_date(data):
    # split date feature in reservation datetime
    data_time = pd.to_datetime(data.reserve_datetime, format=time_format)
    data['Year_re_h']= data_time.dt.year
    data['Month_re_h'] = data_time.dt.month
    data['DayOfYear_re_h'] = data_time.dt.dayofyear
    data['DayOfWeek_re_h'] = data_time.dt.dayofweek
    data['Hour_re_h'] = data_time.dt.hour
    return data

seperate_date(hpg["reserve"])


time_format = "%Y-%m-%d %H:%M:%S"
def seperate_date(data):
    # split date feature in reserved visiting datetime
    data_time = pd.to_datetime(data.visit_datetime, format=time_format)
    data['Year_re_visit']= data_time.dt.year
    data['Month_re_visit'] = data_time.dt.month
    data['DayOfYear_re_visit'] = data_time.dt.dayofyear
    data['DayOfWeek_re_visit'] = data_time.dt.dayofweek
    data['Hour_re_visit'] = data_time.dt.hour
    return data

seperate_date(air["reserve"])


def seperate_date(data):
    # split date feature in reserved visiting datetime
    data_time = pd.to_datetime(data.visit_datetime, format=time_format)
    data['Year_re_visit_h']= data_time.dt.year
    data['Month_re_visit_h'] = data_time.dt.month
    data['DayOfYear_re_visit_h'] = data_time.dt.dayofyear
    data['WeekOfYear_re_visit_h'] = data_time.dt.week
    data['DayOfWeek_re_visit_h'] = data_time.dt.dayofweek
    data['Hour_re_visit_h'] = data_time.dt.hour
    return data

seperate_date(hpg["reserve"])

print('seperating date time features done! ...')
gc.collect()

seperating date time features done! ...


105

In [21]:
hpg["reserve"].columns

Index(['visit_datetime', 'reserve_datetime', 'reserve_visitors',
       'air_store_id', 'Year_re_h', 'Month_re_h', 'DayOfYear_re_h',
       'DayOfWeek_re_h', 'Hour_re_h', 'Year_re_visit_h', 'Month_re_visit_h',
       'DayOfYear_re_visit_h', 'WeekOfYear_re_visit_h', 'DayOfWeek_re_visit_h',
       'Hour_re_visit_h'],
      dtype='object')

Merge dataset

In [22]:
features_to_drop = [
        'air_store_id__'
        ]

def merge_df(data, data_to_join):
    # merge dataframes        
    data = data.join(data_to_join, on='air_store_id', rsuffix='__', how='left')   
    return data

def fix_data(data):
    # drop __ data    
    for feature in features_to_drop:
        data.drop(feature, axis=1)
    return data

# Merge to df_train
print('merging dataframes ...')
df_train = merge_df(air["visit_data"], air["reserve"])
df_train = merge_df(df_train, air["store_info"])

hpg["reserve"]['reserve_visitors_hr'] = hpg["reserve"]['reserve_visitors'] 
hpg["reserve"].drop('reserve_visitors', axis=1) 

hpg["store_info"]['latitude_hr'] = hpg["store_info"]['latitude'] 
hpg["store_info"].drop('latitude', axis=1)

hpg["store_info"]['longitude_hr'] = hpg["store_info"]['longitude'] 
hpg["store_info"].drop('longitude', axis=1) 

df_train = merge_df(df_train, hpg["store_info"])
df_train = merge_df(df_train, hpg["store_info"])
gc.collect()
fix_data(df_train)

# Merge to df_test

df_test = merge_df(df_test, air["reserve"])
df_test = merge_df(df_test, air["store_info"])

df_test = merge_df(df_test, hpg["store_info"])
df_test = merge_df(df_test, hpg["reserve"])
gc.collect()
fix_data(df_test)


print('merging dataframes done!')
gc.collect()
print("=========================================================================================")

merging dataframes ...
merging dataframes done!


drop date-time-hour info

In [23]:
def drop_datetime_info(data):
    data = data.drop(['visit_date', 'visit_datetime', 'reserve_datetime', 'visit_datetime__', 'reserve_datetime__'], axis=1)
    return data
df_train = drop_datetime_info(df_train)
df_test = drop_datetime_info(df_test)

In [24]:
train = df_train.fillna(-1).iloc[:1000000]
test = df_test.fillna(-1).iloc[:1000000]

shuffle dataset

In [None]:
from sklearn.utils import shuffle
train = shuffle(train, random_state=524)

X_pretrain, X_prevalid = train_test_split(train, test_size=0.05, random_state=524, shuffle=False)

X_train = X_pretrain.drop(['visitors'], axis=1)
y_train = np.log1p(X_pretrain["visitors"].values)

X_valid = X_prevalid.drop(['visitors'], axis=1)
y_valid = np.log1p(X_prevalid['visitors'].values)

checking mean rmse

training model

In [None]:
print("Training LGBM model...")
params = {
    "application": "regression",
    "boosting": "gbdt",
    "learning_rate": 0.01,
    "num_leaves": 32,
    "min_sum_hessian_in_leaf": 1e-2,
    "min_gain_to_split": 0,
    
    "bagging_fraction": 0.8,
    "feature_fraction": 0.8,
    "num_threads": 4,
    "metric": "rmse"
}

d_train = lgb.Dataset(X_train, y_train)
d_valid = lgb.Dataset(X_valid, y_valid)

watchlist = [d_train, d_valid]

lgb_model1 = lgb.train(params, train_set=d_train, num_boost_round=35000, valid_sets=watchlist, verbose_eval=1000)

print("Model trained. Predicting...")

test_probs = lgb_model1.predict(test)
test_probs = np.expm1(test_probs)

result = pd.DataFrame({"id": index_test, "visitors": test_probs})
result.to_csv("LGB_sub.csv", index=False) # 0.60
print("Prediction complete.")

Training LGBM model...
[1000]	training's rmse: 0.615307	valid_1's rmse: 0.611369
[2000]	training's rmse: 0.597215	valid_1's rmse: 0.595273
[3000]	training's rmse: 0.594414	valid_1's rmse: 0.593297
[4000]	training's rmse: 0.593902	valid_1's rmse: 0.593159
[5000]	training's rmse: 0.593798	valid_1's rmse: 0.593234
[6000]	training's rmse: 0.593767	valid_1's rmse: 0.593275
[7000]	training's rmse: 0.593753	valid_1's rmse: 0.593293
[8000]	training's rmse: 0.593746	valid_1's rmse: 0.593304
[9000]	training's rmse: 0.593742	valid_1's rmse: 0.59331
[10000]	training's rmse: 0.59374	valid_1's rmse: 0.593312
[11000]	training's rmse: 0.593738	valid_1's rmse: 0.593315
[12000]	training's rmse: 0.593737	valid_1's rmse: 0.593317
[13000]	training's rmse: 0.593737	valid_1's rmse: 0.593318
[14000]	training's rmse: 0.593736	valid_1's rmse: 0.593319
[15000]	training's rmse: 0.593736	valid_1's rmse: 0.59332
[16000]	training's rmse: 0.593736	valid_1's rmse: 0.593321
[17000]	training's rmse: 0.593736	valid_1's r