In [2]:
#@title Load Drive and Move to Data Directory

# References (notebooks this analysis is based on):
# https://github.com/jieunjeon/kaggle-recruit_restaurant_visitor_forecasting
# https://github.com/jieunjeon/kaggle-recruit_restaurant_visitor_forecasting/blob/main/EDA.ipynb
# https://github.com/jieunjeon/kaggle-recruit_restaurant_visitor_forecasting/blob/main/Feature_Engineering%2BModel.ipynb

import os

from google.colab import drive

# Make the Drive contents visible under /content/drive/.
drive.mount('/content/drive/')

# Data directory on Drive (Colab form field). Every later cell reads and
# writes files by bare name, so the working directory is switched here.
loc = '/content/drive/MyDrive/2Research/KIXLAB Project (CHI 2023)/restaurant_data/' #@param
os.chdir(loc)
print(os.getcwd())

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/2Research/KIXLAB Project (CHI 2023)/restaurant_data


In [3]:
#@title Whether to pre-process the dataset or load the dataset
process_or_load_data = 'load' #@param ['load', 'process']
import pandas as pd
import numpy as np

if process_or_load_data == 'process':
  # Read the four raw inputs in one pass; the order of the names on the left
  # matches the order of the file names on the right.
  air_visit, date_info, submission, air_store_info = (
      pd.read_csv(csv_name)
      for csv_name in (
          'air_visit_data.csv',
          'date_info.csv',
          'sample_submission.csv',
          'air_store_info_with_nearest_active_station.csv',
      )
  )
  print(air_visit.head())

  # (1) Resample the visiting data by days
  def resample_by_visitors(group):
    """Return one store's visitor totals on a gap-free daily calendar.

    `group` is the slice of `air_visit` for a single store, indexed by visit
    date. `min_count=1` makes days that have no source row come back as NaN;
    a plain `sum()` returns 0 for such empty bins, which would make inserted
    days indistinguishable from real observations.
    """
    return group['visitors'].resample('1D').sum(min_count=1)

  air_visit.index = pd.to_datetime(air_visit['visit_date']) # resample() needs a DatetimeIndex
  air_visit = air_visit.groupby('air_store_id').apply(resample_by_visitors).reset_index()
  # Back to 'YYYY-MM-DD' strings so the column can be merged with date_info
  # and concatenated with the submission rows further down.
  air_visit['visit_date'] = air_visit['visit_date'].dt.strftime('%Y-%m-%d')
  # Flag the inserted days BEFORE filling them, so the training split can
  # exclude them later via `was_na`.
  air_visit['was_na'] = air_visit['visitors'].isna()
  # Plain assignment instead of a chained `inplace=True` fillna, which is not
  # guaranteed to write through to the frame under pandas copy-on-write.
  air_visit['visitors'] = air_visit['visitors'].fillna(0)

  # (2) Handle Holidays
  # Align column names with the visit data, then add "day before" / "day
  # after" holiday indicators by shifting the flag one row in each direction.
  date_info = date_info.rename(columns={'calendar_date': 'visit_date', 'holiday_flg': 'is_holiday'})
  holiday_flag = date_info['is_holiday']
  # The first / last row has no neighbour, so the shifted value is filled with 0.
  date_info['prev_day_is_holiday'] = holiday_flag.shift(1).fillna(0)
  date_info['next_day_is_holiday'] = holiday_flag.shift(-1).fillna(0)

  # (3) Create Test Dataset
  # The submission id is split by position: the first 20 characters become the
  # store id and everything from position 21 onwards becomes the visit date.
  submission_ids = submission['id']
  submission = submission.assign(
      air_store_id=submission_ids.str[:20],
      visit_date=submission_ids.str[21:],
      is_test=True,
      visitors=np.nan,
      # Remember the original row order so the test set can be re-sorted later.
      test_number=range(len(submission)),
  )

  # (4) Format the Training set and the Test set Properly
  # Stack the visit rows and the submission rows into one frame so that every
  # feature below is computed identically for both.
  data = pd.concat((air_visit, submission.drop('id', axis='columns')))
  # Rows coming from air_visit have no `is_test` value (NaN after concat).
  # `.eq(True)` maps NaN -> False and yields a real boolean column, which is
  # what the later `data[data['is_test']]` mask needs. The previous chained
  # `fillna(..., inplace=True)` is not guaranteed to modify `data` under
  # pandas copy-on-write.
  data['is_test'] = data['is_test'].eq(True)
  data = pd.merge(left=data, right=date_info, on='visit_date', how='left')
  data['visitors'] = data['visitors'].astype(float)

  # (5) AIR Store data
  # add restaurant info
  data = pd.merge(left=data, right=air_store_info, on='air_store_id', how='left')

  # (6) Preprocessed Weather Data
  import glob
  import os

  weather_dfs = []

  for path in glob.glob('1-1-16_5-31-17_Weather/*.csv'):
      weather_df = pd.read_csv(path)
      # Station id = file name without directory and extension.
      # The previous `path.split('\\')[-1].rstrip('.csv')` was wrong twice:
      # splitting on a backslash does nothing for '/'-separated paths, and
      # rstrip() removes any trailing '.', 'c', 's', 'v' characters rather
      # than the '.csv' suffix, truncating names that end in those letters.
      weather_df['station_id'] = os.path.splitext(os.path.basename(path))[0]
      weather_dfs.append(weather_df)

  weather = pd.concat(weather_dfs, axis='rows')
  weather = weather.rename(columns={'calendar_date': 'visit_date'})

  # Per-day averages over all stations, used to fill a station's missing values.
  means = weather.groupby('visit_date')[['avg_temperature', 'precipitation']].mean().reset_index()
  means = means.rename(columns={'avg_temperature': 'global_avg_temperature', 'precipitation': 'global_precipitation'})
  weather = pd.merge(left=weather, right=means, on='visit_date', how='left')
  # Assign the filled columns back instead of a chained `inplace=True`
  # fillna, which is not guaranteed to write through under copy-on-write.
  weather['avg_temperature'] = weather['avg_temperature'].fillna(weather['global_avg_temperature'])
  weather['precipitation'] = weather['precipitation'].fillna(weather['global_precipitation'])
  # NOTE(review): `weather` is never merged into `data` in this cell, so no
  # weather feature reaches the models - confirm whether that is intended.

  # Parse the visit date and order rows by store, then chronologically.
  data['visit_date'] = pd.to_datetime(data['visit_date'])
  data = data.sort_values(['air_store_id', 'visit_date'])

  # (7) Genre wise restaurants grouping
  # Number of stores per (genre, area) pair.
  air_genres_area = (
      air_store_info[['air_store_id', 'air_genre_name', 'air_area_name']]
      .groupby(['air_genre_name', 'air_area_name'], as_index=False)
      .count()
      .rename(columns={'air_store_id': 'total_stores_of_genre_in_area'})
  )

  # (8) Area wise restaurants grouping
  # Number of stores per area.
  air_area = (
      air_store_info[['air_store_id', 'air_area_name']]
      .groupby(['air_area_name'], as_index=False)
      .count()
      .rename(columns={'air_store_id': 'total_r_in_area'})
  )

  # Attach the per-area store count and two calendar features to every row.
  data = data.merge(air_area, on='air_area_name', how='left')
  weekend_days = ['Saturday', 'Sunday']
  data['is_weekend'] = data['day_of_week'].isin(weekend_days).astype(int)
  data['day_of_month'] = data['visit_date'].dt.day

  # 2-(7) One - Hot Encoding
  categorical_columns = ['day_of_week', 'air_genre_name']
  data = pd.get_dummies(data, columns=categorical_columns)

  # 2-(5) Split Train and Test dataset
  # The models are fitted on log1p(visitors).
  data['visitors_log1p'] = np.log1p(data['visitors'])
  # Training rows: not part of the submission and not flagged as `was_na`.
  is_train_row = (data['is_test'] == False) & (data['was_na'] == False)
  train = data[is_train_row]
  # Test rows, restored to the order of the submission file.
  is_test_row = data['is_test']
  test = data[is_test_row].sort_values('test_number')

  ######## skip to_drop
  # Identifiers, bookkeeping flags, the raw target and station metadata are
  # not model inputs.
  to_drop = ['air_store_id', 'is_test', 'test_number', 'visit_date', 'was_na',
            'visitors', 'air_area_name', 'station_id', 'station_latitude',
            'station_longitude', 'station_vincenty', 'station_great_circle']

  train = train.drop(to_drop, axis='columns')
  train = train.dropna()
  test = test.drop(to_drop, axis='columns')

  X_train = train.drop('visitors_log1p', axis='columns')
  X_test = test.drop('visitors_log1p', axis='columns')
  y_train = train['visitors_log1p']

  # 2-(6) Validation Set
  # getting train & validation indexes
  # `train` keeps the row labels of `data` but no longer has a date column,
  # so the dates are looked up in `data`. (The previous code tried to parse
  # the integer row labels themselves as 'YYYY-MM-DD' dates.)
  visit_dates = data.loc[train.index, 'visit_date']
  split_date = pd.Timestamp('2017-03-30')
  train_index = train.index[(visit_dates <= split_date).to_numpy()]
  val_index = train.index[(visit_dates > split_date).to_numpy()]
  y = train['visitors_log1p'].values

  print(X_train.head())

  # Count test rows without a target. `== np.nan` is always False, so the
  # previous comparison-based count could never find any; isna() is required.
  n_test_rows_without_target = int(test['visitors_log1p'].isna().sum())
  print(f"test rows without a target: {n_test_rows_without_target}")

  # Sanity checks on the assembled matrices.
  assert X_train.isnull().sum().sum() == 0
  assert y_train.isnull().sum() == 0
  assert len(X_train) == len(y_train)
  assert X_test.isnull().sum().sum() == 0
  assert len(X_test) == 32019  # expected number of submission rows

  print(X_train.isnull().sum().sum())
  print(y_train.isnull().sum())
  print()

  # 2-(8) Label encoding categorial features
  from sklearn import preprocessing

  # Work on copies so `train` / `test` themselves stay untouched.
  train_opt, test_opt = train.copy(), test.copy()

  # Every object-dtype column is treated as categorical text.
  text_columns = [col for col in train_opt.columns if train_opt[col].dtype == 'object']

  for col in text_columns:
      # Fit on train + test values together so that both frames share one
      # integer code per category.
      encoder = preprocessing.LabelEncoder()
      train_values = list(train_opt[col].values)
      test_values = list(test_opt[col].values)
      encoder.fit(train_values + test_values)
      train_opt[col] = encoder.transform(train_values)
      test_opt[col] = encoder.transform(test_values)

  # 2-(9) Feature Importance
  # The string-typed coordinate columns are removed because XGBoost rejects
  # them:
  #   ValueError: DataFrame.dtypes for data must be int, float or bool.
  #               Did not expect the data types in fields latitude_str, longitude_str
  to_drop = ['latitude_str', 'longitude_str']

  X_train = X_train.drop(to_drop, axis='columns')
  X_test = X_test.drop(to_drop, axis='columns')

  # 2-(10) Hold-out split
  # A seeded, shuffled 6-fold split; only the first fold is used below.
  # (No recursive feature elimination is performed in this cell.)
  from sklearn import model_selection

  cv = model_selection.KFold(n_splits=6, shuffle=True, random_state=42)
  model_s = list(cv.split(X_train, y_train))

  # Submission frame: one row per test id, predictions are filled in later.
  res = submission['id'].to_frame()
  res['visitors'] = 0

  # validation dataset: positional indices of the first fold
  fit_rows, val_rows = model_s[0]
  X_fit = X_train.iloc[fit_rows]
  y_fit = y_train.iloc[fit_rows]
  X_val = X_train.iloc[val_rows]
  y_val = y_train.iloc[val_rows]
  # NOTE(review): X_val / y_val are a subset of X_train / y_train, and the
  # full X_train is what is written to X_train.csv. Any model fitted on
  # X_train and evaluated on X_val has already seen the validation rows.
  # Fit on X_fit / y_fit if an unbiased validation score is needed.

  # X_train, X_test and y_train are already final at this point; the earlier
  # second, identical re-derivation from `train` / `test` was redundant and
  # has been removed.
  print(f"X_train.shape: {X_train.shape}")
  print(f"y_train.shape: {y_train.shape}")
  print(f"X_val.shape: {X_val.shape}")
  print(f"y_val.shape: {y_val.shape}")
  print(f"X_test.shape: {X_test.shape}")

  import xgboost as xgb

  # model: default-parameter regressor, fitted only to rank the features
  model = xgb.XGBRegressor()
  model.fit(X=X_train, y = y_train)

  features = X_train.columns
  importances = model.feature_importances_
  # argsort is ascending, so the last 20 positions are the 20 largest scores.
  indices = (np.argsort(importances))[-20:]

  import matplotlib.pyplot as plt

  # Create the figure first and draw on its axes. Previously the bars were
  # drawn once before plt.figure(...), which left an extra, unlabeled figure.
  importance_fig, importance_ax = plt.subplots(figsize=(15, 12))
  importance_ax.barh(range(len(indices)), importances[indices], color='r', align='center')
  importance_ax.set_yticks(range(len(indices)))
  importance_ax.set_yticklabels([features[i] for i in indices])
  importance_ax.set_title('Feature Importances (Top 20 features)')
  importance_ax.set_xlabel('Relative Importance')
  plt.show()

  # Cache the processed matrices so later sessions can use the 'load' path.
  X_train.to_csv('X_train.csv',index=False)
  y_train.to_csv('y_train.csv',index=False)
  X_val.to_csv('X_val.csv',index=False)
  y_val.to_csv('y_val.csv',index=False)
  X_test.to_csv('X_test.csv',index=False)

  # The 'load' path reads the targets back with pd.read_csv, i.e. as
  # single-column DataFrames of shape (n, 1). Convert the Series here so both
  # paths hand the same shapes to the cells below; an (n,) target against the
  # network's (n, 1) output would be broadcast inside the loss.
  if isinstance(y_train, pd.Series):
    y_train = y_train.to_frame()
  if isinstance(y_val, pd.Series):
    y_val = y_val.to_frame()
else:
  # 'load' path: skip all preprocessing and read the CSV files by the same
  # names that the 'process' branch writes with to_csv.
  # pd.read_csv returns DataFrames, so y_train / y_val are single-column
  # frames here.
  X_train = pd.read_csv('X_train.csv')
  y_train = pd.read_csv('y_train.csv')
  X_val = pd.read_csv('X_val.csv')
  y_val = pd.read_csv('y_val.csv')
  X_test = pd.read_csv('X_test.csv')

# # Drop Latitude and Longitude column
# to_drop = ['latitude', 'longitude']
# X_train = X_train.drop(to_drop, axis='columns')
# X_val = X_val.drop(to_drop, axis='columns')
# X_test = X_test.drop(to_drop, axis='columns')

In [4]:
#@title Print data info
# Collect the report lines first, then emit them in a single write.
summary_lines = [
    f"X_train.columns: {X_train.columns}",
    f"X_train.shape: {X_train.shape}",
    f"y_train.shape: {y_train.shape}",
    f"X_val.shape: {X_val.shape}",
    f"y_val.shape: {y_val.shape}",
    f"X_test.shape: {X_test.shape}",
]
print("\n".join(summary_lines))

X_train.columns: Index(['is_holiday', 'prev_day_is_holiday', 'next_day_is_holiday', 'latitude',
       'longitude', 'total_r_in_area', 'is_weekend', 'day_of_month',
       'day_of_week_Friday', 'day_of_week_Monday', 'day_of_week_Saturday',
       'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday',
       'day_of_week_Wednesday', 'air_genre_name_Asian',
       'air_genre_name_Bar/Cocktail', 'air_genre_name_Cafe/Sweets',
       'air_genre_name_Creative cuisine', 'air_genre_name_Dining bar',
       'air_genre_name_International cuisine', 'air_genre_name_Italian/French',
       'air_genre_name_Izakaya', 'air_genre_name_Japanese food',
       'air_genre_name_Karaoke/Party',
       'air_genre_name_Okonomiyaki/Monja/Teppanyaki', 'air_genre_name_Other',
       'air_genre_name_Western food', 'air_genre_name_Yakiniku/Korean food'],
      dtype='object')
X_train.shape: (296279, 29)
y_train.shape: (296279, 1)
X_val.shape: (49380, 29)
y_val.shape: (49380, 1)
X_test.shape: (32019, 29)

In [None]:
#@title Train XGBoost Model to have feature importance
import xgboost as xgb

# Default-parameter regressor; the next cell reads its
# `feature_importances_` to rank the input columns.
model = xgb.XGBRegressor()
model.fit(X_train, y_train)



XGBRegressor()

In [None]:
#@title Print the feature importance plot
import plotly.express as px

features = X_train.columns
importances = model.feature_importances_
num_features = 29#@param
# Validate against the real number of columns instead of a hard-coded 30.
# Zero is rejected as well, because the slice [-0:] would select every
# feature rather than none.
assert 0 < num_features <= len(features), (
    f"num_features has to be an integer between 1 and {len(features)}"
)
# argsort is ascending, so the tail holds the most important features.
indices = (np.argsort(importances))[-num_features:]

# Use a dedicated name: this cell used to overwrite `data`, the
# preprocessing frame built by the 'process' branch.
importance_df = pd.DataFrame({
    'imp': importances[indices],
    'feature': features[indices].values.tolist(),
})
fig = px.bar(importance_df, x='imp', y='feature')

fig.update_layout(title_text="Feature importance", height=700, width=1000)
fig.show()

In [None]:
#@title Model Training
import lightgbm as lgbm

model = lgbm.LGBMRegressor(
    objective='regression',
    max_depth=5,
    num_leaves=5 ** 2 - 1,
    learning_rate=0.2, # adjusted: 0.002 -> 0.2
    n_estimators=300, # adjusted: 300000 -> 300
    min_child_samples=80,
    subsample=0.8,
    colsample_bytree=1,
    reg_alpha=0,
    reg_lambda=0,
    # NOTE(review): a new random seed is drawn on every run, so results are
    # not reproducible between runs; use a fixed integer if that matters.
    random_state=np.random.randint(10e6)
)

# Feature names come from X_train itself. `X_fit` only exists after the
# 'process' branch has run, so referencing it raised a NameError in the
# default 'load' path; both frames have the same columns.
# NOTE(review): in the 'process' branch X_val is sliced out of X_train, so
# if the cached CSVs were produced that way the 'val' metric and early
# stopping are measured on rows the model was trained on.
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
          eval_names=('train', 'val'), eval_metric='l2', early_stopping_rounds=200,
          feature_name=X_train.columns.tolist(), verbose=False
)

LGBMRegressor(colsample_bytree=1, learning_rate=0.2, max_depth=5,
              min_child_samples=80, n_estimators=300, num_leaves=24,
              objective='regression', random_state=6120716, reg_alpha=0,
              reg_lambda=0, subsample=0.8)

In [None]:
#@title Run Prediction
# 'l2' is the metric name passed as eval_metric when fitting (mean squared
# error). The targets are log1p(visitors), so its square root is the RMSLE.
score = np.sqrt(model.best_score_['val']['l2'])

# Build the submission frame here: `res` is otherwise only created inside the
# 'process' branch, so this cell failed with a NameError in the 'load' path.
# X_test rows follow the order of the sample submission file.
res = pd.read_csv('sample_submission.csv')[['id']]
# The model predicts log1p(visitors); expm1 converts back to visitor counts.
res['visitors'] = np.expm1(model.predict(X_test, num_iteration=model.best_iteration_))
feature_importance = model.feature_importances_
print("RMSLE: ", score) # root mean squared logarithmic error

In [None]:
#@title Print a barplot of frequency by days
import plotly.express as px  # used below; previously only imported by an earlier cell
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Number of training rows per weekday, read from the one-hot columns
# 'day_of_week_<name>' (a row belongs to a day when its column equals 1).
data2 = pd.DataFrame({
    'days': day_names,
    'freq': [int((X_train[f'day_of_week_{day}'] == 1).sum()) for day in day_names],
})

fig2 = px.bar(data2, x='days', y='freq')

fig2.update_layout(title_text="Freq by Day", height=500, width=500)
fig2.show()

In [7]:
#@title Data preparation (DataFrame --> Torch)
# Imported here because this cell is the first one to use torch; the import
# in the following cell comes too late for a top-to-bottom run.
import numpy as np
import torch

# Features: cast explicitly to float32 so that boolean one-hot columns and
# float columns end up in one numeric array before the tensor is built.
X_train_torch = torch.tensor(X_train.to_numpy(dtype=np.float32)).float()
X_val_torch = torch.tensor(X_val.to_numpy(dtype=np.float32)).float()
X_test_torch = torch.tensor(X_test.to_numpy(dtype=np.float32)).float()

# Targets: force shape (n, 1) whether y_* is a Series or a single-column
# DataFrame, so it matches a (n, 1) model output element-wise in the loss.
y_train_torch = torch.tensor(np.asarray(y_train, dtype=np.float32).reshape(-1, 1)).float()
y_val_torch = torch.tensor(np.asarray(y_val, dtype=np.float32).reshape(-1, 1)).float()

In [23]:
#@title Train a Neural Network model

# reference
# https://janakiev.com/blog/pytorch-iris/
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.autograd import Variable
import tqdm

# Number of passes of the training loop below; each pass feeds the whole
# training tensor through the model once.
EPOCHS  = 100 #@param EPOCHS
# Step size handed to the Adam optimizer. NOTE(review): the name is
# misspelled ("learing"); it is left as is because the optimizer line below
# refers to it by this name.
learing_rate = 1e-3 #@param learning_rate

class Model(nn.Module):
    """Fully connected regressor: input_dim -> 20 -> 20 -> 1 with ReLU.

    ``forward`` returns a pair ``(prediction, hidden)``, where ``hidden`` is
    the activation of the second hidden layer (20 values per input row).
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_units = 20
        self.layer1 = nn.Linear(input_dim, hidden_units)
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.relu2 = nn.ReLU()
        self.layer3 = nn.Linear(hidden_units, 1)

    def forward(self, x):
        # Run the two (linear, ReLU) stages in order, keeping the last
        # activation so it can be returned next to the prediction.
        activations = x
        stages = ((self.layer1, self.relu1), (self.layer2, self.relu2))
        for linear, activation in stages:
            activations = activation(linear(activations))
        return self.layer3(activations), activations

model = Model(X_train.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=learing_rate)
loss_fn = nn.MSELoss()

# One loss value per epoch: training set and validation set.
training_loss_list = np.zeros(EPOCHS)
test_loss_list = np.zeros(EPOCHS)

for epoch in tqdm.trange(EPOCHS):
    # Training step on the full training tensor (no mini-batches).
    optimizer.zero_grad()
    train_pred, train_hidden = model(X_train_torch)
    train_loss = loss_fn(train_pred, y_train_torch)
    training_loss_list[epoch] = train_loss.item()
    train_loss.backward()
    optimizer.step()

    # Validation pass without gradient tracking. `hidden` keeps the
    # validation-set activations of the final epoch for the projection cell.
    with torch.no_grad():
        y_pred, hidden = model(X_val_torch)
        loss = loss_fn(y_pred, y_val_torch)
        test_loss_list[epoch] = loss.item()

100%|██████████| 100/100 [00:14<00:00,  6.92it/s]


In [40]:
#@title Training loss chart
import plotly.express as px

# One row per epoch: the recorded training loss and its epoch number.
df = pd.DataFrame({
    'training_loss_list': training_loss_list,
    'ix': np.arange(len(training_loss_list)),
})
fig = px.line(
    df,
    x="ix",
    y="training_loss_list",
    title='Training loss chart',
    height=500,
    width=500,
)
fig.show()

In [38]:
#@title 2D projection

import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import plotly.io as pio
import plotly.express as px
import plotly.offline as py

sample_num = 300 #@param {type:"integer"}
method = 'tSNE' #@param ['PCA', 'tSNE']

# `hidden` is the torch tensor left behind by the training cell. Convert the
# first `sample_num` rows to a NumPy array explicitly instead of relying on
# scikit-learn to coerce a tensor.
hidden_sample = hidden[:sample_num].detach().cpu().numpy()

if method == 'tSNE':
  # t-SNE requires perplexity < number of samples; cap the default of 30 so
  # that small samples still work. A fixed seed makes the layout repeatable.
  tsne_perplexity = min(30.0, len(hidden_sample) - 1)
  embed = TSNE(n_components=2, learning_rate='auto', perplexity=tsne_perplexity,
               random_state=42).fit_transform(hidden_sample)
else:
  embed = PCA(n_components=2).fit_transform(hidden_sample)
print(f"hidden.shape: {hidden.shape}, embed.shape: {embed.shape}")

# One point per sampled row, positioned by its two embedding coordinates.
df = pd.DataFrame({'x': embed[:, 0], 'y': embed[:, 1]})

fig = px.scatter(df, x="x", y="y", title="2D Projection chart", height=500, width=500)
fig


hidden.shape: torch.Size([49380, 20]), embed.shape: (300, 2)
