## Model Definition Notebook

### Imports

In [74]:
import random
import types
import re
import gc

from botocore.client import Config
import ibm_boto3
import numpy as np
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import pickle
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

### Load and read data

In [75]:
# The code was removed by Watson Studio for sharing.

In [76]:
def download_file_cos(credentials, local_file_name, key):
    """Download a single object from IBM Cloud Object Storage to a local file.

    The download is best-effort: a failure is reported on stdout and the
    function returns normally, so the caller has to check that the local file
    exists before relying on it.

    Parameters
    ----------
    credentials : dict
        Connection settings. The keys 'IBM_API_KEY_ID', 'IAM_SERVICE_ID',
        'IBM_AUTH_ENDPOINT', 'ENDPOINT' and 'BUCKET' are read.
    local_file_name : str
        Path of the local file the object is written to.
    key : str
        Key of the object inside the bucket.

    Returns
    -------
    None
    """
    cos = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=credentials['IBM_API_KEY_ID'],
        ibm_service_instance_id=credentials['IAM_SERVICE_ID'],
        ibm_auth_endpoint=credentials['IBM_AUTH_ENDPOINT'],
        config=Config(signature_version='oauth'),
        endpoint_url=credentials['ENDPOINT'],
    )
    try:
        cos.download_file(Bucket=credentials['BUCKET'], Key=key, Filename=local_file_name)
    except Exception as e:
        # Report the concrete error type (not the base `Exception` class) and
        # which object failed, so that a broken download is easy to diagnose.
        print('Download of {!r} failed: {}: {}'.format(key, type(e).__name__, e))
    else:
        print('File Downloaded')

In [77]:
# Fetch both artefacts from Cloud Object Storage.
# Each pair is (object key in the bucket, local file name); note that the
# processed frame is stored locally under the "raw" file name.
for cos_key, local_name in (
    ('df_processed_pickle.pickle', 'df_raw_pickle.pickle'),
    ('dtypes_dict.pickle', 'dtypes_dict.pickle'),
):
    download_file_cos(credentials=credentials, local_file_name=local_name, key=cos_key)

File Downloaded
File Downloaded


In [78]:
# Load the two downloaded artefacts into memory.
# NOTE: unpickling executes arbitrary code, so only load files from a trusted bucket.

# dictionary describing the feature dtypes (its 'category_feats' entry is used later)
with open('dtypes_dict.pickle', 'rb') as dtypes_file:
    dtypes_dict = pickle.load(dtypes_file)

# the data set itself, as a pandas object
df = pd.read_pickle('df_raw_pickle.pickle')

In [79]:
# preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,reservation_id,channel_code,main_product_code,numberofadults,numberofchildren,persontravellingid,resort_region_code,resort_type_code,room_type_booked_code,roomnights,...,booking_Dayofweek,booking_Dayofyear,booking_Is_month_end,booking_Is_month_start,booking_Is_quarter_end,booking_Is_quarter_start,booking_Is_year_end,booking_Is_year_start,booking_Elapsed,n_people
0,9785,2,0,-0.726263,-0.478479,1,2,3,2,-1.104595,...,0.13369,-0.769077,0,0,0,0,0,0,1.069282,-0.87119
1,4694,0,0,-0.726263,-0.478479,1,2,3,3,0.508926,...,0.65367,-1.436825,0,0,0,0,0,0,-1.6332,-0.87119
2,278743,0,0,-0.726263,-0.478479,2,0,5,3,0.105546,...,-0.38629,-1.390453,0,0,0,0,0,0,-1.621631,-0.87119
3,276853,0,0,-0.726263,2.159981,1,1,2,2,0.508926,...,1.173649,-0.518672,0,0,0,0,0,0,-1.404137,0.192497
4,196536,0,0,-0.726263,-0.478479,1,1,2,3,0.508926,...,-0.38629,0.622064,0,0,0,0,0,0,-1.119543,-0.87119


### Validation subset of data

We will set aside 20% of the data to evaluate the model's performance later.
The train/valid split is performed on the `memberid` variable to prevent data leakage (we want to be able to predict the amount spent by new hotel members).

In [80]:
# Permute the rows; the fixed seed makes the order repeatable
random.seed(42)
n_rows = len(df)
shuffled_positions = random.sample(range(n_rows), n_rows)
df = df.iloc[shuffled_positions, :]

# each row is grouped by the member it belongs to
groups = df["memberid"].values

# one grouped split that reserves 20% for validation
group_shuffle_split = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
split_iter = group_shuffle_split.split(df, groups=groups)
train_idx, valid_idx = next(split_iter)

# select both subsets by row position
train_df, valid_df = df.iloc[train_idx, :], df.iloc[valid_idx, :]

# display the dimensions of the two subsets
train_df.shape, valid_df.shape

((272546, 63), (68878, 63))

In [81]:
# The full frame has already been split, so drop the reference to it ...
del df

# ... and store the validation subset on disk for the later evaluation step
valid_df.to_pickle("valid_df.pickle")

# ask the garbage collector to reclaim the released memory
gc.collect()

3159

### Train / Test Split

Now we split `train_df` once more, into train and test subsets, for model training and hyperparameter tuning.

In [82]:
# get groups: the member id of every remaining training row
groups = train_df["memberid"].values

# get indices for train and test data (grouped split, 20% goes to test)
group_shuffle_split = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

train_idx, test_idx = next(group_shuffle_split.split(train_df, groups=groups))

# the target and the two identifier columns are not used as model inputs
non_feature_cols = ['amount_spent_per_room_night_scaled', 'memberid', 'reservation_id']

# split data
train = train_df.iloc[train_idx, :]
test = train_df.iloc[test_idx, :]
y_train = train['amount_spent_per_room_night_scaled']
y_test = test['amount_spent_per_room_night_scaled']

# drop without `inplace` so the slices taken from `train_df` are never mutated
train = train.drop(non_feature_cols, axis=1)
test = test.drop(non_feature_cols, axis=1)

# the identifier columns are gone, so they must not be listed as categorical
# features any more; the membership check keeps this cell re-runnable
# (an unconditional `.remove` raises ValueError on the second run)
for id_col in ('memberid', 'reservation_id'):
    if id_col in dtypes_dict['category_feats']:
        dtypes_dict['category_feats'].remove(id_col)


# print the dimensions
train.shape, test.shape

((217837, 60), (54709, 60))

### Baseline Modeling

First, we will build two baseline models using two of the most popular machine learning algorithms: Linear Regression (here in its Ridge-regularized form) and Random Forest.

The chosen metric for model quality evaluation is Root Mean Squared Error (RMSE).

In [83]:
%%time
# define the model
lr = Ridge(random_state=42)
# fit the model to the train set
lr.fit(train, y_train)
# predict one the test set
lr_prediction = lr.predict(test)

print("Linear Regression RMSE Score: {:.4f}".format(np.sqrt(mean_squared_error(y_test, lr_prediction))))

Linear Regression RMSE Score: 0.9296
CPU times: user 3.14 s, sys: 8.3 s, total: 11.4 s
Wall time: 11.3 s


In [84]:
%%time
# define the model
rf = RandomForestRegressor(n_estimators=10, random_state=42)
# fit the model to the train set
rf.fit(train, y_train)
# predict one the test set
rf_prediction = rf.predict(test)

print("Random Forest RMSE Score: {:.4f}".format(np.sqrt(mean_squared_error(y_test, rf_prediction))))

Random Forest RMSE Score: 0.9654
CPU times: user 1min 3s, sys: 0 ns, total: 1min 3s
Wall time: 1min 4s


Let's now revisit Linear Regression, this time with **One Hot Encoding (OHE)** applied to the categorical variables.

In [85]:
# Column position, within `train`, of every categorical feature
cat_columns_idx = list(map(train.columns.get_loc, dtypes_dict['category_feats']))

In [86]:
# columns that are not model inputs: the target and the two identifiers
ohe_drop_cols = ['amount_spent_per_room_night_scaled', 'memberid', 'reservation_id']

# One-hot encode only the categorical positions and return a dense array.
# NOTE(review): `categorical_features` is deprecated in newer scikit-learn
# releases (ColumnTransformer is the replacement) - confirm the pinned version.
ohe = OneHotEncoder(
    categorical_features=cat_columns_idx,
    sparse=False,
    handle_unknown="ignore",
)

# Fit the encoder on `train_df` and reuse the fitted encoder for the
# validation frame. From here on `train` is an encoded array, no longer the
# DataFrame of the earlier train split.
train = ohe.fit_transform(train_df.drop(ohe_drop_cols, axis=1))
valid = ohe.transform(valid_df.drop(ohe_drop_cols, axis=1))

print(train.shape)

(272546, 143)


In [87]:
%%time
gkf = list(GroupKFold(n_splits=5).split(train_df, groups=groups))

model = RidgeCV(cv=gkf, scoring="neg_mean_squared_error")

model.fit(train, train_df['amount_spent_per_room_night_scaled'])

CPU times: user 50 s, sys: 21.4 s, total: 1min 11s
Wall time: 1min 8s


In [88]:
# RMSE of the tuned model on the validation set
valid_target = valid_df['amount_spent_per_room_night_scaled']
valid_prediction = model.predict(valid)
valid_score = np.sqrt(mean_squared_error(valid_target, valid_prediction))

In [91]:
# report the selected regularisation strength together with the validation RMSE
summary_template = "Best alpha: {} \nRidge RMSE Validation Score: {:.4f}"
print(summary_template.format(model.alpha_, valid_score))

Best alpha: 1.0 
Ridge RMSE Validation Score: 0.9157
