Standard Imports

In [1]:
!pip install sklearn-pandas

[31mtensorflow 1.3.0 requires tensorflow-tensorboard<0.2.0,>=0.1.0, which is not installed.[0m


In [2]:
import datetime
import gc
import types
import random
import re
import sys
import pkg_resources

import types
from botocore.client import Config
import ibm_boto3

import matplotlib.pyplot as plt # data visualization
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper, gen_features, FunctionTransformer

%matplotlib inline

In [3]:
# show versions of packages
# adapted from https://stackoverflow.com/questions/40428931/package-for-listing-version-of-packages-used-in-a-jupyter-notebook

def get_imports():
    """Yield the lower-cased root package name behind every imported global.

    Walks the module-level namespace and, for each value that is a module or
    a class, yields the first dotted component of the module it comes from
    (``sklearn.linear_model`` -> ``sklearn``). Known import names that differ
    from their distribution name are translated (``sklearn`` ->
    ``scikit-learn``).

    Globals that are neither modules nor classes (data frames, functions,
    plain values, ...) are skipped, so a variable that merely happens to be
    named like a package is not reported as an import.

    Yields:
    -------
    str: lower-cased package name; the same name may be yielded several times.
    """
    # Some packages are weird and have different
    # imported names vs. system/pip names. Unfortunately,
    # there is no systematic way to get pip names from
    # a package's imported name. You'll have to add
    # exceptions to this list manually!
    poorly_named_packages = {
        "sklearn": "scikit-learn"
    }
    # Iterate over a snapshot: a caller that binds a new global while this
    # generator is suspended would otherwise change the dict mid-iteration.
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package,
            # not just imported function
            name = val.__name__.split(".")[0]
        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
        else:
            # Not an import at all
            continue
        yield poorly_named_packages.get(name, name).lower()
imports = list(set(get_imports()))

# There is no direct lookup from an imported name to its installed version,
# so the imported names are cross-checked against three sources in turn:
# interpreter built-ins, installed distributions, and finally sys.modules.
modules = []

# 1) Modules compiled into the interpreter.
for builtin_name in sys.builtin_module_names:
    lowered = builtin_name.lower()
    if builtin_name != 'builtins' and lowered in imports:
        modules.append((builtin_name, 'Python BuiltIn'))
        imports.remove(lowered)

# 2) Installed distributions, which carry a version number.
for dist in pkg_resources.working_set:
    project = dist.project_name
    if project != "pip" and project.lower() in imports:
        modules.append((project, dist.version))
        imports.remove(project.lower())

# 3) Anything still unmatched that is loaded gets reported as 'unknown'.
modules.extend(
    (loaded_name, 'unknown')
    for loaded_name in sys.modules
    if loaded_name != 'builtins' and loaded_name.lower() in imports
)


for package, version in modules:
    print("{}=={}".format(package, version))

gc==Python BuiltIn
sys==Python BuiltIn
scikit-learn==0.19.1
pandas==0.21.0
numpy==1.13.3
matplotlib==2.1.0
botocore==1.7.20
types==unknown
random==unknown
ibm_boto3==unknown
pickle==unknown
re==unknown
pkg_resources==unknown
datetime==unknown
sklearn_pandas==unknown


Utility Functions

In [4]:
def preprocess(df):
    """Clean the raw reservations frame and add derived features, in place.

    Steps, in order:
      1. Missing ``season_holidayed_code`` / ``state_code_residence`` become -1.
      2. ``checkout_date``, ``checkin_date`` and ``booking_date`` are parsed
         from ``dd/mm/yy`` strings.
      3. ``booking_in_advance`` (check-in minus booking) and ``days_stayed``
         (check-out minus check-in) are added, both in whole days.
      4. Each date column is expanded with ``add_datepart`` and then dropped.
      5. ``n_people`` = ``numberofadults`` + ``numberofchildren``.

    Parameters:
    -----------
    df: A pandas data frame holding the columns named above.

    Returns:
    --------
    The same data frame object that was passed in (it is modified in place).
    Because the date columns are dropped, calling this twice on the same
    frame raises a KeyError.
    """
    date_cols = ['checkout_date', 'checkin_date', 'booking_date']

    # Fill N/A values. Assigning the column back (instead of calling
    # fillna(inplace=True) on an attribute lookup) guarantees the frame itself
    # is updated, also when pandas copy-on-write is active.
    for col in ['season_holidayed_code', 'state_code_residence']:
        df[col] = df[col].fillna(-1)

    # Parse dates. Vectorised parsing replaces the row-by-row
    # pd.datetime.strptime call (pd.datetime no longer exists in pandas >= 2.0).
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], format='%d/%m/%y')

    # Add datetime features
    df['booking_in_advance'] = (df['checkin_date'] - df['booking_date']).dt.days
    df['days_stayed'] = (df['checkout_date'] - df['checkin_date']).dt.days

    # Expand every date column into its calendar parts; the raw column is dropped
    for col in date_cols:
        add_datepart(df, col, drop=True)

    # Add other features
    df['n_people'] = df['numberofadults'] + df['numberofchildren']

    return df

In [5]:
def add_datepart(df, fldname, drop=True, time=False, errors="raise"):
    """add_datepart converts a column of df from a datetime64 to many columns containing
    the information from the date. This applies changes inplace.
    Parameters:
    -----------
    df: A pandas data frame. df gain several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
    drop: If true then the original date column will be removed.
    time: If true time features: Hour, Minute, Second will be added.
    errors: Passed to pd.to_datetime when the column has to be converted.
    Returns:
    --------
    None. The new columns are named by stripping a trailing 'date'/'Date' from
    fldname and appending the part name (e.g. 'checkin_date' -> 'checkin_Year').
    Where the date is missing (NaT) the 'Elapsed' column holds NaN.
    Examples:
    ---------
    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']) })
    >>> df
        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13
    >>> add_datepart(df, 'A')
    >>> df
        AYear AMonth AWeek ADay ADayofweek ADayofyear AIs_month_end AIs_month_start AIs_quarter_end AIs_quarter_start AIs_year_end AIs_year_start AElapsed
    0   2000  3      10    11   5          71         False         False           False           False             False        False          952732800
    1   2000  3      10    12   6          72         False         False           False           False             False        False          952819200
    2   2000  3      11    13   0          73         False         False           False           False             False        False          952905600
    """
    fld = df[fldname]
    # Public dtype check; covers both naive and timezone-aware datetime columns.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld, errors=errors)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week is gone in pandas >= 2.0; isocalendar() is its
            # replacement. Cast so the column has the same dtype as the
            # other integer parts instead of the nullable UInt32.
            df[targ_pre + n] = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            df[targ_pre + n] = getattr(fld.dt, n.lower())
    # Seconds since the Unix epoch. Normalising to nanoseconds first keeps the
    # division correct whatever resolution the column is stored in.
    missing = fld.isna().values
    elapsed = fld.values.astype('datetime64[ns]').astype(np.int64) // 10 ** 9
    if missing.any():
        # NaT has no meaningful integer value; expose it as NaN instead.
        elapsed = np.where(missing, np.nan, elapsed)
    df[targ_pre + 'Elapsed'] = elapsed
    if drop:
        df.drop(fldname, axis=1, inplace=True)

In [6]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a fixed list of DataFrame columns.

    The step is stateless: ``fit`` learns nothing and ``transform`` returns
    ``X[columns]``.
    """

    def __init__(self, columns):
        # Column labels to keep, in the order they should come out.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to fit; returns the selector itself."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X``.

        Raises a KeyError naming the configured columns that ``X`` lacks.
        """
        assert isinstance(X, pd.DataFrame)

        try:
            selected = X[self.columns]
        except KeyError:
            missing = list(set(self.columns).difference(set(X.columns)))
            raise KeyError("The DataFrame does not include the columns: %s" % missing)
        return selected

Access to IBM Cloud Object Storage

In [7]:
# The code was removed by Watson Studio for sharing.

Load Data

In [8]:
%%time
# Names of the raw date columns; used further down when the features are
# grouped by type.
date_feats = ['checkout_date', 'checkin_date', 'booking_date']

# Load the raw CSV into a DataFrame.
# NOTE(review): `body` is not defined in any visible cell - presumably it is
# created by the object-storage cell whose code was removed; confirm.
df = pd.read_csv(body)

CPU times: user 2.08 s, sys: 476 ms, total: 2.56 s
Wall time: 3.41 s


Pre-process the data

In [9]:
%%time
# preprocess() modifies `df` in place and drops the raw date columns, so this
# cell cannot be re-run without first re-loading `df`.
df = preprocess(df)

CPU times: user 15.8 s, sys: 532 ms, total: 16.3 s
Wall time: 17.1 s


Define the data types.

In [10]:
# Categorical features: every object-typed column plus the integer-coded
# columns listed here, minus the date and identifier columns.
coded_category_cols = [
    'room_type_booked_code', 'booking_type_code', 'state_code_resort', 'channel_code',
    'main_product_code', 'persontravellingid', 'resort_region_code', 'resort_type_code']
not_categorical = date_feats + ['memberid', 'reservation_id']
category_feats = [
    name
    for name in df.columns[df.dtypes == 'object'].tolist() + coded_category_cols
    if name not in not_categorical
]

# Binary features: columns with exactly two distinct values.
# NOTE(review): a column may end up in both category_feats and bin_feats;
# confirm that is intended, since both lists are encoded downstream.
bin_feats = [name for name in df.columns if len(df[name].unique()) == 2]

dtypes_dict = {
    "date_feats": date_feats,
    "category_feats": category_feats,
    "bin_feats": bin_feats}

# Numeric features: whatever remains once the groups above, the identifiers
# and the target column are taken out.
already_assigned = sum(
    dtypes_dict.values(),
    ['reservation_id', 'memberid', 'amount_spent_per_room_night_scaled'])
dtypes_dict['num_feats'] = [name for name in df.columns if name not in already_assigned]

Create and fit sklearn pipeline

In [11]:
# Categorical and binary columns each get their own LabelBinarizer.
label_binarized_cols = dtypes_dict['category_feats'] + dtypes_dict['bin_feats']
category_feats = gen_features(columns=label_binarized_cols, classes=[LabelBinarizer])

# Numeric columns each get their own StandardScaler; every name is wrapped in
# a one-element list so the column is selected as a 2-D block.
scaled_cols = [[name] for name in dtypes_dict['num_feats']]
num_feats = gen_features(columns=scaled_cols, classes=[StandardScaler])

mapper = DataFrameMapper(category_feats + num_feats, df_out=True, default=None)

# Everything except the identifiers and the target is fed to the model.
non_feature_cols = ['memberid', 'reservation_id', 'amount_spent_per_room_night_scaled']
columns = [c for c in df.columns if c not in non_feature_cols]
cs = ColumnSelector(columns=columns)

In [12]:
# Select the feature columns, encode/scale them, then fit a ridge regression.
pipeline_steps = [
    ('column_dropper', cs),
    ('mapper_transformer', mapper),
    ('ridge_model', Ridge(alpha=1, random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [13]:
%%time
# Fit on every column except the target; the target column is the label.
target_col = 'amount_spent_per_room_night_scaled'
pipeline.fit(df.drop([target_col], axis=1), df[target_col])



CPU times: user 28.5 s, sys: 35.6 s, total: 1min 4s
Wall time: 1min 5s


Pipeline(memory=None,
     steps=[('column_dropper', ColumnSelector(columns=['channel_code', 'main_product_code', 'numberofadults', 'numberofchildren', 'persontravellingid', 'resort_region_code', 'resort_type_code', 'room_type_booked_code', 'roomnights', 'season_holidayed_code', 'state_code_residence', 'state_code_resort', 't... fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='auto', tol=0.001))])

Save pipeline to pickle

In [14]:
# Serialise the fitted pipeline locally, then push the file to the bucket
# under the same name.
pipeline_file = 'pipeline.pickle'
with open(pipeline_file, 'wb') as out_file:
    pickle.dump(pipeline, out_file, protocol=pickle.HIGHEST_PROTOCOL)

cos.upload_file(Filename=pipeline_file, Bucket=credentials['BUCKET'], Key=pipeline_file)