In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
import category_encoders as ce

In [3]:
df = pd.read_csv(r'https://raw.githubusercontent.com/birdDogKep/dat-11-15/main/ClassMaterial/Unit1/data/master.csv', parse_dates = ['visit_date'])

In [4]:
df.select_dtypes(include = np.object).columns.tolist()

['id', 'day_of_week', 'genre', 'area']

In [22]:
df['yesterday'] = df.groupby('id').apply(lambda x: x['visitors'].shift()).values
df['last_week'] = df.groupby('id').apply(lambda x: x['visitors'].shift(7)).values

# fill in missing reservations
df['reserve_visitors'] = df['reserve_visitors'].fillna(0)

# drop missing values from shifts
df = df.dropna()

In [23]:
X = df[['id', 'yesterday', 'day_of_week']]
y = df['visitors']

In [24]:
# let's assume these were our optimized parameters
tree = DecisionTreeRegressor(max_depth = 7, max_features = 0.8, min_samples_leaf = 10)

pipe = make_pipeline(ce.TargetEncoder(), tree)

In [25]:
# fit the tree, and export it
pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week'], drop_invariant=False,
                               handle_missing='value', handle_unknown='value',
                               min_samples_leaf=1, return_df=True,
                               smoothing=1.0, verbose=0)),
                ('decisiontreeregressor',
                 DecisionTreeRegressor(criterion='mse', max_depth=7,
                                       max_features=0.8, max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=10, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       presort=False, random_state=None,
                                       splitter='best'))],
         verbose=False)

In [26]:
# the pickle module allows you to export saved models
import pickle

# rb -- WRITE the file in BYTES
with open('pipe.pkl', 'wb') as export:
    # this creates an external version of the file that we can now import later on
    pickle.dump(pipe, export)

In [27]:
# we can now import this, and re-use it on new data:  very handy

with open('pipe.pkl', 'rb') as import_:
    # this will import the pickled object again
    pipe2 = pickle.load(import_)

In [28]:
# here it is
pipe2

Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week'], drop_invariant=False,
                               handle_missing='value', handle_unknown='value',
                               min_samples_leaf=1, return_df=True,
                               smoothing=1.0, verbose=0)),
                ('decisiontreeregressor',
                 DecisionTreeRegressor(criterion='mse', max_depth=7,
                                       max_features=0.8, max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=10, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       presort=False, random_state=None,
                                       splitter='best'))],
         verbose=False)

In [29]:
# and we can use it to make new predictions
pipe2.predict(X)

array([26.23166727, 31.46089744, 21.40311066, ..., 51.37125749,
       40.35158151, 51.37125749])

In [30]:
id_ = 'dkjtdhaggslkj5838fk4dke'
day_of_week = 'Sunday'
yesterday = 33

In [31]:
sample = {
    'id': id_,
    'yesterday': yesterday,
    'day_of_week': day_of_week
}

sample = pd.DataFrame(sample, index = [0])

In [33]:
pipe2.predict(sample)[0]

23.545389733840302