In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import category_encoders as ce
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
df = pd.read_csv('/Users/harleyhoffmann/dat-02-22/ClassMaterial/Unit2/data/master.csv', parse_dates=['visit_date'])


In [11]:
df.sort_values(by=['id', 'visit_date'], ascending=True, inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252108 entries, 0 to 252107
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                252108 non-null  object        
 1   visit_date        252108 non-null  datetime64[ns]
 2   visitors          252108 non-null  int64         
 3   day_of_week       252108 non-null  object        
 4   holiday           252108 non-null  int64         
 5   genre             252108 non-null  object        
 6   area              252108 non-null  object        
 7   latitude          252108 non-null  float64       
 8   longitude         252108 non-null  float64       
 9   reserve_visitors  108394 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(4)
memory usage: 19.2+ MB


In [12]:
# define some functions that we can reuse
def create_val_splits(df, val_units=15, return_val=False):
    """Function that will take in a dataset and split it up into training, validation, and test sets"""
    # split into training, validation, and test sets
    df = df.drop('visit_date', axis=1)
    train = df.groupby('id').apply(lambda x: x.iloc[:-val_units]).reset_index(drop=True)
    test  = df.groupby('id').apply(lambda x: x.iloc[-val_units:]).reset_index(drop=True)
    
    if return_val:
        val   = train.groupby('id').apply(lambda x: x.iloc[-val_units:]).reset_index(drop=True)
        train = train.groupby('id').apply(lambda x: x.iloc[:-val_units]).reset_index(drop=True)
        return train, val, test
    else:
        return train, test

In [16]:
#you can fill null values with -999 and it generally works out the same
df=df.fillna(0)
train, val, test = create_val_splits(df, return_val=True)

In [18]:
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_val, y_val = val.drop('visitors', axis=1), val['visitors']
X_test, y_test = test.drop('visitors', axis=1), test['visitors']

In [29]:
pipe = make_pipeline(ce.TargetEncoder(), GradientBoostingRegressor())

In [30]:
pipe.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'])),
                ('gradientboostingregressor', GradientBoostingRegressor())])

In [31]:
pipe.score(X_val, y_val)

0.48779122611616976

In [32]:
#first row of our test column
X_test[:1]

Unnamed: 0,id,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
0,air_00a91d42b08b08d9,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2.0


In [33]:
pipe.predict(X_test[:1])
#how many visitors are coming on first day day of test set

array([24.01843187])

TIME AND WINDOW STATS

In [39]:
#1 add a month and yesterday column
df['month'] = df['visit_date'].dt.month
df['yesterday'] = df.groupby('id').apply(lambda x: x['visitors'].shift()).values
df['yesterday'] = df['yesterday'].bfill()

In [41]:
train, val, test = create_val_splits(df, return_val=True)
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_val, y_val = val.drop('visitors', axis=1), val['visitors']
X_test, y_test = test.drop('visitors', axis=1), test['visitors']

In [42]:
pipe.fit(X_train, y_train)
pipe.score(X_val, y_val)

  elif pd.api.types.is_categorical(cols):


0.5165717181340053

In [49]:
#2 add a last week column to improve our score
df['last_week'] = df.groupby('id').apply(lambda x: x['visitors'].shift(6)).values
df['last_week'] = df['last_week'].bfill()

In [51]:
train, val, test = create_val_splits(df, return_val=True)
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_val, y_val = val.drop('visitors', axis=1), val['visitors']
X_test, y_test = test.drop('visitors', axis=1), test['visitors']

In [52]:
pipe.fit(X_train, y_train)
pipe.score(X_val, y_val)

  elif pd.api.types.is_categorical(cols):


0.5389210279483874

In [56]:
#3 add window stats to improve our score
df['7DayAvg']  = df.groupby('id').apply(lambda x: x['visitors'].rolling(7).mean().shift()).values
df['30DayAvg'] = df.groupby('id').apply(lambda x: x['visitors'].rolling(30).mean().shift()).values
df['60DayAvg'] = df.groupby('id').apply(lambda x: x['visitors'].rolling(60).mean().shift()).values
df['7DayAvg'] = df['7DayAvg'].bfill()
df['30DayAvg'] = df['30DayAvg'].bfill()
df['60DayAvg'] = df['60DayAvg'].bfill()

In [57]:
train, val, test = create_val_splits(df, return_val=True)
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_val, y_val = val.drop('visitors', axis=1), val['visitors']
X_test, y_test = test.drop('visitors', axis=1), test['visitors']

In [58]:
pipe.fit(X_train, y_train)
pipe.score(X_val, y_val)

  elif pd.api.types.is_categorical(cols):


0.5483394259825141

In [59]:
pipe.get_params()
#could use subsample for every round could be helpful in keeping scores closer
#max features will help with random sampling columns and make scores closer together

{'memory': None,
 'steps': [('targetencoder',
   TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'])),
  ('gradientboostingregressor', GradientBoostingRegressor())],
 'verbose': False,
 'targetencoder': TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area']),
 'gradientboostingregressor': GradientBoostingRegressor(),
 'targetencoder__cols': ['id', 'day_of_week', 'genre', 'area'],
 'targetencoder__drop_invariant': False,
 'targetencoder__handle_missing': 'value',
 'targetencoder__handle_unknown': 'value',
 'targetencoder__min_samples_leaf': 1,
 'targetencoder__return_df': True,
 'targetencoder__smoothing': 1.0,
 'targetencoder__verbose': 0,
 'gradientboostingregressor__alpha': 0.9,
 'gradientboostingregressor__ccp_alpha': 0.0,
 'gradientboostingregressor__criterion': 'friedman_mse',
 'gradientboostingregressor__init': None,
 'gradientboostingregressor__learning_rate': 0.1,
 'gradientboostingregressor__loss': 'ls',
 'gradientboostingregressor__max_depth': 3,
 'gradientboostingr

In [62]:
#4 let's check a lot of versions of the model at once
n_estimators  = [100, 200]
learning_rate = [.05, .1]
max_depth     = [3, 4, 5, 6]
cv_scores     = []

# and cycle through our model parameters
for estimators in n_estimators:
    for rate in learning_rate:
        for depth in max_depth:
            print(f"Fitting model with parameters:  n_estimators - {estimators}, learning_rate - {rate}, max_depth - {depth}")
            mod   = GradientBoostingRegressor(n_estimators=estimators, learning_rate=rate, max_depth=depth, max_features=0.6)
            pipe  = make_pipeline(ce.TargetEncoder(), mod)
            pipe.fit(X_train, y_train)
            score = pipe.score(X_val, y_val)
            print(f"Out-of-sample score: {score}")
            cv_scores.append((score, estimators, rate, depth))

Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 3


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.541978677070547
Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 4


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5477422781021944
Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 5


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.556728618257283
Fitting model with parameters:  n_estimators - 100, learning_rate - 0.05, max_depth - 6


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5603100804248893
Fitting model with parameters:  n_estimators - 100, learning_rate - 0.1, max_depth - 3


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5517190050291556
Fitting model with parameters:  n_estimators - 100, learning_rate - 0.1, max_depth - 4


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.550938662272862
Fitting model with parameters:  n_estimators - 100, learning_rate - 0.1, max_depth - 5


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5639016290411647
Fitting model with parameters:  n_estimators - 100, learning_rate - 0.1, max_depth - 6


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5644200391467542
Fitting model with parameters:  n_estimators - 200, learning_rate - 0.05, max_depth - 3


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5513244809648175
Fitting model with parameters:  n_estimators - 200, learning_rate - 0.05, max_depth - 4


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5589382752071699
Fitting model with parameters:  n_estimators - 200, learning_rate - 0.05, max_depth - 5


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5648407588501716
Fitting model with parameters:  n_estimators - 200, learning_rate - 0.05, max_depth - 6


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.567257121558107
Fitting model with parameters:  n_estimators - 200, learning_rate - 0.1, max_depth - 3


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5551388938637185
Fitting model with parameters:  n_estimators - 200, learning_rate - 0.1, max_depth - 4


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5573159721342854
Fitting model with parameters:  n_estimators - 200, learning_rate - 0.1, max_depth - 5


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5659705317829743
Fitting model with parameters:  n_estimators - 200, learning_rate - 0.1, max_depth - 6


  elif pd.api.types.is_categorical(cols):


Out-of-sample score: 0.5725217370011696


In [63]:
cv_scores

[(0.541978677070547, 100, 0.05, 3),
 (0.5477422781021944, 100, 0.05, 4),
 (0.556728618257283, 100, 0.05, 5),
 (0.5603100804248893, 100, 0.05, 6),
 (0.5517190050291556, 100, 0.1, 3),
 (0.550938662272862, 100, 0.1, 4),
 (0.5639016290411647, 100, 0.1, 5),
 (0.5644200391467542, 100, 0.1, 6),
 (0.5513244809648175, 200, 0.05, 3),
 (0.5589382752071699, 200, 0.05, 4),
 (0.5648407588501716, 200, 0.05, 5),
 (0.567257121558107, 200, 0.05, 6),
 (0.5551388938637185, 200, 0.1, 3),
 (0.5573159721342854, 200, 0.1, 4),
 (0.5659705317829743, 200, 0.1, 5),
 (0.5725217370011696, 200, 0.1, 6)]

In [64]:
max(cv_scores)

(0.5725217370011696, 200, 0.1, 6)

In [65]:
#5 best version of our model
mod = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=4)

In [66]:
#6 now we want all the training data for the final sample
train, test = create_val_splits(df, return_val=False)

In [67]:
X_train, y_train = train.drop('visitors', axis=1), train['visitors']
X_test, y_test = test.drop('visitors', axis=1), test['visitors']

In [68]:
#7pipeline with final model
pipe = make_pipeline(ce.TargetEncoder(), mod)

In [69]:
pipe.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(max_depth=4, n_estimators=200))])

In [70]:
#final out of sample model performance
pipe.score(X_test, y_test)

0.5334509833086043

In [72]:
naive_guess = y_test.mean()

In [75]:
#calculating r^2 valuee
np.sum((y_test - naive_guess)**2)

4288510.401608363

In [76]:
naive_model = np.sum((y_test - naive_guess)**2)

In [78]:
preds = pipe.predict(X_test)
#checking our model vs naive

In [80]:
our_model = np.sum((y_test - preds)**2)
#predicting total error from our actual model

In [81]:
1 - (our_model / naive_model)

0.5334509833086043

In [82]:
#^^ this is comparing our model vs the average value of y, which is our score
#how we does our model predict the change in y
#when you boost you start with an r^2 value of 0

In [83]:
feats = pd.DataFrame({'Features':X_train.columns, 'Importance':pipe[1].feature_importances_}).sort_values(by='Importance', ascending=False)


Unnamed: 0,Features,Importance
12,30DayAvg,0.441125
11,7DayAvg,0.191228
0,id,0.121731
1,day_of_week,0.088119
10,last_week,0.054662
13,60DayAvg,0.038223
9,yesterday,0.029761
6,longitude,0.008057
5,latitude,0.007643
2,holiday,0.006491


In [85]:
X_test['id'].sample(frac=1)

10903    air_dfe068a1bf85f395
1863     air_2aab19554f91ff82
614      air_0e7c11b9abc50163
10429    air_d4981cdde163b172
4497     air_629d9935273c82ae
                 ...         
5263     air_6d65dd11d96e00fb
8391     air_abcdc8115988a010
1871     air_2aab19554f91ff82
2962     air_4254c3fc3ad078bd
3800     air_54ed43163b7596c4
Name: id, Length: 12435, dtype: object

In [92]:
X_test_copy = X_test.copy()
X_test_copy['id'] = np.random.permutation(X_test['id'])

In [90]:
pipe.score(X_test_copy, y_test)

0.45533984684957307