# Poisson regressor

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import PoissonRegressor

In [30]:
# Read the sample submission CSV; it is used as the template for the submission
# table at the end of the notebook.
sample_submission_path = '../data/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)

### Import merged dataframes (see notebook 2 for the data wrangling steps)

In [3]:
# Load the merged training table and preview its last five rows.
train_merged_path = '../data/train/train_merged.csv'
train_merged = pd.read_csv(train_merged_path)
train_merged.tail(n=5)

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,center_type,category,cuisine,calendar_week
456543,1271326,145,61,1543,484.09,484.09,0,0,68,TYPE_A,Desert,Indian,41
456544,1062036,145,61,2304,482.09,482.09,0,0,42,TYPE_A,Desert,Indian,41
456545,1110849,145,61,2664,237.68,321.07,0,0,501,TYPE_A,Salad,Italian,41
456546,1147725,145,61,2569,243.5,313.34,0,0,729,TYPE_A,Salad,Italian,41
456547,1361984,145,61,2490,292.03,290.03,0,0,162,TYPE_A,Salad,Italian,41


In [15]:
# Load the merged test table (no `num_orders` column) and preview its last five rows.
test_merged_path = '../data/test_merged.csv'
test_merged = pd.read_csv(test_merged_path)
test_merged.tail(n=5)

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,center_type,category,cuisine,calendar_week
32568,1250239,155,61,1543,482.09,484.09,0,0,TYPE_A,Desert,Indian,51
32569,1039516,155,61,2304,483.09,483.09,0,0,TYPE_A,Desert,Indian,51
32570,1158107,155,61,2664,322.07,323.07,0,0,TYPE_A,Salad,Italian,51
32571,1444235,155,61,2569,322.07,323.07,0,0,TYPE_A,Salad,Italian,51
32572,1291286,155,61,2490,276.45,276.45,0,0,TYPE_A,Salad,Italian,51


In [16]:
# List the columns available in the training frame before selecting model features.
train_merged.columns

Index(['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
       'emailer_for_promotion', 'homepage_featured', 'num_orders',
       'center_type', 'category', 'cuisine', 'calendar_week'],
      dtype='object')

## Train and evaluate the Poisson regressor

In [17]:
# Feature matrix: every training column except the row id, the raw week counter
# and the target (`num_orders`).
feature_columns = [
    'center_id',
    'meal_id',
    'checkout_price',
    'base_price',
    'emailer_for_promotion',
    'homepage_featured',
    'center_type',
    'category',
    'cuisine',
    'calendar_week',
]
X = train_merged[feature_columns]

In [18]:
# Target vector: the number of orders per row of the training table.
target_column = 'num_orders'
y = train_merged[target_column]

In [19]:
# Hold out a random validation subset (scikit-learn's default test fraction).
# `random_state` is fixed so the split -- and therefore the fitted model and the
# scores printed below -- are reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [20]:
# Columns treated as categorical levels (passed to the one-hot encoder).
categories = [
    'center_id',
    'meal_id',
    'emailer_for_promotion',
    'homepage_featured',
    'center_type',
    'category',
    'cuisine',
    'calendar_week',
]

# Price columns (passed to the quantile binning step).
metrics = [
    'base_price',
    'checkout_price',
]

In [21]:
# Preprocessing step of the pipeline:
#   * `categories` columns -> dense one-hot encoding, dropping each feature's first level
#   * `metrics` columns    -> 10 quantile bins per column, dense one-hot encoded
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse` argument to
# `sparse_output` -- confirm against the installed version before upgrading.
categorical_encoder = OneHotEncoder(drop='first', sparse=False)
price_binner = KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='quantile')

trans = ColumnTransformer([
    ('categories', categorical_encoder, categories),
    ('metrics', price_binner, metrics),
])

In [22]:
# Chain the preprocessing transformer with a Poisson GLM (alpha=0.1; max_iter raised
# to 10000), then fit on the training split. The fit call is left as the last
# expression so the fitted pipeline is displayed.
poisson_steps = [
    ('feature_engineering', trans),
    ('model', PoissonRegressor(alpha=0.1, max_iter=10000)),
]
model_pipe_poisson = Pipeline(poisson_steps)
model_pipe_poisson.fit(X_train, y_train)

Pipeline(steps=[('feature_engineering',
                 ColumnTransformer(transformers=[('categories',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['center_id', 'meal_id',
                                                   'emailer_for_promotion',
                                                   'homepage_featured',
                                                   'center_type', 'category',
                                                   'cuisine',
                                                   'calendar_week']),
                                                 ('metrics',
                                                  KBinsDiscretizer(encode='onehot-dense',
                                                                   n_bins=10),
                                                  ['base_price',
           

In [23]:
# Predict on both splits with the fitted pipeline (train first, then held-out).
y_train_poiss_pred, y_test_poiss_pred = (
    model_pipe_poisson.predict(split_features)
    for split_features in (X_train, X_test)
)

In [24]:
# Root mean squared logarithmic error (RMSLE) on each split.
rmsle_train = np.sqrt(mean_squared_log_error(y_train, y_train_poiss_pred))
rmsle_test = np.sqrt(mean_squared_log_error(y_test, y_test_poiss_pred))

print('train score:', rmsle_train)
print('test score:', rmsle_test)

train score: 0.6655590337732898
test score: 0.6641152434159727


# Submission

In [25]:
# Select from the test table the same feature columns the model was trained on.
submission_feature_columns = [
    'center_id',
    'meal_id',
    'checkout_price',
    'base_price',
    'emailer_for_promotion',
    'homepage_featured',
    'center_type',
    'category',
    'cuisine',
    'calendar_week',
]
X_submission = test_merged[submission_feature_columns]

In [28]:
# Predicted order counts for the submission rows, in `X_submission` row order.
y_submission_pred = model_pipe_poisson.predict(X=X_submission)

In [53]:
# Build the submission on a copy of the template: a plain `sub_2 = sample_submission`
# only aliases the same DataFrame, so adding the prediction column would silently
# modify `sample_submission` as well.
sub_2 = sample_submission.copy()

# The predictions are in `test_merged` row order and are assigned positionally, so
# fail loudly if the template's ids are not in that same order instead of writing
# predictions against the wrong ids.
assert np.array_equal(sub_2['id'].to_numpy(), test_merged['id'].to_numpy()), \
    'sample_submission ids are not in the same order as test_merged ids'

sub_2['num_orders'] = y_submission_pred
sub_2

Unnamed: 0,id,num_orders
0,1028232,334.192950
1,1127204,264.418086
2,1212707,141.860514
3,1082698,52.089060
4,1400926,43.784539
...,...,...
32568,1250239,38.283405
32569,1039516,31.254649
32570,1158107,249.100574
32571,1444235,228.443190


## submission score: 67.51

The Poisson regressor performed a bit better than my benchmark.