# Quick run of a linear regressor

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error

In [5]:
# Lookup tables that are later joined onto the order rows via center_id / meal_id.
fulfilment_center_info = pd.read_csv("../data/train/fulfilment_center_info.csv")
meal_info = pd.read_csv("../data/train/meal_info.csv")

# Order history used for fitting, the rows to score, and the submission template.
train = pd.read_csv("../data/train/train.csv")
test = pd.read_csv("../data/test.csv")
sample_submission = pd.read_csv("../data/sample_submission.csv")

## data wrangling: 

### merge with the two other data frames to add features: center information and meal information

### add new feature: calendar week

In [7]:
def get_calendar_week(week):
    """Map a running week index onto a calendar-week number in 1..52.

    The previous ``week % 52`` form sent weeks 52, 104, ... to 0, so the
    feature ran over 0..51 with the last week of each cycle labelled 0.
    Shifting by one before and after the modulo keeps every other week's
    label unchanged and labels the last week of a cycle 52.

    Parameters
    ----------
    week : int or array-like supporting ``-``, ``%`` and ``+``
        Running week index; week 1 is the first week of the first cycle.

    Returns
    -------
    Same type as ``week``
        Calendar week in the range 1..52.
    """
    return (week - 1) % 52 + 1

In [78]:
def merge_and_add_calendar_week(df, fulfilment_center_info, meal_info, get_calendar_week):
    """Enrich order rows with center/meal attributes and a calendar-week feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Order rows; must contain ``center_id``, ``meal_id`` and ``week``.
    fulfilment_center_info : pandas.DataFrame
        Lookup table; only ``center_id`` and ``center_type`` are used.
    meal_info : pandas.DataFrame
        Lookup table; only ``meal_id``, ``category`` and ``cuisine`` are used.
    get_calendar_week : callable
        Applied to every value of ``week`` to produce ``calendar_week``.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per row of ``df`` plus the columns
        ``center_type``, ``category``, ``cuisine`` and ``calendar_week``.

    Raises
    ------
    pandas.errors.MergeError
        If a lookup table contains duplicate keys. Without this check a left
        merge would silently duplicate order rows.
    """
    center_features = fulfilment_center_info[['center_id', 'center_type']]
    meal_features = meal_info[['meal_id', 'category', 'cuisine']]

    # validate='many_to_one' guarantees the row count of `df` is preserved.
    df_out = pd.merge(df, center_features, on='center_id', how='left',
                      validate='many_to_one')
    df_out = pd.merge(df_out, meal_features, on='meal_id', how='left',
                      validate='many_to_one')

    df_out['calendar_week'] = df_out['week'].apply(get_calendar_week)
    return df_out

In [80]:
# Attach center/meal attributes and the calendar-week feature to the training rows.
train_merged = merge_and_add_calendar_week(
    df=train,
    fulfilment_center_info=fulfilment_center_info,
    meal_info=meal_info,
    get_calendar_week=get_calendar_week,
)

In [81]:
# Same enrichment for the rows that will be scored for the submission.
test_merged = merge_and_add_calendar_week(
    df=test,
    fulfilment_center_info=fulfilment_center_info,
    meal_info=meal_info,
    get_calendar_week=get_calendar_week,
)

### export merged dataframes to reuse in further predictions/submissions

In [75]:
# Persist the enriched training frame (row index omitted) for reuse elsewhere.
train_merged.to_csv(path_or_buf="../data/train/train_merged.csv", index=False)

In [82]:
# Persist the enriched test frame (row index omitted) for reuse elsewhere.
test_merged.to_csv(path_or_buf="../data/test_merged.csv", index=False)

## run a linear regressor

After some runs I found that the following choice of features gives better predictions and fewer negative predictions.

In [64]:
# Model inputs: one numeric price column plus five categorical columns.
X = train_merged.loc[:, [
    'base_price',
    'homepage_featured',
    'center_type',
    'category',
    'cuisine',
    'calendar_week',
]]

In [65]:
# Regression target.
y = train_merged.loc[:, 'num_orders']

In [66]:
# Fix the shuffle seed so the split, and every score reported below, is
# reproducible across kernel restarts. The value itself is arbitrary.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [67]:
# Preprocessing: one-hot encode the categorical inputs (first level dropped)
# and bin base_price into 5 quantile buckets, themselves one-hot encoded.
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output` - confirm against the installed version.
trans = ColumnTransformer(transformers=[
    (
        'categories',
        OneHotEncoder(drop='first', sparse=False),
        ['calendar_week', 'homepage_featured', 'category', 'cuisine', 'center_type'],
    ),
    (
        'metrics',
        KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='quantile'),
        ['base_price'],
    ),
])

In [68]:
# Show the column names of the training features before they enter the pipeline.
X_train.columns

Index(['base_price', 'homepage_featured', 'center_type', 'category', 'cuisine',
       'calendar_week'],
      dtype='object')

In [27]:
# `categories` and `metrics` were never defined anywhere in the notebook, so
# this cell raised NameError on a fresh kernel. Define them here; the column
# lists mirror the ones shown in the fitted pipeline's repr.
categories = ['calendar_week', 'homepage_featured', 'category', 'cuisine', 'center_type']
metrics = ['base_price']

# One-hot encode the categorical columns (first level dropped) and bin the
# numeric columns into 5 quantile buckets, themselves one-hot encoded.
trans = ColumnTransformer([
    ('categories', OneHotEncoder(drop='first', sparse=False), categories),
    ('metrics', KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='quantile'), metrics),
])

In [69]:
# Chain the column preprocessing with an ordinary least-squares regressor.
model_pipe_lin = Pipeline(steps=[
    ('feature_engineering', trans),
    ('model', LinearRegression()),
])

# Kept as the last expression so the fitted pipeline is displayed as cell output.
model_pipe_lin.fit(X_train, y_train)

Pipeline(steps=[('feature_engineering',
                 ColumnTransformer(transformers=[('categories',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['calendar_week',
                                                   'homepage_featured',
                                                   'category', 'cuisine',
                                                   'center_type']),
                                                 ('metrics',
                                                  KBinsDiscretizer(encode='onehot-dense'),
                                                  ['base_price'])])),
                ('model', LinearRegression())])

In [70]:
# Predict on both splits, in the same order as before (train first, then test).
y_train_pred, y_test_pred = (
    model_pipe_lin.predict(features) for features in (X_train, X_test)
)

In [71]:
# Fraction of training predictions that are below zero.
int((y_train_pred < 0).sum()) / len(y_train)

0.07765229504893242

In [72]:
# Fraction of hold-out predictions that are below zero.
int((y_test_pred < 0).sum()) / len(y_test)

0.07901031216870953

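The linear regressor predicts negative values (about 8% of the predictions on each of the train and test sets). Order counts cannot be negative, and scikit-learn's `mean_squared_log_error` does not accept negative values, so these predictions have to be set to zero.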

In [73]:
# Overwrite negative predictions with zero, in place, on both prediction arrays.
np.putmask(y_train_pred, y_train_pred < 0, 0)
np.putmask(y_test_pred, y_test_pred < 0, 0)

In [74]:
def _rmsle(actual, predicted):
    """Return the square root of the mean squared log error."""
    return np.sqrt(mean_squared_log_error(actual, predicted))


print('train score:', _rmsle(y_train, y_train_pred))
print('test score:', _rmsle(y_test, y_test_pred))

train score: 1.3657759393516717
test score: 1.3745293670048027


## submission

In [91]:
# Same six feature columns the pipeline was fitted on, taken from the test rows.
X_submission = test_merged.loc[:, [
    'base_price',
    'homepage_featured',
    'center_type',
    'category',
    'cuisine',
    'calendar_week',
]]

In [92]:
# Score the submission rows, then overwrite negative predictions with zero in place.
y_submission_pred = model_pipe_lin.predict(X_submission)
np.putmask(y_submission_pred, y_submission_pred < 0, 0)

In [93]:
# Work on a copy: plain assignment only aliases `sample_submission`, so adding
# the column would silently overwrite the loaded template as well.
sub_2 = sample_submission.copy()

# Positional assignment: assumes the template rows are in the same order as
# the rows of the test frame the predictions came from - TODO confirm.
sub_2['num_orders'] = y_submission_pred
sub_2

Unnamed: 0,id,num_orders
0,1028232,400.09375
1,1127204,400.09375
2,1212707,400.09375
3,1082698,0.00000
4,1400926,0.00000
...,...,...
32568,1250239,0.00000
32569,1039516,0.00000
32570,1158107,330.50000
32571,1444235,330.50000


## submission score: 137.23

In this notebook I just ran a linear regressor without any model tuning or advanced feature engineering. The linear regressor performed worse than the benchmark. One reason for this low performance is the large share of negative predictions that I set to zero.