# 0.0 Imports


In [7]:
import pandas as pd
import math
import inflection
import numpy as np
import datetime as dtt
import xgboost as xgb
import random
import warnings
import pickle
import json
import requests

from matplotlib import pyplot as plt
from IPython.core.display import HTML
from scipy import stats as ss
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error

warnings.filterwarnings('ignore')

## 0.1. Helper functions

In [3]:
def cramer_v(x, y):
    """Return the bias-corrected Cramér's V between two categorical variables.

    Parameters
    ----------
    x, y : array-like of category labels with the same length (anything
        ``pd.crosstab`` accepts).

    Returns
    -------
    float
        Association strength in [0, 1]; ``nan`` when either variable has a
        single category (the statistic is undefined in that case).
    """
    cm = pd.crosstab(x, y).values
    n = cm.sum()
    r, k = cm.shape

    chi2 = ss.chi2_contingency(cm)[0]

    # The bias correction applies to phi^2 = chi2 / n, not to chi2 itself.
    # Subtracting it from chi2 under-corrects and lets V exceed 1.
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - (k - 1) * (r - 1) / (n - 1))
    kcorr = k - (k - 1) ** 2 / (n - 1)
    rcorr = r - (r - 1) ** 2 / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        # Only one row or one column: no association can be measured.
        return np.nan

    return np.sqrt(phi2corr / denominator)

def mean_absolute_percentage_error(y, yhat):
    """Return the mean of |y - yhat| / |y| as a fraction (not scaled by 100).

    ``y`` and ``yhat`` must support element-wise arithmetic (NumPy arrays or
    pandas Series). No special handling is done for zeros in ``y``.
    """
    relative_error = (y - yhat) / y
    return np.mean(np.abs(relative_error))
    
def ml_error(model_name, y, yhat):
    """Summarise prediction quality as a one-row DataFrame.

    The row holds the model name plus MAE, MAPE and RMSE computed from the
    true values ``y`` and the predictions ``yhat``.
    """
    metrics = {
        'model name': model_name,
        'MAE': mean_absolute_error(y, yhat),
        'MAPE': mean_absolute_percentage_error(y, yhat),
        'RMSE': np.sqrt(mean_squared_error(y, yhat)),
    }
    return pd.DataFrame(metrics, index=[0])

def cross_validation(x_training, kfold, model_name, model, verbose=False):
    """Score ``model`` on ``kfold`` consecutive six-week windows at the end of the data.

    Folds are processed from the oldest window to the most recent one. For
    each fold the model is fitted on every row dated before the window and
    evaluated on the rows inside it. Both the true 'sales' values and the
    predictions are passed through ``np.expm1`` before the errors are computed.

    Parameters
    ----------
    x_training : DataFrame with 'date' and 'sales' columns plus the features.
    kfold : number of validation windows.
    model_name : label written to the result row.
    model : estimator exposing ``fit(X, y)`` (returning the fitted estimator)
        and ``predict(X)``.
    verbose : print the fold number while running.

    Returns
    -------
    One-row DataFrame with "mean +/- std" strings for MAE, MAPE and RMSE.
    """
    window_days = 6 * 7
    scores = {'MAE': [], 'MAPE': [], 'RMSE': []}

    for fold in range(kfold, 0, -1):
        if verbose:
            print('\nKFold Number{}'.format(fold))

        # Validation window boundaries, counted back from the last date.
        last_date = x_training['date'].max()
        validation_start_date = last_date - dtt.timedelta(days=fold * window_days)
        validation_end_date = last_date - dtt.timedelta(days=(fold - 1) * window_days)

        dates = x_training['date']
        training = x_training[dates < validation_start_date]
        validation = x_training[(dates >= validation_start_date) &
                                (dates <= validation_end_date)]

        # Fit on the past, predict the window.
        fitted = model.fit(training.drop(['date', 'sales'], axis=1), training['sales'])
        yhat = fitted.predict(validation.drop(['date', 'sales'], axis=1))

        fold_result = ml_error(model_name, np.expm1(validation['sales']), np.expm1(yhat))
        for metric, values in scores.items():
            values.append(fold_result[metric])

    def _mean_std(values):
        # "<mean> +/- <std>", both rounded to two decimals.
        return np.round(np.mean(values), 2).astype(str) + ' +/- ' + np.round(np.std(values), 2).astype(str)

    summary = {
        'Model Name': model_name,
        'MAE CV': _mean_std(scores['MAE']),
        'MAPE CV': _mean_std(scores['MAPE']),
        'RMSE CV': _mean_std(scores['RMSE']),
    }
    return pd.DataFrame(summary, index=[0])




## 0.2 Loading data

In [4]:
# Load the sales records (train.csv) and the store attribute table (store.csv).
# NOTE(review): absolute, machine-specific Windows paths — adjust before running elsewhere.
df_sales_raw = pd.read_csv(r'C:\Users\jota_\Repositorios\DS_producao\Datasets_ds\train.csv', low_memory=False)
df_store_raw = pd.read_csv(r'C:\Users\jota_\Repositorios\DS_producao\Datasets_ds\store.csv', low_memory=False)

# Left join on 'Store': every sales row is kept and gains its store's attributes.
df_raw = pd.merge(df_sales_raw, df_store_raw, how = 'left', on='Store')

In [5]:
# Work on a copy so the merged raw frame stays untouched by the cleaning steps.
df1 = df_raw.copy()

# 1.0 Data description and transformation

## 1.1 Rename columns

In [8]:
# Original CamelCase headers, listed in the order the merged frame is expected to have them.
cols_old = [
    'Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
    'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
    'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
    'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval',
]

# CamelCase -> snake_case converter.
snakecase = lambda x: inflection.underscore(x)
cols_new = [snakecase(name) for name in cols_old]

# Positional rename: the i-th column of df1 receives the i-th new name.
df1.columns = cols_new

In [9]:
# Parse 'date' into datetime64 so the .dt accessors and date arithmetic used later work.
df1['date'] = pd.to_datetime(df1['date'])
print(df1.dtypes)

store                                    int64
day_of_week                              int64
date                            datetime64[ns]
sales                                    int64
customers                                int64
open                                     int64
promo                                    int64
state_holiday                           object
school_holiday                           int64
store_type                              object
assortment                              object
competition_distance                   float64
competition_open_since_month           float64
competition_open_since_year            float64
promo2                                   int64
promo2_since_week                      float64
promo2_since_year                      float64
promo_interval                          object
dtype: object


## 1.2 Fill out NA

In [10]:
# competition_distance: a missing distance is replaced by the sentinel 200000.0
df1['competition_distance'] = df1['competition_distance'].fillna(200000.0)

# competition_open_since_month / year: fall back to the month / year of the row's own date
df1['competition_open_since_month'] = df1['competition_open_since_month'].fillna(df1['date'].dt.month)
df1['competition_open_since_year'] = df1['competition_open_since_year'].fillna(df1['date'].dt.year)

# promo2_since_week / year: fall back to the ISO week / year of the row's own date
df1['promo2_since_week'] = df1['promo2_since_week'].fillna(
    df1['date'].dt.isocalendar().week.astype('float64'))
df1['promo2_since_year'] = df1['promo2_since_year'].fillna(df1['date'].dt.year)

# promo_interval: 0 marks "no promo interval".
# Assign the result back instead of using inplace=True on a column selection,
# which is not guaranteed to modify df1.
df1['promo_interval'] = df1['promo_interval'].fillna(0)

# Month labels must match the tokens used in promo_interval (English
# abbreviations, with September spelled 'Sept' in the Rossmann store data).
# The previous 'Fev' and 'Sep' labels could never match.
month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
             7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}

df1['month_map'] = df1['date'].dt.month.map(month_map)

# is_promo = 1 when the row's month is listed in the store's promo_interval, else 0.
df1['is_promo'] = [
    int(interval != 0 and month in interval.split(','))
    for interval, month in zip(df1['promo_interval'], df1['month_map'])
]

## 1.3 Change dtypes

In [11]:
# Inspect the dtypes after the NA fill to decide which columns need casting.
df1.dtypes

store                                    int64
day_of_week                              int64
date                            datetime64[ns]
sales                                    int64
customers                                int64
open                                     int64
promo                                    int64
state_holiday                           object
school_holiday                           int64
store_type                              object
assortment                              object
competition_distance                   float64
competition_open_since_month           float64
competition_open_since_year            float64
promo2                                   int64
promo2_since_week                      float64
promo2_since_year                      float64
promo_interval                          object
month_map                               object
is_promo                                 int64
dtype: object

In [12]:
# These calendar fields are float64 after the NA fill; store them as whole numbers.
df1 = df1.astype({
    'competition_open_since_month': 'int64',
    'competition_open_since_year': 'int64',
    'promo2_since_week': 'int64',
    'promo2_since_year': 'int64',
})

## 1.4 Descriptive Statistics

In [13]:
# Split the columns by dtype for descriptive statistics:
# numeric (int64/float64) versus everything else except the datetime column.
num_attributes = df1.select_dtypes(include = ['int64', 'float64'])
cat_attributes = df1.select_dtypes(exclude = ['int64', 'float64', 'datetime64[ns]'])

# 2.0 Feature Engineering

In [14]:
# Checkpoint copy: feature engineering below modifies df2 and leaves df1 intact.
df2 = df1.copy()

In [15]:
# year
df2['year'] = df2['date'].dt.year

# month
df2['month'] = df2['date'].dt.month

# day
df2['day'] = df2['date'].dt.day

# week of year (ISO calendar)
df2['week_of_year'] = df2['date'].dt.isocalendar().week

# year week
df2['year_week'] = df2['date'].dt.strftime('%Y-%W')

# competition since: first day of the month in which the competitor opened.
# Assembled column-wise instead of building one datetime per row in Python.
df2['competition_since'] = pd.to_datetime(pd.DataFrame({
    'year': df2['competition_open_since_year'],
    'month': df2['competition_open_since_month'],
    'day': 1,
}))

# whole 30-day periods between the competitor opening and the row's date
df2['competition_time_month'] = ((df2['date'] - df2['competition_since']) / 30).dt.days.astype('int64')

# promo since: Monday of the "<year>-<week>" promo2 start, shifted back one week.
# Each distinct year-week key is parsed once, not once per row.
promo_since_key = df2['promo2_since_year'].astype(str) + '-' + df2['promo2_since_week'].astype(str)
promo_since_by_key = {
    key: dtt.datetime.strptime(key + '-1', '%Y-%W-%w') - dtt.timedelta(days=7)
    for key in promo_since_key.unique()
}
df2['promo_since'] = promo_since_key.map(promo_since_by_key)

# whole weeks between the promo2 start and the row's date
df2['promo_time_week'] = ((df2['date'] - df2['promo_since']) / 7).dt.days.astype('int64')

# assortment: 'a' -> basic, 'b' -> extra, anything else -> extended
df2['assortment'] = df2['assortment'].apply(lambda x: 'basic' if x == 'a' else 'extra' if x == 'b' else 'extended')

# state holiday: 'a' -> public_holiday, 'b' -> easter, 'c' -> christmas, anything else -> regular_day
df2['state_holiday'] = df2['state_holiday'].apply(
    lambda x: 'public_holiday' if x == 'a' else 'easter' if x == 'b' else 'christmas' if x == 'c' else 'regular_day')

# 3.0 Variable filtering

In [16]:
# Checkpoint copy: the row and column filters below apply to df3 only.
df3 = df2.copy()

## 3.1 Index filtering

In [17]:
# Keep only rows where the store was open and recorded positive sales.
df3 = df3[(df3['open'] != 0) & (df3['sales'] > 0)]

## 3.2 Column selection

In [18]:
# Drop 'customers' and 'open' together with the helper columns that were only
# needed to derive is_promo.
# NOTE(review): presumably 'customers' is dropped because it is unknown at
# prediction time (the API payload in section 8.1 has no such column) — confirm.
cols_drop = ['customers', 'open', 'promo_interval', 'month_map']
df3 = df3.drop(cols_drop, axis=1)

# 4.0 Data Preparation

In [26]:
# Checkpoint copy: scaling and encoding below modify df4.
df4 = df3.copy()

## 4.1 Rescaling

In [27]:
# Numeric columns that are candidates for rescaling. Taken from df4: df5 is
# only created in section 5.0, so referencing it here fails on a fresh
# top-to-bottom run (or silently reads stale data from an earlier session).
a = df4.select_dtypes(include=['int64', 'float64'] )

In [28]:
rs = RobustScaler()
mms = MinMaxScaler()

# Every scaler is fitted on df4 (the frame being prepared in this section) and
# pickled right after fitting, before the same scaler object is re-fitted for
# the next feature. The files are read back by the Rossmann class (section 8.1).

# competition distance
df4['competition_distance'] = rs.fit_transform(df4[['competition_distance']].values)
with open(r'C:\Users\jota_\Repositorios\DS_producao\webapp1\parameter\competition_distance_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(rs, scaler_file)

# year
df4['year'] = mms.fit_transform(df4[['year']].values)
with open(r'C:\Users\jota_\Repositorios\DS_producao\webapp1\parameter\year_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(mms, scaler_file)

# competition time month
df4['competition_time_month'] = rs.fit_transform(df4[['competition_time_month']].values)
with open(r'C:\Users\jota_\Repositorios\DS_producao\webapp1\parameter\competition_time_month_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(rs, scaler_file)

# promo time week: fitted with the MinMaxScaler, so that is the object to persist.
# Dumping `rs` here would save the scaler fitted on competition_time_month.
df4['promo_time_week'] = mms.fit_transform(df4[['promo_time_week']].values)
with open(r'C:\Users\jota_\Repositorios\DS_producao\webapp1\parameter\promo_time_week_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(mms, scaler_file)


## 4.2 Transformation

### 4.2.1 Encoding

In [29]:
# state_holiday - one hot encoding
df4 = pd.get_dummies(df4, prefix=['state_holiday'], columns=['state_holiday'])

# store_type - label encoding; the fitted encoder is pickled for reuse at inference time
le = LabelEncoder()
df4['store_type'] = le.fit_transform(df4['store_type'])
# Context manager so the file is flushed and closed once the dump finishes.
with open(r'C:\Users\jota_\Repositorios\DS_producao\webapp1\parameter\store_type_scaler.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)

# assortment - ordinal encoding (basic < extra < extended)
assortment_dict = {'basic': 1,
                 'extended': 3,
                 'extra': 2}
df4['assortment'] = df4['assortment'].map(assortment_dict)

### 4.2.2 Response Variable Transformation

In [30]:
# Model the target on the log1p scale; predictions are mapped back with np.expm1 later.
df4['sales'] = np.log1p(df4['sales'])


### 4.2.3 Nature Transformation - Cyclical Nature

In [31]:
# Encode each calendar field as a point on a circle: (sin, cos) of the value
# scaled by 2*pi over the cycle length used for that field.
for _feature, _cycle in (('month', 12), ('day', 30), ('week_of_year', 52), ('day_of_week', 7)):
    df4[_feature + '_sin'] = df4[_feature].apply(lambda x: np.sin(x * (2 * np.pi / _cycle)))
    df4[_feature + '_cos'] = df4[_feature].apply(lambda x: np.cos(x * (2 * np.pi / _cycle)))

# 5.0 Feature Selection

In [33]:
# Checkpoint copy: feature selection and the train/test split work on df5.
df5 = df4.copy()

## 5.1 Split dataframe into training and test dataset

In [34]:
# Drop the raw calendar fields (their sin/cos encodings were added in 4.2.3)
# and the intermediate date / label columns created during feature engineering.
cols_drop = ['month', 'week_of_year', 'day_of_week', 'day', 'promo_since', 'competition_since', 'year_week']
df5 = df5.drop(cols_drop, axis=1)

In [35]:
# Last recorded date of the first store in the grouped result, minus six weeks.
# The displayed value is the cut-off hard-coded in the train/test split below.
df5[['date', 'store']].groupby('store').max().reset_index()['date'][0] - dtt.timedelta(days=6*7)

Timestamp('2015-06-19 00:00:00')

In [36]:
# Time-based split: everything before 2015-06-19 is used for training, the rest for testing.
# X_train / X_test still contain 'date' and 'sales'; the model features are picked later.

#training dataset
X_train = df5[df5['date'] < '2015-06-19']
Y_train = X_train['sales']

#test dataset
X_test = df5[df5['date'] >= '2015-06-19']
Y_test = X_test['sales']

# Sanity check of the date ranges covered by each split.
print('Training Min Date: {}'.format(X_train['date'].min()))
print('Training Max Date: {}'.format(X_train['date'].max()))

print('Test Min Date: {}'.format(X_test['date'].min()))
print('Test Max Date: {}'.format(X_test['date'].max()))

Training Min Date: 2013-01-01 00:00:00
Training Max Date: 2015-06-18 00:00:00
Test Min Date: 2015-06-19 00:00:00
Test Max Date: 2015-07-31 00:00:00


In [37]:
# Feature subset used for modelling.
# NOTE(review): the name suggests a Boruta selection run; that run is not part
# of this file, so the list is effectively hard-coded here.
cols_selected_boruta = [
     'store',
     'promo',
     'store_type',
     'assortment',
     'competition_distance',
     'competition_open_since_month',
     'competition_open_since_year',
     'promo2',
     'promo2_since_week',
     'promo2_since_year',
     'competition_time_month',
     'promo_time_week',
     'month_cos',
     'month_sin',
     'day_sin',
     'day_cos',
     'week_of_year_cos',
     'week_of_year_sin',
     'day_of_week_sin',
     'day_of_week_cos']

# columns to add: needed for time-based fold splitting ('date') and as the target ('sales')
feat_to_add = ['date', 'sales']

# final features: selected features + date + sales
cols_selected_boruta_full = cols_selected_boruta.copy()
cols_selected_boruta_full.extend(feat_to_add)

In [38]:
# Columns left out of the model; kept only as a record, nothing below reads this list.
cols_not_selected_boruta = [
     'is_promo',
     'school_holiday',
     'state_holiday_christmas',
     'state_holiday_easter',
     'state_holiday_public_holiday',
     'state_holiday_regular_day',
     'year']

In [39]:
# Display the selected feature list.
cols_selected_boruta

['store',
 'promo',
 'store_type',
 'assortment',
 'competition_distance',
 'competition_open_since_month',
 'competition_open_since_year',
 'promo2',
 'promo2_since_week',
 'promo2_since_year',
 'competition_time_month',
 'promo_time_week',
 'month_cos',
 'month_sin',
 'day_sin',
 'day_cos',
 'week_of_year_cos',
 'week_of_year_sin',
 'day_of_week_sin',
 'day_of_week_cos']

# 6.0 Machine Learning Modelling

In [66]:
# Feature matrices restricted to the selected columns.
x_train = X_train[cols_selected_boruta]
x_test = X_test[cols_selected_boruta]

# time series data preparation: same rows plus 'date' and 'sales', the layout
# cross_validation() expects
x_training = X_train[cols_selected_boruta_full]

In [45]:
# Hyper-parameters for the final XGBoost model (the tuning run is not part of this file).
param_tuned = {
    'n_estimators': 2500, 
    'eta': 0.01, 
    'max_depth': 9, 
    'subsample': 0.7, 
    'colsample_bytree': 0.3, 
    'min_child_weight': 8}


In [46]:
# model: each entry of param_tuned is forwarded as a keyword argument
model_xgb_tuned = xgb.XGBRegressor(objective='reg:squarederror', **param_tuned)
model_xgb_tuned.fit(x_train, Y_train)

# prediction (on the log1p scale of the target)
yhat_xgb_tuned = model_xgb_tuned.predict(x_test)

# performance, measured after mapping both series back with expm1
# NOTE(review): the output recorded for this cell shows MAPE ~ 1.0 and
# MAE ~ 6994, i.e. predictions close to zero — investigate before trusting
# the saved model.
xgb_result_tuned = ml_error('XGBoost Regressor', np.expm1(Y_test), np.expm1(yhat_xgb_tuned))
xgb_result_tuned

Unnamed: 0,model name,MAE,MAPE,RMSE
0,XGBoost Regressor,6994.372141,0.999866,7628.056609


# 7.0 Error interpretation

In [48]:
# Explicit copy: X_test[...] is a selection from df5, and adding or overwriting
# columns on such a selection triggers SettingWithCopyWarning (hidden here by
# warnings.filterwarnings('ignore')) and is not guaranteed to behave as intended.
df6 = X_test[cols_selected_boruta_full].copy()

# rescale: back from the log1p scale to sales units
df6['sales'] = np.expm1(df6['sales'])
df6['predictions'] = np.expm1(yhat_xgb_tuned)


## 7.1 Business Performance

In [50]:
# Sum of predictions per store over the test period

df7 = df6[['store', 'predictions']].groupby('store').sum().reset_index()

# MAE and MAPE per store (one value per store, computed over that store's test rows)
df6_aux1 = df6[['store', 'sales', 'predictions']].groupby('store').apply(lambda x: mean_absolute_error(x['sales'], x['predictions'])).reset_index().rename(columns={0: 'MAE'})
df6_aux2 = df6[['store', 'sales', 'predictions']].groupby('store').apply(lambda x: mean_absolute_percentage_error(x['sales'], x['predictions'])).reset_index().rename(columns={0: 'MAPE'})

# Merge the error metrics onto the per-store totals
df6_aux3 = pd.merge(df6_aux1, df6_aux2, how='inner', on='store')
df8 = pd.merge(df7, df6_aux3, how='inner', on='store')

# Scenarios: total prediction shifted down / up by the store's MAE
df8['worst_scenario'] = df8['predictions'] - df8['MAE']
df8['best_scenario'] = df8['predictions'] + df8['MAE']

# order columns
df8 = df8[['store', 'predictions', 'worst_scenario', 'best_scenario', 'MAE', 'MAPE']]


## 7.2 Total Performance


In [51]:
# Column-wise totals across all stores, reshaped to one row per scenario.
df9 = df8[['predictions', 'best_scenario', 'worst_scenario']].apply(lambda x: np.sum(x), axis=0).reset_index().rename(columns={'index': 'Scenarios', 0: 'Values'})
# Format as currency strings; 'Values' is no longer numeric after this line.
df9['Values'] = df9['Values'].map('R${:,.2f}'.format)
df9

Unnamed: 0,Scenarios,Values
0,predictions,"R$32,720.84"
1,best_scenario,"R$7,824,102.98"
2,worst_scenario,"R$-7,758,661.29"


## 7.3 Machine Learning Performance

In [53]:
# Signed error (positive when the model under-predicts) and the
# prediction-to-actual ratio (1.0 means a perfect prediction).
df6['error'] = df6['sales'] - df6['predictions']
df6['error_rate'] = df6['predictions'] / df6['sales']



# 8.0 Deploy Model to Production

In [65]:
# Save trained model. The context manager guarantees the file is flushed and
# closed once the dump finishes.
with open(r'C:\Users\jota_\Repositorios\DS_producao\model_rossmann_xgb.pkl', 'wb') as model_file:
    pickle.dump(model_xgb_tuned, model_file)



## 8.1 Rossmann Class

In [None]:
import pandas as pd
import numpy as np
import math
import datetime as dtt
import pickle
import inflection

class Rossmann(object):
    """Inference-time preprocessing pipeline for the Rossmann sales model.

    Loads the scalers and the store-type encoder pickled during training and
    exposes the steps the API handler chains together:
    ``data_cleaning`` -> ``feature_engineering`` -> ``data_preparation`` ->
    ``get_prediction``.

    The pickled transformers are applied with ``transform`` only. They are
    assumed to have been fitted on the training data for the matching feature;
    re-fitting them on a request payload would scale each request differently
    from what the model was trained on.
    """

    def __init__(self):
        # NOTE(review): the training notebook writes the scalers under
        # '...\DS_producao\webapp1\parameter\'; this reads
        # home_path + 'parameter/'. Confirm the paths match in deployment.
        self.home_path = '/Users/jota_/Repositorios/DS_producao/'
        self.competition_distance_scaler = self._load_artifact('parameter/competition_distance_scaler.pkl')
        self.competition_time_month_scaler = self._load_artifact('parameter/competition_time_month_scaler.pkl')
        self.promo_time_week_scaler = self._load_artifact('parameter/promo_time_week_scaler.pkl')
        self.year_scaler = self._load_artifact('parameter/year_scaler.pkl')
        self.store_type_scaler = self._load_artifact('parameter/store_type_scaler.pkl')

    def _load_artifact(self, relative_path):
        """Unpickle one artifact stored under ``home_path``, closing the file afterwards."""
        with open(self.home_path + relative_path, 'rb') as artifact_file:
            return pickle.load(artifact_file)

    def data_cleaning(self, df1):
        """Rename columns to snake_case, parse dates, fill NAs and derive ``is_promo``.

        ``df1`` is modified in place and also returned. Its columns must arrive
        in the order listed in ``cols_old`` because the rename is positional.
        """
        cols_old = ['Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
                    'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
                    'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear',
                    'PromoInterval']

        # Rename (positional)
        df1.columns = [inflection.underscore(name) for name in cols_old]

        df1['date'] = pd.to_datetime(df1['date'])

        # competition_distance: a missing distance is replaced by the sentinel 200000.0
        df1['competition_distance'] = df1['competition_distance'].fillna(200000.0)

        # competition_open_since_month / year: fall back to the row's own date
        df1['competition_open_since_month'] = df1['competition_open_since_month'].fillna(df1['date'].dt.month)
        df1['competition_open_since_year'] = df1['competition_open_since_year'].fillna(df1['date'].dt.year)

        # promo2_since_week / year: fall back to the row's own ISO week / year
        df1['promo2_since_week'] = df1['promo2_since_week'].fillna(
            df1['date'].dt.isocalendar().week.astype('float64'))
        df1['promo2_since_year'] = df1['promo2_since_year'].fillna(df1['date'].dt.year)

        # promo_interval: 0 marks "no promo interval". Assigned back instead of
        # inplace=True on a column selection, which may not modify df1.
        df1['promo_interval'] = df1['promo_interval'].fillna(0)

        # Labels must match the tokens used in promo_interval (English
        # abbreviations, September spelled 'Sept' in the Rossmann store data).
        month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                     7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
        df1['month_map'] = df1['date'].dt.month.map(month_map)

        # is_promo = 1 when the row's month is listed in promo_interval, else 0
        df1['is_promo'] = [
            int(interval != 0 and month in interval.split(','))
            for interval, month in zip(df1['promo_interval'], df1['month_map'])
        ]

        # These fields arrive as floats (they contain NaN in the payload).
        # feature_engineering builds dates and 'YYYY-WW' strings from them,
        # which requires whole numbers.
        for column in ('competition_open_since_month', 'competition_open_since_year',
                       'promo2_since_week', 'promo2_since_year'):
            df1[column] = df1[column].astype('int64')

        return df1

    def feature_engineering(self, df2):
        """Derive calendar, competition and promo features; drop closed-store rows.

        New columns are added to ``df2`` in place. The returned frame is a
        filtered copy (rows with ``open == 0`` removed, helper columns dropped)
        that keeps the original index labels of the surviving rows.
        """
        # calendar parts
        df2['year'] = df2['date'].dt.year
        df2['month'] = df2['date'].dt.month
        df2['day'] = df2['date'].dt.day
        df2['week_of_year'] = df2['date'].dt.isocalendar().week
        df2['year_week'] = df2['date'].dt.strftime('%Y-%W')

        # competition since: first day of the month in which the competitor opened
        df2['competition_since'] = pd.to_datetime(pd.DataFrame({
            'year': df2['competition_open_since_year'],
            'month': df2['competition_open_since_month'],
            'day': 1,
        }))
        df2['competition_time_month'] = ((df2['date'] - df2['competition_since']) / 30).dt.days.astype('int64')

        # promo since: Monday of the "<year>-<week>" promo2 start, shifted back one week
        df2['promo_since'] = df2['promo2_since_year'].astype(str) + '-' + df2['promo2_since_week'].astype(str)
        df2['promo_since'] = df2['promo_since'].apply(
            lambda x: dtt.datetime.strptime(x + '-1', '%Y-%W-%w') - dtt.timedelta(days=7))
        df2['promo_time_week'] = ((df2['date'] - df2['promo_since']) / 7).dt.days.astype('int64')

        # assortment: 'a' -> basic, 'b' -> extra, anything else -> extended
        df2['assortment'] = df2['assortment'].apply(
            lambda x: 'basic' if x == 'a' else 'extra' if x == 'b' else 'extended')

        # state holiday: 'a' / 'b' / 'c' -> named holiday, anything else -> regular_day
        df2['state_holiday'] = df2['state_holiday'].apply(
            lambda x: 'public_holiday' if x == 'a' else 'easter' if x == 'b' else 'christmas' if x == 'c' else 'regular_day')

        # closed stores are not scored
        df2 = df2[df2['open'] != 0]

        cols_drop = ['open', 'promo_interval', 'month_map']
        df2 = df2.drop(cols_drop, axis=1)

        return df2

    def data_preparation(self, df5):
        """Scale, encode and add cyclical features; return the model's feature columns.

        The stored transformers are applied with ``transform`` so a request is
        scaled and encoded exactly like the training data.
        """
        # competition distance
        df5['competition_distance'] = self.competition_distance_scaler.transform(df5[['competition_distance']].values)

        # year
        df5['year'] = self.year_scaler.transform(df5[['year']].values)

        # competition time month
        df5['competition_time_month'] = self.competition_time_month_scaler.transform(df5[['competition_time_month']].values)

        # promo time week
        df5['promo_time_week'] = self.promo_time_week_scaler.transform(df5[['promo_time_week']].values)

        # state_holiday - one hot encoding
        df5 = pd.get_dummies(df5, prefix=['state_holiday'], columns=['state_holiday'])

        # store_type - label encoding with the mapping learned at training time
        df5['store_type'] = self.store_type_scaler.transform(df5['store_type'])

        # assortment - ordinal encoding
        assortment_dict = {'basic': 1,
                           'extended': 3,
                           'extra': 2}
        df5['assortment'] = df5['assortment'].map(assortment_dict)

        # cyclical encodings: (sin, cos) of the value scaled by 2*pi / cycle length
        for feature, cycle in (('month', 12), ('day', 30), ('week_of_year', 52), ('day_of_week', 7)):
            df5[feature + '_sin'] = df5[feature].apply(lambda x: np.sin(x * (2 * np.pi / cycle)))
            df5[feature + '_cos'] = df5[feature].apply(lambda x: np.cos(x * (2 * np.pi / cycle)))

        cols_selected = [
            'store',
            'promo',
            'store_type',
            'assortment',
            'competition_distance',
            'competition_open_since_month',
            'competition_open_since_year',
            'promo2',
            'promo2_since_week',
            'promo2_since_year',
            'competition_time_month',
            'promo_time_week',
            'month_cos',
            'month_sin',
            'day_sin',
            'day_cos',
            'week_of_year_cos',
            'week_of_year_sin',
            'day_of_week_sin',
            'day_of_week_cos']

        return df5[cols_selected]

    def get_prediction(self, model, original_data, test_data):
        """Predict with ``model`` and return the input rows plus 'prediction' as JSON.

        Predictions are mapped back from the log1p scale with ``np.expm1``.
        When ``test_data`` has fewer rows than ``original_data`` (closed-store
        rows were filtered out upstream), only the scored rows are returned,
        matched through ``test_data.index``. Otherwise ``original_data``
        receives the new column in place.
        """
        # prediction
        pred = np.expm1(model.predict(test_data))

        if len(pred) != len(original_data):
            # Assigning a shorter array would raise; keep only the rows that
            # were actually scored.
            original_data = original_data.loc[test_data.index].copy()

        # join pred into the original data
        original_data['prediction'] = pred

        return original_data.to_json(orient = 'records', date_format='iso')

## 8.2 API Handler

In [None]:
from flask import Flask, request, Response
from rossmann.Rossmann import Rossmann

# loading model
# NOTE: pickle.load executes code embedded in the file — only load model
# artifacts produced by this project.
# NOTE(review): the notebook saves the model as 'model_rossmann_xgb.pkl';
# this loads 'model_rossmann_final2.pkl'. Confirm which artifact is intended.
with open('/Users/jota_/Repositorios/DS_producao/model_rossmann_final2.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# initialize api
app = Flask( __name__ )

@app.route('/rossmann/predict', methods=['POST'])
def rossmann_predict():
    """Score the store records posted as JSON and return them with a 'prediction' field.

    Accepts either a single JSON object (one record) or a JSON array of
    records. An empty or missing payload yields an empty JSON object. Both
    paths answer with status 200 and the application/json content type.
    """
    test_json = request.get_json()

    if not test_json:
        return Response('{}', status=200, mimetype='application/json')

    if isinstance(test_json, dict):  # single example
        test_raw = pd.DataFrame(test_json, index=[0])
    else:  # multiple examples
        test_raw = pd.DataFrame(test_json, columns=test_json[0].keys())

    # instantiate rossmann class
    # NOTE(review): this reloads the pickled scalers on every request; consider
    # creating the pipeline once at module level.
    pipeline = Rossmann()

    # data cleaning
    df1 = pipeline.data_cleaning(test_raw)

    # feature engineering
    df2 = pipeline.feature_engineering(df1)

    # data preparation
    df3 = pipeline.data_preparation(df2)

    # prediction (already serialised to a JSON string)
    df_response = pipeline.get_prediction(model, test_raw, df3)

    # Wrap the JSON string so the client receives the same content type as on
    # the empty-payload path instead of Flask's default text/html.
    return Response(df_response, status=200, mimetype='application/json')

# Start Flask's built-in server when the file is run as a script.
# '0.0.0.0' makes the server listen on all network interfaces.
if __name__ == '__main__':
    app.run('0.0.0.0')

## 8.3 API Tester 

In [5]:
# Load the test dataset (test.csv) used to exercise the API.
# NOTE(review): absolute, machine-specific Windows path.
df10 = pd.read_csv(r'C:\Users\jota_\Repositorios\DS_producao\Datasets_ds\test.csv')

In [6]:
# merge test dataset + store attributes (left join keeps every test row)
df_test = pd.merge(df10, df_store_raw, how='left', on='Store')

# choose stores for prediction
df_test = df_test[df_test['Store'].isin([12, 22, 24])]
# remove closed days; rows with a missing 'Open' pass the `!= 0` test (NaN != 0 is True),
# so they are removed separately on the next line
df_test = df_test[df_test['Open'] != 0]
df_test = df_test[~df_test['Open'].isnull()]
# 'Id' is not one of the columns Rossmann.data_cleaning expects
df_test = df_test.drop('Id', axis=1)

In [7]:
# Convert the dataframe to a JSON array with one object per row.
data = json.dumps(df_test.to_dict(orient='records'))


In [9]:
# API call
url = 'https://rossmann-model-jl1.herokuapp.com/rossmann/predict'
header = {'Content-type': 'application/json'}

# Without a timeout, requests waits indefinitely if the service never answers.
# 60 seconds leaves room for a slow first response from the hosted app.
r = requests.post(url, data=data, headers=header, timeout=60)
print('Status Code {}'.format(r.status_code))

Status Code 200


In [10]:
# Rebuild a dataframe from the JSON records returned by the API; the column
# order follows the keys of the first record.
d1 = pd.DataFrame(r.json(),columns = r.json()[0].keys())

In [12]:
# Total predicted sales per store over the returned rows.
d2 = d1[['store', 'prediction']].groupby('store').sum().reset_index()

# One summary line per store.
for store_id, total in zip(d2['store'], d2['prediction']):
    print('Store Number {} will sell R${:,.2f} in the next 6 weeks'.format(store_id, total))

Store Number 12 will sell R$224,511.73 in the next 6 weeks
Store Number 22 will sell R$201,874.37 in the next 6 weeks
Store Number 24 will sell R$284,107.18 in the next 6 weeks
