In [1]:
import pickle
import numpy as np
import pandas as pd
import json
import joblib
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
seed = 21 #Setting seed.


def _read_csv_with_dates(path):
    '''
    Read a CSV file and convert its 'Date' column to timestamps.

    The column is parsed after loading with an explicit '%Y-%m-%d' format
    instead of through read_csv's `date_parser` argument, which newer pandas
    releases deprecate. The resulting column is the same datetime64 data.
    '''
    frame = pd.read_csv(path)
    frame['Date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')
    return frame


# Daily records: the training set and the set to predict on.
df_train = _read_csv_with_dates('/content/drive/My Drive/Rossmann Sales Forecasting/train.csv')

df_submit = _read_csv_with_dates('/content/drive/My Drive/Rossmann Sales Forecasting/test.csv')

# Per-store attributes, merged onto the daily records on 'Store'.
df_store = pd.read_csv('/content/drive/My Drive/Rossmann Sales Forecasting/store.csv')

# Store -> state lookup table.
df_store_states = pd.read_csv('/content/drive/My Drive/Rossmann Sales Forecasting/store_states.csv')

In [3]:
# Preview the first rows of the store -> state table.
df_store_states.head()

Unnamed: 0,Store,State
0,1,HE
1,2,TH
2,3,NW
3,4,BE
4,5,SN


In [4]:
#Creating a dictionary for assigning state in predict function.
# Built in a single pass over the two columns instead of one .iloc lookup per
# row; .values keeps the keys/values as the same numpy scalars as before.
store_state_dict = dict(zip(df_store_states['Store'].values,
                            df_store_states['State'].values))

In [5]:
# Attach the per-store attributes to every training row (left join on 'Store').
# The state is not merged in here; it is looked up through store_state_dict.
X = df_train.merge(df_store, how='left', on='Store')

In [6]:
# Load the fitted preprocessing pipeline. The file is opened in a `with` block
# so the handle is closed as soon as unpickling finishes.
# NOTE: unpickling can execute arbitrary code - only load artifacts you trust.
with open('/content/drive/My Drive/Rossmann Sales Forecasting/preprocessor_base.pkl', 'rb') as preprocessor_file:
    preprocessor = pickle.load(preprocessor_file)

In [7]:
# Load the trained model whose .predict() is called by the prediction functions.
# NOTE(review): presumably a LightGBM regressor, judging by the file name - confirm.
model = joblib.load('/content/drive/My Drive/Rossmann Sales Forecasting/lgb_model.pkl')

In [8]:
def _load_json_artifact(file_name):
    '''
    Load one JSON lookup table from the project folder.

    The file is read inside a `with` block so the handle is closed
    immediately, instead of being left open by a bare open() call.
    '''
    path = f'/content/drive/My Drive/Rossmann Sales Forecasting/{file_name}'
    with open(path, 'r', encoding='utf-8') as artifact_file:
        return json.load(artifact_file)


# Per-store lookup tables; the prediction functions index them with the store
# id formatted as a string.
sales_per_day_dict = _load_json_artifact('sales_per_day_dict_no_zeros')
customers_per_day_dict = _load_json_artifact('customers_per_day_dict_no_zeros')
sales_per_customers_per_day_dict = _load_json_artifact('sales_per_customers_per_day_dict_no_zeros')

acceleration_dict = _load_json_artifact('acceleration_dict_no_zeros')

freq2_dict = _load_json_artifact('freq2_dict_no_zeros')
freq3_dict = _load_json_artifact('freq3_dict_no_zeros')
amp2_dict = _load_json_artifact('amp2_dict_no_zeros')
amp3_dict = _load_json_artifact('amp3_dict_no_zeros')

In [9]:
#defining metric.
def rmspe(y, yhat):
    '''
    Root mean squared percentage error between targets and predictions.

    Parameters
    ----------
    y : scalar or array-like
        True values.
    yhat : scalar or array-like
        Predicted values; broadcast against `y`.

    Returns
    -------
    float
        sqrt(mean(((y - yhat) / y) ** 2)) computed over the entries whose
        target is non-zero. Entries with a zero target are left out because
        their percentage error is undefined (division by zero). If no entry
        is left to score, 0.0 is returned.
    '''
    y_true, y_pred = np.broadcast_arrays(np.asarray(y, dtype=float),
                                         np.asarray(yhat, dtype=float))

    # Zero targets would turn the score into NaN/inf, so they are skipped.
    scoreable = y_true != 0
    if not scoreable.any():
        return 0.0

    pct_error = (y_true[scoreable] - y_pred[scoreable]) / y_true[scoreable]
    return np.sqrt(np.mean(pct_error ** 2))

In [None]:
# Scratch cell (never executed - it has no execution count): bare tuples of
# column names that are evaluated and discarded, so nothing is assigned here.
# NOTE(review): presumably the feature columns the preprocessor expects - confirm.
'Store', 'DayOfWeek', 'Promo','StateHoliday', 'SchoolHoliday','StoreType', 'Assortment', 'CompetitionDistance','CompetitionOpenSinceMonth', 
'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'Year','Month', 'Day', 'WeekOfYear', 'DayOfYear','SalesPerDay',
'Customers_per_day', 'Sales_Per_Customers_Per_Day', 'PromoInterval0', 'PromoInterval1', 'PromoInterval2', 'PromoInterval3','Acceleration','State',
'Promo_before_days','Promo_after_days','Frequency_2', 'Frequency_3', 'Amplitude_2','Amplitude_3','Mean_TemperatureC','Events'

In [47]:
# Month abbreviations as they appear in 'PromoInterval', mapped to month numbers.
_PROMO_MONTH_TO_NUM = {
    'Jan': 1,
    'Feb': 2,
    'Mar': 3,
    'Apr': 4,
    'May': 5,
    'Jun': 6,
    'Jul': 7,
    'Aug': 8,
    'Sept': 9,
    'Oct': 10,
    'Nov': 11,
    'Dec': 12,
}


def _promo_interval_to_month_numbers(promo_interval, n_slots=4):
    '''
    Convert a 'Jan,Apr,Jul,Oct' style string into `n_slots` month numbers.

    A missing interval (anything that is not a string) and unknown month
    names produce NaN, so the result always has exactly `n_slots` entries.
    '''
    month_names = promo_interval.split(',') if isinstance(promo_interval, str) else []
    month_names = (month_names + [np.nan] * n_slots)[:n_slots]
    return [_PROMO_MONTH_TO_NUM.get(name, np.nan) for name in month_names]


def final_func_1(X):
    '''
    Predict sales for a single raw store/day record.

    All the preprocessing is done in this function: the engineered features
    are added to the row and the fitted `preprocessor` pipeline is applied
    before the row is handed to `model`.

    Parameters
    ----------
    X : pd.Series
        One row of the daily data merged with the store table. It must hold
        at least 'Open', 'Store', 'Date', 'PromoInterval', 'Promo2SinceWeek'
        and 'Promo2SinceYear'.

    Returns
    -------
    0 when 'Open' is not 1 (closed store or missing flag), otherwise whatever
    `model.predict` returns for the preprocessed single-row frame.

    Notes
    -----
    Relies on the module-level `preprocessor`, `model`, `store_state_dict`
    and the per-store JSON lookup dictionaries, and reads the weather CSV of
    the store's state from disk on every call.
    '''
    # Closed stores sell nothing, so no model call is needed.
    if X['Open'] != 1:
        return 0

    row = pd.DataFrame(data=[X.values], columns=X.index)
    store = row['Store'].iloc[0]
    store_key = f'{store}'  # the JSON lookup tables are keyed by string ids

    #Adding date features.
    row['Date'] = pd.to_datetime(row['Date'])
    row['Year'] = row['Date'].dt.year
    row['Month'] = row['Date'].dt.month
    row['Day'] = row['Date'].dt.day
    # ISO week number taken from the timestamp itself; the `.dt.weekofyear`
    # accessor used previously no longer exists in recent pandas releases.
    row['WeekOfYear'] = row['Date'].iloc[0].isocalendar()[1]
    row['DayOfYear'] = row['Date'].dt.dayofyear

    row['SalesPerDay'] = sales_per_day_dict[store_key]
    row['Customers_per_day'] = customers_per_day_dict[store_key]
    row['Sales_Per_Customers_Per_Day'] = sales_per_customers_per_day_dict[store_key]

    #Splitting PromoInterval into parts. For ex: (Jan,March,May) --> (Jan), (March), (May).
    # Done on the scalar so that a missing interval yields four NaN columns
    # instead of failing on a column-count mismatch.
    month_numbers = _promo_interval_to_month_numbers(row['PromoInterval'].iloc[0])
    for slot, month_number in enumerate(month_numbers):
        row[f'PromoInterval{slot}'] = month_number

    #Removing PromoInterval feature as no further use now.
    row = row.drop(columns=['PromoInterval'])

    row['Acceleration'] = acceleration_dict[store_key]
    row['State'] = store_state_dict[store]

    row['Promo_before_days'] = np.nan
    row['Promo_after_days'] = np.nan

    row['Frequency_2'] = freq2_dict[store_key]
    row['Frequency_3'] = freq3_dict[store_key]
    row['Amplitude_2'] = amp2_dict[store_key]
    row['Amplitude_3'] = amp3_dict[store_key]

    # Missing promo information is encoded as -1. NaN never compares equal to
    # anything (not even itself), so the check has to go through pd.isna; the
    # write uses .loc to avoid chained assignment.
    for column in ('Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval0'):
        if pd.isna(row[column].iloc[0]):
            row.loc[row.index[0], column] = -1

    # Join the weather observations of the store's state for this date.
    state = row['State'].iloc[0]
    weather = pd.read_csv(f'/content/drive/My Drive/Rossmann Sales Forecasting/weather data/{state}.csv',
                          sep=';')
    weather['Date'] = pd.to_datetime(weather['Date'], format='%Y-%m-%d')
    row = pd.merge(row, weather, how='left', on='Date')

    # 'Date' and 'Open' are dropped before the row reaches the pipeline.
    row = row.drop(columns=['Date', 'Open'])

    features = preprocessor.transform(row)

    return model.predict(features)

In [50]:
def final_func_2(X: pd.Series, target):
    '''
    Score the prediction for a single raw store/day record.

    The record goes through exactly the same preprocessing and model as in
    `final_func_1` (which this function calls, instead of carrying its own
    copy of that pipeline), and the result is compared with `target`.

    Parameters
    ----------
    X : pd.Series
        One raw row, in the form accepted by `final_func_1`.
    target
        The true sales value for that row.

    Returns
    -------
    The value of `rmspe(target, prediction)`, where the prediction is 0 when
    the store is not open.
    '''
    prediction = final_func_1(X)

    return rmspe(target, prediction)

In [22]:
# Separate the target column from the merged training frame.
y = X['Sales']
df = X.drop(columns=['Sales'])

In [23]:
# List the raw input columns that remain after removing the target.
df.columns

Index(['Store', 'DayOfWeek', 'Date', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval'],
      dtype='object')

In [53]:
# Take one training row and its target for a quick check of the functions.
# Both lookups are positional so they always refer to the same record
# (`y[1]` was label-based and only matched because the index is 0..n-1).
a = df.iloc[1]
a_target = y.iloc[1]

In [57]:
#Predicting on train data.
t0 = time.time()
train_prediction = final_func_1(a)
print(f'Predict is {train_prediction}')
print(f'Time taken is {time.time()-t0} seconds.')

Predict is [5921.82410226]
Time taken is 0.06006145477294922 seconds.


In [56]:
#Metric on train data.
t0 = time.time()
train_score = final_func_2(a, a_target)
print(f'RMSPE score is {train_score}')
print(f'Time taken is {time.time() - t0} seconds.')

RMSPE score is 0.023445893426296032
Time taken is 0.05997133255004883 seconds.


In [51]:
# Test records joined with the store attributes; the submission 'Id' column
# is removed from the frame.
d_submit = (
    df_submit
    .merge(df_store, how='left', on='Store')
    .drop(columns=['Id'])
)

In [52]:
#Predicting on test data.
t0 = time.time()
test_prediction = final_func_1(d_submit.iloc[1])
print(test_prediction)
print(f'Time taken is {time.time()-t0} seconds.')

[7333.20625737]
Time taken is 0.5567615032196045 seconds.
