In [1]:
import warnings
warnings.filterwarnings("ignore")

# loading packages
# basic + dates 
import numpy as np
import pandas as pd
from pandas import datetime

import logging
import os 

import dvc.api
import pickle
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [2]:
def log (path,file):
    '''Creating a log file to record the project's logs'''
    #check if file exists
    log_file= os.path.join(path,file)

    if not os.path.isfile(log_file):
        open(log_file,"w+").close()

    console_logging_format = "%(levelname)s %(message)s"
    file_logging_format = "%(levelname)s: %(asctime)s: %(message)s"

    #configure logger
    logging.basicConfig(level=logging.INFO,format=console_logging_format)
    logger=logging.getLogger()

    #file handler for output file
    handler=logging.FileHandler(log_file)

    #set logging level for file
    handler.setLevel(logging.INFO)

    #logging format
    formatter=logging.Formatter(file_logging_format)
    handler.setFormatter(formatter)
    
    #add handlers to logger
    logger.addHandler(handler)

    return logger

In [3]:
#set logger file
#from logs import log
logger=log(path="../logs/",file="rossman_sales.logs")

#Loading datasets
train=pd.read_csv("../data/train.csv",low_memory=False)
test=pd.read_csv("../data/test.csv",low_memory=False)
store=pd.read_csv("../data/store.csv",low_memory=False)
store['CompetitionDistance'].fillna(store['CompetitionDistance'].median(), inplace = True)
train_store=pd.merge(train,store,how='inner',on='Store')
train_store.fillna(0,inplace=True)
train_store.sample()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
1014511,1113,3,2015-03-25,5359,590,1,0,0,0,a,c,9260.0,0.0,0.0,0,0.0,0.0,0


In [4]:
train_store['Date']=pd.to_datetime(train_store['Date'])
train_store=train_store.sort_values('Date')
train_store['Day']=train_store['Date'].dt.day
train_store['Month']=train_store['Date'].dt.month
train_store['Year']=train_store['Date'].dt.year
train_store['WeekOfYear'] = train_store['Date'].dt.weekofyear

# Machine Learning

**Preprocessing**

In [5]:
#drop Date and Open columns
#drop rows where Sales=0
train_store = train_store.drop(['Date','Open','PromoInterval'],axis=1)
train_store = train_store.loc[~(train_store['Sales'] == 0)]

INFO NumExpr defaulting to 4 threads.


In [6]:
train_store['DayOfWeek']=train_store['DayOfWeek'].apply(str)
train_store['Year']=train_store['Year'].apply(str)
train_store['Promo']=train_store['Promo'].apply(str)
train_store['Promo2']=train_store['Promo2'].apply(str)
train_store['SchoolHoliday']=train_store['SchoolHoliday'].apply(str)
train_store['StateHoliday']=train_store['StateHoliday'].apply(str)
train_store['WeekOfYear']=train_store['WeekOfYear'].apply(str)

In [7]:
train_store.to_csv('../data/train_store.csv',header=True)

In [8]:
path = 'data/train_store.csv'
repo = "../"
version = "'v2'"
# return to normal tag version and print in markdown

data_url = dvc.api.get_url(
    path=path,
    repo=repo,
)

mlflow.set_experiment('Rossmann Pharmeceutical Sales Forecasting')

if __name__ == '__main__':
    warnings.filterwarnings("ignore")
    np.random.seed(40)
    df = pd.read_csv('../data/train_store.csv', index_col=0)
    mlflow.log_param('data_url', data_url)
    mlflow.log_param('data_version', version)
    mlflow.log_param('input_rows', df.shape[0])
    mlflow.log_param('input_cols', df.shape[1])

<h3>Pipeline</h3>

In [9]:
train,test = train_test_split(train_store,test_size=0.2)
train,val = train_test_split(train,test_size=0.2)

print('Train set:',len(train))
print('Validation set:',len(val))
print('Test set:',len(test))

Train set: 540376
Validation set: 135094
Test set: 168868


In [10]:
train_x=train.drop(['Sales'],axis=1)
test_x=test.drop(['Sales'],axis=1)
train_y=train[['Sales']]
test_y=test[['Sales']]

In [11]:
cat_cols = ['StateHoliday','SchoolHoliday','DayOfWeek','Promo','Promo2','Assortment','StoreType','Year','WeekOfYear']
num_cols = [i for i in train_x.columns if i not in cat_cols]

num_transformer = Pipeline(steps = [('imp', IterativeImputer(initial_strategy='median')),
                                    ('scaler', StandardScaler())])

cat_transformer = Pipeline(steps = [('imp', SimpleImputer(strategy='most_frequent')),
                                    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[('cat',cat_transformer,cat_cols)])

<h3>Regression models</h3>

**Random Forest Regression**

In [12]:
rand=Pipeline(steps=[('preprocessor',preprocessor),('random_forest', RandomForestRegressor(max_depth=10,random_state=2))],verbose=True)
rand_forest=rand.fit(train_x,train_y.values)
#predictions for validation data
rand_pred=rand_forest.predict(val)

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=  10.3s
[Pipeline] ..... (step 2 of 2) Processing random_forest, total=11.6min


ValueError: X has 19 features, but ColumnTransformer is expecting 18 features as input.

**Decision Tree Regression**

In [None]:
tree=Pipeline(steps=[('preprocessor',preprocessor),('decision_tree',DecisionTreeRegressor(splitter='random', max_depth=10, random_state=2))],verbose=True)
dtree=tree.fit(train_x,train_y.values)
#predictions for validation data
dtree_pred=dtree.predict(val)

**SGD Regression**

In [None]:
sgdreg=Pipeline(steps=[('preprocessor',preprocessor),('sdg_reg',SGDRegressor(eta0=0.1,fit_intercept=False,shuffle=False,learning_rate='adaptive',random_state=2))],verbose=True)
sgd=sgdreg.fit(train_x,train_y.values)
#predictions for validation data
sgd_pred=sgd.predict(val)

**Serialising**

In [None]:
models=[rand_forest,dtree,sgd]
with open("../pickle/30-07-2021-20-51-03-00.pkl.", "wb") as f:
    for model in models:
         pickle.dump(model, f)

<h3>Loss function</h3>

In [None]:
#loss function
def loss_function(actual,pred):
    rmse=np.sqrt(mean_squared_error(actual,pred))
    mae=mean_absolute_error(actual,pred)
    r2=r2_score(actual,pred)
    return rmse,mae,r2

In [None]:
mlflow.log_metric('RandomForest',loss_function(test_y,rand_pred))
mlflow.log_metric('DecisionTree',loss_function(test_y,dtree_pred))
mlflow.log_metric('SGDRegression',loss_function(test_y,sgd_pred))

mlflow.sklearn.log_model(rand_forest,'random_forest v2')
mlflow.sklearn.log_model(dtree,'decision_tree v2')
mlflow.sklearn.log_model(sgd,'sgd_regressor v2')