# Case Study: Demand forecasting
# Context: Your client is a meal delivery company that operates in multiple cities. It has several fulfillment centers in these cities for dispatching meal orders to customers. The client wants you to help these centers forecast demand for the upcoming weeks so that they can plan their stock of raw materials accordingly.
# Most raw materials are replenished on a weekly basis and, because the raw materials are perishable, procurement planning is of utmost importance. Accurate demand forecasts are also very helpful for staffing the centers. Given the following information, the task is to predict the demand for the next 10 weeks (Weeks: 146-155) for the center-meal combinations in the test set:
# Historical data of demand for a product-center combination (Weeks: 1 to 145)
# Product(Meal) features such as category, sub-category, current price and discount
# Information for fulfillment center like center area, city information etc.

## Content:
## Weekly Demand data (train.csv): Contains the historical demand data for all centers
## fulfilment_center_info.csv: Contains information for each fulfillment center
## meal_info.csv: Contains information for each meal being served.

# Import the Relevant libraries and Packages

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as pyoff
import plotly.io as pio
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from xgboost import XGBRegressor
from xgboost import plot_importance
from warnings import simplefilter

# Load the Dataset into dataframe

In [None]:
# Load the four CSV inputs from a single data directory. The directory defaults
# to the original location and can be overridden with the
# PRODUCT_DEMAND_DATA_DIR environment variable, so the notebook can run on
# another machine without editing every path.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get('PRODUCT_DEMAND_DATA_DIR',
                               r'C:\Users\kgupta\Desktop\Product_Demand_Data'))

df_fulfil_data = pd.read_csv(DATA_DIR / 'fulfilment_center_info.csv')
df_meal_data = pd.read_csv(DATA_DIR / 'meal_info.csv')
df_train_data = pd.read_csv(DATA_DIR / 'train.csv')
df_test_data = pd.read_csv(DATA_DIR / 'test.csv')

In [None]:
# Preview the first five training rows (head() defaults to n=5)
df_train_data.head()

In [None]:
# Report (rows, columns) for each loaded frame
shape_report = [
    ('The rows and columns of Fulfilment_data:', df_fulfil_data),
    ('The rows and columns of Meal_data:', df_meal_data),
    ('The rows and columns of Training_data:', df_train_data),
    ('The rows and columns of Testing_data:', df_test_data),
]
for label, frame in shape_report:
    print(label, frame.shape)

# Merge the Meal and Fulfillment Information into the training data

In [None]:
# Attach center attributes (keyed on center_id), then meal attributes
# (keyed on meal_id), to every training row.
merge1 = df_train_data.merge(df_fulfil_data, on='center_id', how="inner")
df = merge1.merge(df_meal_data, on='meal_id', how="inner")

In [None]:
# Preview the first five merged rows (head() defaults to n=5)
df.head()

# Check the null values from dataframe

In [None]:
# Number of missing values per column (isna is the same check as isnull)
df.isna().sum()

# Check duplicated data

In [None]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

# Study the columns and their relationship

In [None]:
# Print column names, non-null counts and dtypes of the merged frame
df.info()

In [None]:
# Share of rows per cuisine, expressed as a percentage
data = df['cuisine'].value_counts(normalize=True).mul(100)

In [None]:
# Move the cuisine labels out of the index into a regular column
data = data.reset_index()

In [None]:
# Name the two columns positionally (label first, share second). The label that
# reset_index() gives the share column depends on the pandas version
# ('proportion' on 2.x, 'cuisine' on 1.x), so a rename keyed on 'proportion'
# can silently do nothing and break the pie chart that follows.
data.columns = ['cuisine', 'Percentage']

In [None]:
# Pie chart of the percentage of rows per cuisine
fig = px.pie(data, names='cuisine', values='Percentage', height=500, width=800)
pyoff.iplot(fig)
fig.show('notebook')

# Check which center type the most orders come from

In [None]:
# Percentage of rows per center type, with the center type moved from the
# index into a regular column
center_share = df['center_type'].value_counts(normalize=True).mul(100)
data_f = center_share.to_frame().reset_index()

In [None]:
# Name the two columns positionally (label first, share second). The label that
# reset_index() gives the share column depends on the pandas version
# ('proportion' on 2.x, 'center_type' on 1.x), so a rename keyed on
# 'proportion' can silently do nothing.
data_f.columns = ['center_type', 'Percentage']
data_f.columns

# Create a Pie chart for percentage of orders from which center type

In [None]:
# Pie chart of the percentage of rows per center type
fig = px.pie(data_f, names='center_type', values='Percentage', height=500, width=800)
pyoff.iplot(fig)
fig.show("notebook")

# Check the maximum orders placed for which category

In [None]:
# Percentage of rows per meal category, shown as a pie chart.
# rename_axis + reset_index(name=...) fixes both column names explicitly, so the
# result does not depend on how the installed pandas version labels the output
# of value_counts().
df_cat = (
    df['category']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('category')
    .reset_index(name='Percentage')
)
fig = px.pie(df_cat, values='Percentage', names='category', width=800, height=500)
pyoff.iplot(fig)
fig.show("notebook")

# Check for relationship between week and num of orders sold

In [None]:
# Total number of orders per week across all centers and meals.
# data_ord keeps the week as its index and a single 'num_orders' column.
data_ord = df.groupby(['week'])['num_orders'].sum()
data_ord = pd.DataFrame(data_ord)
plot_data = go.Scatter(x = data_ord.index, y = data_ord['num_orders'], name = 'Time Series for num_orders',
           marker = dict(color = 'Blue'))
plot_layout = go.Layout(title = 'Total orders per week', yaxis_title = 'Total Orders', xaxis_title = 'Week')
Fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(Fig)
# Show the figure built in this cell; the original called show() on `fig`,
# which still held the figure from an earlier cell.
Fig.show("notebook")

In [None]:
# For every level of each categorical column, plot that level's weekly order total.
cat_var = ['center_type', 'category', 'cuisine']
for i in cat_var:
    # Group by the bare column name so each key is the level value itself, and
    # take the matching rows straight from the groupby. This avoids picking
    # the key apart with list()/strip(), which only works when keys are 1-tuples.
    for level, s_data in df.groupby(i):
        label = str(level)
        tot_orders = s_data.groupby('week')['num_orders'].sum().to_frame()
        data_plot = [go.Scatter(x = tot_orders.index, y = tot_orders['num_orders'],
                                name = 'Time series for num_orders for:' + label,
                                marker = dict(color = 'Blue'))]
        data_layout = go.Layout(title = 'Total order per week:' + label,
                                yaxis_title = 'Total Orders', xaxis_title = 'Week')
        fig = go.Figure(data = data_plot, layout = data_layout)
        pyoff.iplot(fig)
        fig.show("notebook")

# Check the total number of orders for each meal category by center type

In [None]:
# Total orders per (category, center_type) pair, drawn as bars coloured by category.
# NOTE(review): the chart title says "cuisine" but the grouping is by category — confirm intent.
grp = df.groupby(['category', 'center_type'])['num_orders'].sum().reset_index()
fig = px.bar(
    grp,
    x='center_type',
    y='num_orders',
    color='category',
    title='Total Orders for particular cuisine from each location',
)
pyoff.iplot(fig)
fig.show("notebook")

# Check the total number of orders for each meal category by fulfillment center

In [None]:
# Total orders per (category, center_id) pair, drawn as bars coloured by category.
# NOTE(review): the chart title says "cuisine" but the grouping is by category — confirm intent.
grps = df.groupby(['category','center_id'])['num_orders'].sum()
grps = pd.DataFrame(grps)
grps = grps.reset_index()
fig_1 = px.bar(grps, x = 'center_id', y = 'num_orders', color = 'category',title = 'Total Orders for each cuisine from center location')
pyoff.iplot(fig_1)
# Show the figure built in this cell; the original called show() on `fig`,
# which still held the figure from an earlier cell.
fig_1.show("notebook")

# Compare the weekly order totals of the different center types

In [None]:
# Overlay one weekly-orders line per center type in a single figure.
cat_var = ['center_type']
for i in cat_var:
    data_plot = []
    # Group by the bare column name so each key is the level value itself, and
    # take the matching rows straight from the groupby.
    for level, s_data in df.groupby(i):
        tot_orders = s_data.groupby('week')['num_orders'].sum().to_frame()
        data_plot.append(go.Scatter(x = tot_orders.index, y = tot_orders['num_orders'],
                                    name = str(level)))
    data_layout = go.Layout(title = 'Total order per week based on center type', yaxis_title = 'Total Orders', xaxis_title = 'Week')
    fig = go.Figure(data = data_plot, layout = data_layout)
    pyoff.iplot(fig)
    fig.show("notebook")

# Check whether the columns are correlated

In [None]:
# Correlation heatmap over the columns left after removing the three text columns
df_copy = df.drop(columns=['center_type', 'category', 'cuisine'])
cormat = df_copy.corr()
fig = plt.figure(figsize=(12, 5))
sns.heatmap(cormat, annot=True)
plt.show()

# Inferences: Checkout_price, base_price show negative correlation while emailer_promotion and homepage_feature show positive correlation with num_orders.

In [None]:
# Table of total orders: one row per category, one column per cuisine;
# combinations that never occur are filled with 0.
grped_data = df.groupby(['category', 'cuisine'])['num_orders'].sum()
cat_cu_sc = grped_data.unstack('cuisine').fillna(0)
cat_cu_sc

# Create a bar plot between category, cuisine and number of Orders

In [None]:
# Grouped bar chart of the category x cuisine table
ax = cat_cu_sc.plot.bar(figsize=(7, 5), grid=True)
ax.set_ylabel('count')

# Graph between Number of Orders and Checkout Price

In [None]:
# Scatter of number of orders (x) against checkout price (y)
plt.scatter(df['num_orders'], df['checkout_price'])
plt.xlabel('Number of Orders')
plt.ylabel('Checkout Price')
plt.show()

# Graph between Number of Orders and Base Price

In [None]:
# Scatter of number of orders (x) against base price (y)
plt.scatter(df['num_orders'], df['base_price'])
plt.xlabel('Number of Orders')
plt.ylabel('Base Price')
plt.show()

# Converting Categorical data to numerical for Model use

In [None]:
# One-hot encode center_type and cuisine (first level of each dropped) and
# append the indicator columns to a fresh-indexed copy of the merged frame.
X_copy = pd.get_dummies(df[['center_type', 'cuisine']], drop_first=True).reset_index(drop=True)
df_copy = pd.concat([df.reset_index(drop=True), X_copy], axis=1)


# Drop the Categorical columns

In [None]:
# Remove the original text columns
text_columns = ['category', 'cuisine', 'center_type']
df_copy = df_copy.drop(text_columns, axis=1)

In [None]:
# Separate the feature frame from the target vector
X = df_copy.copy()
y = df_copy['num_orders'].to_numpy(copy=True)

# Drop the y feature from X

In [None]:
# Remove the target, the row id and the one-hot indicator columns from X
excluded_cols = [
    'num_orders',
    'id',
    'center_type_TYPE_B',
    'center_type_TYPE_C',
    'cuisine_Indian',
    'cuisine_Italian',
    'cuisine_Thai',
]
X = X.drop(columns=excluded_cols)

# Transform the data onto a common scale that is robust to outliers

In [None]:
X_scaled = preprocessing.RobustScaler().fit_transform(X)

# Split the data into training and test set

In [None]:
# Divide the data into train and test data
X_train, X_test,y_train, y_test = model_selection.train_test_split(X_scaled, y, test_size = 0.30, random_state = 42)

In [None]:
# Instantiate the XGBoost regressor.
# "reg:squarederror" is the current name of the squared-error objective;
# "reg:linear" is its deprecated alias.
xgb = XGBRegressor(objective="reg:squarederror", random_state=42)

In [None]:
# Train the XGBoost model on the training split
xgb.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the XGBoost model
y_pred = xgb.predict(X_test)

In [None]:
# Mean squared error of the XGBoost predictions on the held-out rows
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the root of the mean squared error on the held-out rows
xgb_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error for XGBoost:', xgb_rmse)

In [None]:
# Overlay actual and predicted values for the held-out rows, in row order
plt.figure(figsize=(17, 8))
for series, line_style in ((y_test, {'label': "actual"}),
                           (y_pred, {'color': 'red', 'label': "predicted"})):
    plt.plot(series, **line_style)
plt.legend(loc='best')
plt.show()

# Plot the Feature Importance

In [None]:
# Horizontal bar chart of the XGBoost feature importances, labelled by X's columns
feature_importances = pd.Series(xgb.feature_importances_)
index = X.columns
new_frame = pd.DataFrame(
    {'Feature_Importances': list(feature_importances)},
    index=index,
)
new_frame.plot(kind='barh')

# Instantiate the Random Forest regressor

In [None]:
# Instantiate the Random Forest regressor (100 trees, fixed seed)
from sklearn import ensemble
Rfr = ensemble.RandomForestRegressor(random_state=42, n_estimators=100)

In [None]:
# Train the Random Forest on the training split
Rfr.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the Random Forest (overwrites the earlier y_pred)
y_pred = Rfr.predict(X_test)

In [None]:
# Report the root of the mean squared error on the held-out rows
rf_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error for Random Forest Regressor:', rf_rmse)

In [None]:
# Overlay actual and predicted values for the held-out rows, in row order
plt.figure(figsize= (12,7))
plt.plot(y_test, label = 'actual')
plt.plot(y_pred, label = 'predicted', color ='red')
plt.legend(loc ='best')
# Render the figure; the original ended with plt.plot(), which plots nothing
# and does not display the figure.
plt.show()

In [None]:
# Bar chart of the forest's feature importances; the error bars are the
# standard deviation of each feature's importance across the individual trees.
importances = Rfr.feature_importances_
per_tree_importances = [estimator.feature_importances_ for estimator in Rfr.estimators_]
std = np.std(per_tree_importances, axis=0)
forest_importances = pd.Series(importances, index=X.columns)
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set(title="Feature importances using MDI", ylabel="Mean decrease in impurity")
fig.tight_layout()

# Instantiate the Decision Tree regressor

In [None]:
# Instantiate the Decision Tree regressor with a fixed seed
from sklearn import tree
dt_settings = {'random_state': 42}
Dtr = tree.DecisionTreeRegressor(**dt_settings)

In [None]:
# Train the Decision Tree on the training split
Dtr.fit(X_train,y_train)

In [None]:
# Predict the held-out rows with the Decision Tree (overwrites the earlier y_pred)
y_pred = Dtr.predict(X_test)

In [None]:
# Report the root of the mean squared error on the held-out rows
dt_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error for Decision Tree:', dt_rmse)

# Implementing the Random Forest on required dataset

In [None]:
# Keep only the listed input columns plus the target
training_columns = [
    'week', 'center_id', 'meal_id', 'checkout_price', 'base_price',
    'emailer_for_promotion', 'homepage_featured', 'num_orders',
]
training = df.loc[:, training_columns]

In [None]:
# Display the reduced training frame
training

In [None]:
# Build the final train/test matrices from the SAME named feature columns.
# Selecting the test columns by name, instead of dropping 'id' and trusting the
# CSV column order, guarantees that each matrix column means the same thing in
# both sets; a missing column raises a KeyError instead of silently misaligning.
feature_cols = [col for col in training.columns if col != 'num_orders']
X_train = training[feature_cols].values
y_train = training['num_orders'].values
X_test = df_test_data[feature_cols].values

In [None]:
# Refit a Random Forest on all training rows and predict the test rows;
# the predictions are wrapped in a one-column frame (column label 0).
from sklearn import ensemble
Rfr = ensemble.RandomForestRegressor(random_state=42, n_estimators=100)
Rfr.fit(X_train, y_train)
y_pred = Rfr.predict(X_test)
ypred = pd.DataFrame(data=y_pred)

In [None]:
# Attach the predictions to the test rows by matching row index
predictions = df_test_data.merge(ypred, how='inner', left_index=True, right_index=True)

In [None]:
# Preview the first five prediction rows (head() defaults to n=5)
predictions.head()

In [None]:
# Copy the prediction column (label 0) into a properly named 'num_orders' column
predictions = predictions.assign(num_orders=predictions[0])

In [None]:
# Remove the original prediction column, which is labelled 0
predictions = predictions.drop(0, axis='columns')

In [None]:
# Total predicted orders per test week, as a one-column frame indexed by week
ts_ord_pred = predictions.groupby(['week'])['num_orders'].sum().to_frame()

In [None]:
# Plot the historical weekly totals (blue) and the predicted weekly totals
# (red) on one chart.
actual_trace = go.Scatter(
    x=data_ord.index,
    y=data_ord['num_orders'],
    name='Actual',
    marker=dict(color='Blue'),
)
predicted_trace = go.Scatter(
    x=ts_ord_pred.index,
    y=ts_ord_pred['num_orders'],
    name='Predicted',
    marker=dict(color='Red'),
)
plot_data = [actual_trace, predicted_trace]
plot_layout = go.Layout(
    title='Total Orders per week',
    yaxis_title='Total Orders',
    xaxis_title='Week',
)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)