## Importing Dependencies and Libraries

In [17]:
import lightgbm as ltb
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split

In [18]:
# Load the orders dataset from the CSV file in the working directory
dataset_file = 'number of orders.csv'
data = pd.read_csv(dataset_file)

In [19]:
# Preview the first five rows of the dataset
data.head(5)

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


In [20]:
# Summarize the frame's structure: number of rows, column names,
# non-null counts, dtypes and approximate memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188340 entries, 0 to 188339
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ID             188340 non-null  object 
 1   Store_id       188340 non-null  int64  
 2   Store_Type     188340 non-null  object 
 3   Location_Type  188340 non-null  object 
 4   Region_Code    188340 non-null  object 
 5   Date           188340 non-null  object 
 6   Holiday        188340 non-null  int64  
 7   Discount       188340 non-null  object 
 8   #Order         188340 non-null  int64  
 9   Sales          188340 non-null  float64
dtypes: float64(1), int64(3), object(6)
memory usage: 14.4+ MB


In [21]:
# Count the missing values in each column
data.isna().sum()

ID               0
Store_id         0
Store_Type       0
Location_Type    0
Region_Code      0
Date             0
Holiday          0
Discount         0
#Order           0
Sales            0
dtype: int64

In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,Store_id,Holiday,#Order,Sales
count,188340.0,188340.0,188340.0,188340.0
mean,183.0,0.131783,68.205692,42784.327982
std,105.366308,0.338256,30.467415,18456.708302
min,1.0,0.0,0.0,0.0
25%,92.0,0.0,48.0,30426.0
50%,183.0,0.0,63.0,39678.0
75%,274.0,0.0,82.0,51909.0
max,365.0,1.0,371.0,247215.0


In [23]:
# Preview the last five rows of the dataset
data.tail(5)

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
188335,T1188336,149,S2,L3,R2,2019-05-31,1,Yes,51,37272.0
188336,T1188337,153,S4,L2,R1,2019-05-31,1,No,90,54572.64
188337,T1188338,154,S1,L3,R2,2019-05-31,1,No,56,31624.56
188338,T1188339,155,S3,L1,R2,2019-05-31,1,Yes,70,49162.41
188339,T1188340,152,S2,L1,R1,2019-05-31,1,No,47,37977.0


### Distribution of Number of Orders Received

In [24]:
# Share of dataset rows belonging to each store type.
# Note: value_counts() counts rows per category; it does not sum "#Order".
store_type_counts = data["Store_Type"].value_counts()

# Only the two array-like arguments are plotted, so the full frame is not passed.
fig = px.pie(
    values=store_type_counts.values,
    names=store_type_counts.index,
    title="Share of records by store type",
)
fig.show()

In [25]:
# Share of dataset rows belonging to each location type.
# Note: value_counts() counts rows per category; it does not sum "#Order".
location_type_counts = data["Location_Type"].value_counts()

# Only the two array-like arguments are plotted, so the full frame is not passed.
fig = px.pie(
    values=location_type_counts.values,
    names=location_type_counts.index,
    title="Share of records by location type",
)
fig.show()

In [26]:
# Share of dataset rows with and without a discount.
# Note: value_counts() counts rows per category; it does not sum "#Order".
discount_counts = data["Discount"].value_counts()

# Only the two array-like arguments are plotted, so the full frame is not passed.
fig = px.pie(
    values=discount_counts.values,
    names=discount_counts.index,
    title="Share of records by discount",
)
fig.show()

In [27]:
# Share of dataset rows that fall on holidays versus regular days.
# Note: value_counts() counts rows per category; it does not sum "#Order".
holiday_counts = data["Holiday"].value_counts()

# "Holiday" is a 0/1 flag; give the slices readable labels instead of raw digits.
holiday_labels = holiday_counts.index.map({0: "Non-holiday", 1: "Holiday"})

# Only the two array-like arguments are plotted, so the full frame is not passed.
fig = px.pie(
    values=holiday_counts.values,
    names=holiday_labels,
    title="Share of records by holiday flag",
)
fig.show()

### Training and Testing the Model

Prepare the data so that we can train a machine learning model to predict the number of orders.

In [28]:
# Encode the categorical columns as integers on a copy of `data`, leaving the
# raw frame untouched. Mapping the columns of `data` in place would make this
# cell non-idempotent: on a second run the already-encoded values are absent
# from the mapping keys, so Series.map would turn every one of them into NaN.
model_data = data.assign(
    Discount=data["Discount"].map({"No": 0, "Yes": 1}),
    Store_Type=data["Store_Type"].map({"S1": 1, "S2": 2, "S3": 3, "S4": 4}),
    Location_Type=data["Location_Type"].map(
        {"L1": 1, "L2": 2, "L3": 3, "L4": 4, "L5": 5}
    ),
)

# dropna() returns a new frame rather than modifying in place, so the result
# must be assigned. Any label missing from the mappings above becomes NaN and
# its row is removed here.
model_data = model_data.dropna()

# Build features and target from the same cleaned frame so their rows stay aligned
x = np.array(model_data[["Store_Type", "Location_Type", "Holiday", "Discount"]])
y = np.array(model_data["#Order"])

In [29]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Fit a LightGBM regressor with its default hyperparameters on the training
# split. `ltb` comes from the import cell at the top of the notebook, so a
# missing dependency surfaces before any data is loaded.
model = ltb.LGBMRegressor()
model.fit(xtrain, ytrain)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15
[LightGBM] [Info] Number of data points in the train set: 150672, number of used features: 4
[LightGBM] [Info] Start training from score 68.163401


In [32]:
# Predict order counts for the held-out rows
ypred = model.predict(xtest)

# Keep the predictions in their own frame; reusing the name `data` here would
# overwrite the loaded dataset and break any earlier cell that is re-run.
predictions = pd.DataFrame(data={"Predicted Orders ": ypred.flatten()})
print(predictions.head())

   Predicted Orders 
0          47.351897
1          97.068717
2          66.577788
3          85.143083
4          54.451098
