# Problem: Predict the sales of a store.

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error

In [32]:
test = pd.read_csv('test_Bigmart_Sales.csv')
train = pd.read_csv('train_Bigmart_Sales.csv')

In [33]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [34]:
train['source'] = 'train'
test['source'] = 'test'

data = pd.concat([train,test],ignore_index=True)
print(data.shape,train.shape,test.shape)

(14204, 13) (8523, 13) (5681, 12)


In [35]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,train
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,train
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train


In [36]:
#check missing values
data.apply(lambda x: sum(x.isnull()))

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
source                          0
dtype: int64

In [11]:
data.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
source                        object
dtype: object

In [37]:
# Filling missing values
data.Item_Weight = data.Item_Weight.fillna(data.Item_Weight.mean())
data.Item_Outlet_Sales = data.Item_Outlet_Sales.fillna(data.Item_Outlet_Sales.mean())
data['Outlet_Size'].value_counts()

Medium    4655
Small     3980
High      1553
Name: Outlet_Size, dtype: int64

In [13]:
data.Outlet_Size = data.Outlet_Size.fillna('Medium')

In [38]:
data.apply(lambda x: sum(x.isnull()))

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
source                          0
dtype: int64

In [39]:
# # Item type combine:
data['Item_Identifier'].value_counts()
data['Item_Type_Identifier'] = data['Item_Identifier'].apply(lambda x: x[0:2])
data['Item_Type_Identifier'] = data['Item_Type_Identifier'].map({'FD':'Food','DR':'Drinks','NC':'Non-consumable'})
data['Item_Type_Identifier'].value_counts()

Food              10201
Non-consumable     2686
Drinks             1317
Name: Item_Type_Identifier, dtype: int64

In [41]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source,Item_Type_Identifier
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,train,Food
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train,Drinks
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,train,Food
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,train,Food
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train,Non-consumable


In [42]:
# Coding Items
encode = LabelEncoder()
data['Outlet'] = encode.fit_transform(data['Outlet_Identifier'])
var_mode = ['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type','Item_Type_Identifier','Outlet']
encode = LabelEncoder()
for i in var_mode:
    data[i] = encode.fit_transform(data[i])

In [43]:
data = pd.get_dummies(data,columns=['Item_Fat_Content','Outlet_Location_Type',
                                    'Outlet_Size','Outlet_Type','Item_Type_Identifier','Outlet'])

In [44]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Item_Outlet_Sales,source,Item_Fat_Content_0,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,FDA15,9.3,0.016047,Dairy,249.8092,OUT049,1999,3735.138,train,0,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,5.92,0.019278,Soft Drinks,48.2692,OUT018,2009,443.4228,train,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,17.5,0.01676,Meat,141.618,OUT049,1999,2097.27,train,0,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,19.2,0.0,Fruits and Vegetables,182.095,OUT010,1998,732.38,train,0,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,8.93,0.0,Household,53.8614,OUT013,1987,994.7052,train,0,...,0,1,0,0,0,0,0,0,0,0


In [45]:
data.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Item_Outlet_Sales            float64
source                        object
Item_Fat_Content_0             uint8
Item_Fat_Content_1             uint8
Item_Fat_Content_2             uint8
Item_Fat_Content_3             uint8
Item_Fat_Content_4             uint8
Outlet_Location_Type_0         uint8
Outlet_Location_Type_1         uint8
Outlet_Location_Type_2         uint8
Outlet_Size_0                  uint8
Outlet_Size_1                  uint8
Outlet_Size_2                  uint8
Outlet_Size_3                  uint8
Outlet_Type_0                  uint8
Outlet_Type_1                  uint8
Outlet_Type_2                  uint8
Outlet_Type_3                  uint8
Item_Type_Identifier_0         uint8
Item_Type_Identifier_1         uint8
I

In [46]:
data.drop(columns=['Outlet_Size_3'],axis=1,inplace=True)
data.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Item_Outlet_Sales            float64
source                        object
Item_Fat_Content_0             uint8
Item_Fat_Content_1             uint8
Item_Fat_Content_2             uint8
Item_Fat_Content_3             uint8
Item_Fat_Content_4             uint8
Outlet_Location_Type_0         uint8
Outlet_Location_Type_1         uint8
Outlet_Location_Type_2         uint8
Outlet_Size_0                  uint8
Outlet_Size_1                  uint8
Outlet_Size_2                  uint8
Outlet_Type_0                  uint8
Outlet_Type_1                  uint8
Outlet_Type_2                  uint8
Outlet_Type_3                  uint8
Item_Type_Identifier_0         uint8
Item_Type_Identifier_1         uint8
Item_Type_Identifier_2         uint8
O

In [47]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Item_Outlet_Sales,source,Item_Fat_Content_0,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,FDA15,9.3,0.016047,Dairy,249.8092,OUT049,1999,3735.138,train,0,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,5.92,0.019278,Soft Drinks,48.2692,OUT018,2009,443.4228,train,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,17.5,0.01676,Meat,141.618,OUT049,1999,2097.27,train,0,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,19.2,0.0,Fruits and Vegetables,182.095,OUT010,1998,732.38,train,0,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,8.93,0.0,Household,53.8614,OUT013,1987,994.7052,train,0,...,0,1,0,0,0,0,0,0,0,0


In [48]:
# Saving data cleansing results for model building
import warnings
warnings.filterwarnings('ignore')
data.drop(columns=['Item_Type','Outlet_Establishment_Year'],axis=1,inplace=True)

train = data.loc[data['source']=='train']
test = data.loc[data['source']=='test']

test.drop(['Item_Outlet_Sales','source'],axis=1,inplace=True)
train.drop(['source'],axis=1,inplace=True)

train.to_csv('train1.csv',index=False)
test.to_csv('test1.csv',index=False)

In [49]:
train1 = pd.read_csv('train1.csv')
test1 = pd.read_csv('test1.csv')

In [50]:
train1.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Identifier,Item_Outlet_Sales,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Item_Fat_Content_3,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,FDA15,9.3,0.016047,249.8092,OUT049,3735.138,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,5.92,0.019278,48.2692,OUT018,443.4228,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,17.5,0.01676,141.618,OUT049,2097.27,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,19.2,0.0,182.095,OUT010,732.38,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,8.93,0.0,53.8614,OUT013,994.7052,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [53]:
# Model building

x_train = train1.drop(['Item_Outlet_Sales','Outlet_Identifier','Item_Identifier'],axis=1)
y_train = train1.Item_Outlet_Sales

In [54]:
x_test = test1.drop(['Outlet_Identifier','Item_Identifier'],axis=1)

In [55]:
x_train.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Item_Fat_Content_3,Item_Fat_Content_4,Outlet_Location_Type_0,Outlet_Location_Type_1,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,9.3,0.016047,249.8092,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,5.92,0.019278,48.2692,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,17.5,0.01676,141.618,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,19.2,0.0,182.095,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,8.93,0.0,53.8614,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [56]:
# Linear Regression Model

model = LinearRegression()
model.fit(x_train,y_train)

LinearRegression()

In [57]:
y_pred = model.predict(x_test)

In [59]:
y_pred

array([1848.53604783, 1472.81670435, 1875.65285894, ..., 1809.18796433,
       3565.6645235 , 1267.46171871])

In [61]:
model_accuracy = round(model.score(x_train,y_train)*100,2)
model_accuracy

56.36

In [64]:
r2_score(y_train,model.predict(x_train))*100

56.35892777270479

In [66]:
# RMSE score

print('RMSE: %.4g'%np.sqrt(mean_squared_error(y_train,model.predict(x_train))))

RMSE: 1127


In [67]:
saving_prediction_test = pd.DataFrame({'Item_Identifier':test1['Item_Identifier'],'Outlet_Identifier':test1['Outlet_Identifier'],
                                      'Item_Outlet_Sales':y_pred},columns=['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])

In [68]:
saving_prediction_test.to_csv('Bigmart_sales_linear_reg-final.csv',index=False)

In [69]:
sub = pd.read_csv('Bigmart_sales_linear_reg-final.csv')


In [70]:
sub.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1848.536048
1,FDW14,OUT017,1472.816704
2,NCN55,OUT010,1875.652859
3,FDQ58,OUT017,2593.64469
4,FDY38,OUT027,5181.558341
