# Big Mart Sale Prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
# Combine test and train into one file

train["source"] = "train"
test["source"] = "test"
data = pd.concat([train,test],ignore_index=True)

In [4]:
print(train.shape,test.shape,data.shape)

(8523, 13) (5681, 12) (14204, 13)


In [5]:
data.tail()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
14199,FDB58,10.5,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,,test
14200,FDD47,7.6,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,,test
14201,NCO17,10.0,Low Fat,0.073529,Health and Hygiene,118.744,OUT045,2002,,Tier 2,Supermarket Type1,,test
14202,FDJ26,15.3,Regular,0.0,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1,,test
14203,FDU37,9.5,Regular,0.10472,Canned,79.796,OUT045,2002,,Tier 2,Supermarket Type1,,test


In [6]:
data.duplicated().count

<bound method Series.count of 0        False
1        False
2        False
3        False
4        False
         ...  
14199    False
14200    False
14201    False
14202    False
14203    False
Length: 14204, dtype: bool>

In [7]:
data.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,11765.0,14204.0,14204.0,14204.0,8523.0
mean,12.792854,0.065953,141.004977,1997.830681,2181.288914
std,4.652502,0.051459,62.086938,8.371664,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.71,0.027036,94.012,1987.0,834.2474
50%,12.6,0.054021,142.247,1999.0,1794.331
75%,16.75,0.094037,185.8556,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            14204 non-null  object 
 1   Item_Weight                11765 non-null  float64
 2   Item_Fat_Content           14204 non-null  object 
 3   Item_Visibility            14204 non-null  float64
 4   Item_Type                  14204 non-null  object 
 5   Item_MRP                   14204 non-null  float64
 6   Outlet_Identifier          14204 non-null  object 
 7   Outlet_Establishment_Year  14204 non-null  int64  
 8   Outlet_Size                10188 non-null  object 
 9   Outlet_Location_Type       14204 non-null  object 
 10  Outlet_Type                14204 non-null  object 
 11  Item_Outlet_Sales          8523 non-null   float64
 12  source                     14204 non-null  object 
dtypes: float64(4), int64(1), object(8)
memory usag

# Data cleaning

In [9]:
data.isnull().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
source                          0
dtype: int64

# Filling missing values

In [10]:
data.Item_Weight = data.Item_Weight.fillna(data.Item_Weight.mean())

In [11]:
data.Item_Outlet_Sales = data.Item_Outlet_Sales.fillna(data.Item_Outlet_Sales.mean())

In [12]:
data["Outlet_Size"].value_counts()

Medium    4655
Small     3980
High      1553
Name: Outlet_Size, dtype: int64

In [13]:
data.Outlet_Size = data.Outlet_Size.fillna("Medium")

In [14]:
data.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
source                       0
dtype: int64

In [15]:
# Item type combine
data["Item_Identifier"].value_counts()
data["Item_Type_Combined"] = data["Item_Identifier"].apply(lambda x: x[0:2])
data["Item_Type_Combined"] = data["Item_Type_Combined"].map({"FD":"Food","NC":"Non-Consumable","DR":"Drinks"})
data["Item_Type_Combined"].value_counts()

Food              10201
Non-Consumable     2686
Drinks             1317
Name: Item_Type_Combined, dtype: int64

# Numerical and One-Hot Coding of Categorical variables

In [16]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
le = LabelEncoder()

# New variable for outlet
data["Outlet"] = le.fit_transform(data["Outlet_Identifier"])
var_mod = ["Item_Fat_Content","Outlet_Location_Type","Outlet_Size","Item_Type_Combined","Outlet_Type","Outlet"]
le = LabelEncoder()
for i in var_mod:
    data[i] = le.fit_transform(data[i])

In [17]:
#One Hot Coding:
data = pd.get_dummies(data, columns=['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type','Item_Type_Combined','Outlet'])

In [18]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Item_Outlet_Sales,source,Item_Fat_Content_0,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,FDA15,9.3,0.016047,Dairy,249.8092,OUT049,1999,3735.138,train,0,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,5.92,0.019278,Soft Drinks,48.2692,OUT018,2009,443.4228,train,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,17.5,0.01676,Meat,141.618,OUT049,1999,2097.27,train,0,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,19.2,0.0,Fruits and Vegetables,182.095,OUT010,1998,732.38,train,0,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,8.93,0.0,Household,53.8614,OUT013,1987,994.7052,train,0,...,0,1,0,0,0,0,0,0,0,0


In [19]:
data.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Item_Outlet_Sales            float64
source                        object
Item_Fat_Content_0             uint8
Item_Fat_Content_1             uint8
Item_Fat_Content_2             uint8
Item_Fat_Content_3             uint8
Item_Fat_Content_4             uint8
Outlet_Location_Type_0         uint8
Outlet_Location_Type_1         uint8
Outlet_Location_Type_2         uint8
Outlet_Size_0                  uint8
Outlet_Size_1                  uint8
Outlet_Size_2                  uint8
Outlet_Type_0                  uint8
Outlet_Type_1                  uint8
Outlet_Type_2                  uint8
Outlet_Type_3                  uint8
Item_Type_Combined_0           uint8
Item_Type_Combined_1           uint8
Item_Type_Combined_2           uint8
O

# Exporting Data

In [20]:
import warnings
warnings.filterwarnings('ignore')
#Drop the columns which have been converted to different types:
data.drop(['Item_Type','Outlet_Establishment_Year'],axis=1,inplace=True)

In [21]:
#Divide into test and train:
train = data.loc[data['source']=="train"]
test = data.loc[data['source']=="test"]

In [22]:
#Drop unnecessary columns:
test.drop(['Item_Outlet_Sales','source'],axis=1,inplace=True)
train.drop(['source'],axis=1,inplace=True)

In [23]:
#Export files as modified versions:
train.to_csv("train_modified.csv",index=False)
test.to_csv("test_modified.csv",index=False)

# Model Building

In [24]:
# Reading modified data
train2 = pd.read_csv("train_modified.csv")
test2 = pd.read_csv("test_modified.csv")

In [25]:
train2.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Identifier,Item_Outlet_Sales,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Item_Fat_Content_3,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,FDA15,9.3,0.016047,249.8092,OUT049,3735.138,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,5.92,0.019278,48.2692,OUT018,443.4228,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,17.5,0.01676,141.618,OUT049,2097.27,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,19.2,0.0,182.095,OUT010,732.38,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,8.93,0.0,53.8614,OUT013,994.7052,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [26]:
X_train = train2.drop(['Item_Outlet_Sales', 'Outlet_Identifier','Item_Identifier'], axis=1)
y_train = train2.Item_Outlet_Sales

In [27]:
X_test = test2.drop(['Outlet_Identifier','Item_Identifier'], axis=1)

In [28]:
X_train.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Item_Fat_Content_3,Item_Fat_Content_4,Outlet_Location_Type_0,Outlet_Location_Type_1,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,9.3,0.016047,249.8092,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,5.92,0.019278,48.2692,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,17.5,0.01676,141.618,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,19.2,0.0,182.095,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,8.93,0.0,53.8614,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [29]:
y_train.head()

0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
4     994.7052
Name: Item_Outlet_Sales, dtype: float64

# Linear Regression Model

In [30]:
from sklearn.linear_model import  LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
y_pred

array([1848.53604783, 1472.81670435, 1875.65285894, ..., 1809.18796433,
       3565.6645235 , 1267.46171871])

In [31]:
from sklearn.metrics import accuracy_score, r2_score,mean_squared_error
from sklearn.model_selection import cross_val_score

In [32]:
lr_accuracy = round(regressor.score(X_train,y_train) * 100,2)
lr_accuracy

56.36

In [33]:
r2_score(y_train, regressor.predict(X_train))

0.5635892777270479

In [34]:
#Perform cross-validation:
cv_score = cross_val_score(regressor, X_train, y_train, cv=5, scoring="r2")
print(cv_score)

[0.57110717 0.55516376 0.54752996 0.56634024 0.56507437]


In [35]:
mse = mean_squared_error(y_train,regressor.predict(X_train))

In [36]:
RMSE = np.sqrt(mse)
print(RMSE)

1127.2712261909362


In [37]:
submission = pd.DataFrame({'Item_Identifier':test2['Item_Identifier'],
                           'Outlet_Identifier':test2['Outlet_Identifier'],
                           'Item_Outlet_Sales': y_pred},
                          columns=['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])

In [38]:
#submission.to_csv('Linear1.csv',index=False)

# Random Forest Model

In [39]:
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=100,max_depth=6, min_samples_leaf=50,n_jobs=4)
regressor.fit(X_train, y_train)

RandomForestRegressor(max_depth=6, min_samples_leaf=50, n_jobs=4)

In [40]:
# Predicting the test set results
y_pred = regressor.predict(X_test)
y_pred

array([1663.27112585, 1363.31395228,  590.4896251 , ..., 1963.49423975,
       3745.80937496, 1289.80029599])

In [41]:
rf_accuracy = round(regressor.score(X_train,y_train),2)
rf_accuracy

0.61

In [42]:
r2_score(y_train, regressor.predict(X_train))

0.6127759958585801

In [43]:
cv_score = cross_val_score(regressor, X_train, y_train, cv=5, scoring='r2')
print(cv_score)

[0.60815518 0.58680126 0.57622999 0.60922923 0.60775263]


In [44]:
mse = mean_squared_error(y_train,regressor.predict(X_train))
RMSE = np.sqrt(mse)
print(RMSE)

1061.8467723866054


In [45]:
submission = pd.DataFrame({
'Item_Identifier':test2['Item_Identifier'],
'Outlet_Identifier':test2['Outlet_Identifier'],
'Item_Outlet_Sales': y_pred
},columns=['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])

In [46]:
#submission.to_csv('submission3.csv',index=False)

# XGBoost

In [47]:
from sklearn.ensemble import GradientBoostingRegressor
regressor = GradientBoostingRegressor(n_estimators=100,max_depth=4)
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)
y_pred

array([1626.74159055, 1376.56188133,  641.2168557 , ..., 1780.97985926,
       3934.93320492, 1249.89250505])

In [48]:
GB_accuracy = round(regressor.score(X_train,y_train),2)
GB_accuracy

0.65

In [49]:
r2_score(y_train, regressor.predict(X_train))

0.6520578024386801

In [50]:
cv_score = cross_val_score(regressor, X_train, y_train, cv=5, scoring='r2')
print(cv_score)

[0.60731036 0.56981565 0.57296844 0.59530516 0.5973693 ]


In [51]:
mse = mean_squared_error(y_train,regressor.predict(X_train))
RMSE = np.sqrt(mse)
print(RMSE)

1006.5474816251743


In [52]:
submission = pd.DataFrame({
'Item_Identifier':test2['Item_Identifier'],
'Outlet_Identifier':test2['Outlet_Identifier'],
'Item_Outlet_Sales': y_pred
},columns=['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])

In [53]:
#submission.to_csv('XGB.csv',index=False)