# Problem: Predict the sales of a store.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
train_data = pd.read_csv('train_Bigmart_sales.csv')
test_data = pd.read_csv('test_Bigmart_sales.csv')


In [3]:
train_data.head(50)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
5,FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
6,FDO10,13.65,Regular,0.012741,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1,343.5528
7,FDP10,,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
8,FDH17,16.2,Regular,0.016687,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,1076.5986
9,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,,Tier 2,Supermarket Type1,4710.535


In [4]:
test_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [5]:
train_data['source'] = 'train'
test_data['source'] = 'test'
data = pd.concat([train_data,test_data],ignore_index=True)


In [7]:
print(train_data.shape,test_data.shape,data.shape)

(8523, 13) (5681, 12) (14204, 13)


In [8]:
data.head()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,train
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,train
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train


In [9]:
data.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,11765.0,14204.0,14204.0,14204.0,8523.0
mean,12.792854,0.065953,141.004977,1997.830681,2181.288914
std,4.652502,0.051459,62.086938,8.371664,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.71,0.027036,94.012,1987.0,834.2474
50%,12.6,0.054021,142.247,1999.0,1794.331
75%,16.75,0.094037,185.8556,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [11]:
#check missing values

data.apply(lambda x: sum(x.isnull()))


Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
source                          0
dtype: int64

In [12]:
# Filling missing values

data.Item_Outlet_Sales = data.Item_Outlet_Sales.fillna(data.Item_Outlet_Sales.mean())

In [13]:
data.Item_Weight = data.Item_Weight.fillna(data.Item_Weight.mean())

In [14]:
data['Outlet_Size'].value_counts()

Medium    4655
Small     3980
High      1553
Name: Outlet_Size, dtype: int64

In [15]:
data.Outlet_Size = data.Outlet_Size.fillna('Medium')

In [16]:
data.apply(lambda x: sum(x.isnull()))

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
source                       0
dtype: int64

In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            14204 non-null  object 
 1   Item_Weight                14204 non-null  float64
 2   Item_Fat_Content           14204 non-null  object 
 3   Item_Visibility            14204 non-null  float64
 4   Item_Type                  14204 non-null  object 
 5   Item_MRP                   14204 non-null  float64
 6   Outlet_Identifier          14204 non-null  object 
 7   Outlet_Establishment_Year  14204 non-null  int64  
 8   Outlet_Size                14204 non-null  object 
 9   Outlet_Location_Type       14204 non-null  object 
 10  Outlet_Type                14204 non-null  object 
 11  Item_Outlet_Sales          14204 non-null  float64
 12  source                     14204 non-null  object 
dtypes: float64(4), int64(1), object(8)
memory usag

In [19]:
# Item type combine:
data['Item_Identifier'].value_counts()
data['Item_Type_Combined'] = data['Item_Identifier'].apply(lambda x: x[0:2])
data['Item_Type_Combined'] = data['Item_Type_Combined'].map({'FD':'Food',
                                                             'NC':'Non-Consumable',
                                                             'DR':'Drinks'})
data['Item_Type_Combined'].value_counts()

Food              10201
Non-Consumable     2686
Drinks             1317
Name: Item_Type_Combined, dtype: int64

# Numerical and one-Hot coding of categorical variables

In [20]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encode = LabelEncoder()
# New variable for outlet
data['Outlet'] = encode.fit_transform(data['Outlet_Identifier'])
var_mod = ['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type','Item_Type_Combined','Outlet']
encode = LabelEncoder()
for i in var_mod:
    data[i] = encode.fit_transform(data[i])

In [21]:
# One Hot coding
data = pd.get_dummies(data, columns = ['Item_Fat_Content','Outlet_Location_Type',
                                       'Outlet_Size','Outlet_Type','Item_Type_Combined','Outlet'])

In [22]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Item_Outlet_Sales,source,Item_Fat_Content_0,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,FDA15,9.3,0.016047,Dairy,249.8092,OUT049,1999,3735.138,train,0,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,5.92,0.019278,Soft Drinks,48.2692,OUT018,2009,443.4228,train,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,17.5,0.01676,Meat,141.618,OUT049,1999,2097.27,train,0,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,19.2,0.0,Fruits and Vegetables,182.095,OUT010,1998,732.38,train,0,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,8.93,0.0,Household,53.8614,OUT013,1987,994.7052,train,0,...,0,1,0,0,0,0,0,0,0,0


In [23]:
data.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Item_Outlet_Sales            float64
source                        object
Item_Fat_Content_0             uint8
Item_Fat_Content_1             uint8
Item_Fat_Content_2             uint8
Item_Fat_Content_3             uint8
Item_Fat_Content_4             uint8
Outlet_Location_Type_0         uint8
Outlet_Location_Type_1         uint8
Outlet_Location_Type_2         uint8
Outlet_Size_0                  uint8
Outlet_Size_1                  uint8
Outlet_Size_2                  uint8
Outlet_Type_0                  uint8
Outlet_Type_1                  uint8
Outlet_Type_2                  uint8
Outlet_Type_3                  uint8
Item_Type_Combined_0           uint8
Item_Type_Combined_1           uint8
Item_Type_Combined_2           uint8
O

# Exporting data

In [24]:
import warnings
warnings.filterwarnings('ignore')
# Drop the columns which have been converted to diffrenet types
data.drop(['Item_Type','Outlet_Establishment_Year'], axis=1,inplace=True)

# divide into test and train
train = data.loc[data['source']=='train']
test = data.loc[data['source']=='test']

# drop unnecessary columns
test.drop(['Item_Outlet_Sales','source'],axis=1,inplace=True)
train.drop(['source'],axis=1,inplace=True)

#EXPORT files as modified versions:
train.to_csv('train_modified.csv',index=False)
test.to_csv('test_modified.csv',index=False)

# Model Building

In [25]:
# Reading modified data

train2 = pd.read_csv('train_modified.csv')
test2 = pd.read_csv('test_modified.csv')

In [26]:
train2.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Identifier,Item_Outlet_Sales,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Item_Fat_Content_3,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,FDA15,9.3,0.016047,249.8092,OUT049,3735.138,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,5.92,0.019278,48.2692,OUT018,443.4228,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,17.5,0.01676,141.618,OUT049,2097.27,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,19.2,0.0,182.095,OUT010,732.38,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,8.93,0.0,53.8614,OUT013,994.7052,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [27]:
test2.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Identifier,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Item_Fat_Content_3,Item_Fat_Content_4,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,FDW58,20.75,0.007565,107.8622,OUT049,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,FDW14,8.3,0.038428,87.3198,OUT017,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,NCN55,14.6,0.099575,241.7538,OUT010,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,FDQ58,7.315,0.015388,155.034,OUT017,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,FDY38,12.792854,0.118599,234.23,OUT027,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [28]:
X_train = train2.drop(['Item_Outlet_Sales','Outlet_Identifier','Item_Identifier'],axis=1)
y_train = train2.Item_Outlet_Sales

In [29]:
X_test = test2.drop(['Outlet_Identifier','Item_Identifier'], axis=1)

In [30]:
X_train.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Item_Fat_Content_3,Item_Fat_Content_4,Outlet_Location_Type_0,Outlet_Location_Type_1,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,9.3,0.016047,249.8092,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,5.92,0.019278,48.2692,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,17.5,0.01676,141.618,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,19.2,0.0,182.095,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,8.93,0.0,53.8614,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [31]:
y_train.head()

0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
4     994.7052
Name: Item_Outlet_Sales, dtype: float64

# GBM Model

In [42]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
reg = GradientBoostingRegressor(max_depth=15,min_samples_leaf=300)
reg.fit(X_train,y_train)

GradientBoostingRegressor(max_depth=15, min_samples_leaf=300)

In [43]:
y_pred = reg.predict(X_test)

In [44]:
y_pred

array([1573.98670521, 1346.5730504 ,  633.23767077, ..., 1778.30591851,
       3689.20081984, 1355.25889861])

In [45]:
tree_accuracy = round(reg.score(X_train,y_train),2)
tree_accuracy

0.64

In [46]:
r2_score(y_train,reg.predict(X_train))

0.6353609807583667

In [47]:
import warnings
warnings.filterwarnings('ignore')
# Measuring accuracy
cv_score = cross_val_score(reg, X_train, y_train, cv=5)
print(np.sqrt(np.abs(cv_score)))

[0.77761159 0.76293999 0.75379004 0.77658647 0.77552269]


In [48]:
print('RMSE : %.4g' %np.sqrt(mean_squared_error(y_train,reg.predict(X_train))))

RMSE : 1030


In [49]:
submission = pd.DataFrame({'Item_Identifier':test2['Item_Identifier'],'Outlet_Identifier':test2['Outlet_Identifier'],
                          'Item_Outlet_Sales': y_pred},columns = ['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])

In [50]:
submission.to_csv('submission2.csv',index=False)

In [51]:
sub = pd.read_csv('submission2.csv')

In [52]:
sub.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1573.986705
1,FDW14,OUT017,1346.57305
2,NCN55,OUT010,633.237671
3,FDQ58,OUT017,2324.730782
4,FDY38,OUT027,5683.192828
