In [55]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

In [56]:
big_data = pd.read_csv("Train.csv")

In [57]:
big_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [17]:
big_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [58]:
big_data.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [None]:
# Fill the NaN cells with either the mode or the mean of their column

In [None]:
# Fill the Item_Weight column with its mean

In [59]:
mean_value_weight_column = big_data['Item_Weight'].mean()

In [60]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# big_data["Item_Weight"] selection: an in-place call on a selected column is
# not guaranteed to write through to big_data (pandas warns about this pattern,
# and with Copy-on-Write enabled it leaves the frame unchanged).
big_data["Item_Weight"] = big_data["Item_Weight"].fillna(mean_value_weight_column)

In [61]:
big_data.isnull().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [None]:
# Fill the missing Outlet_Size values based on Outlet_Type, since the two are related

In [63]:
# Work out the most frequent "Outlet_Size" for every "Outlet_Type"; these
# modes are what the missing sizes get filled with afterwards.
def _most_frequent(values):
    """Return the first entry of ``values.mode()``."""
    return values.mode()[0]


# Result is a one-row table: the row is labelled "Outlet_Size" and there is
# one column per Outlet_Type.
mode_of_Outlet_size = big_data.pivot_table(
    values='Outlet_Size',
    columns='Outlet_Type',
    aggfunc=_most_frequent,
)

In [64]:
# Show the most frequent Outlet_Size per Outlet_Type. The table is named
# `mode_of_Outlet_size` (capital "O" in "Outlet"); names are case-sensitive.
mode_of_Outlet_size

Outlet_Type,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
Outlet_Size,Small,Small,Medium,Medium


### There are 4 outlet types (`Outlet_Type`)
    
- Among the Grocery Store outlets, the most common Outlet_Size is Small.
- Among the Supermarket Type1 outlets, the most common Outlet_Size is Small.
- Among the Supermarket Type2 outlets, the most common Outlet_Size is Medium.
- Among the Supermarket Type3 outlets, the most common Outlet_Size is Medium.


In [32]:
missing_values = big_data["Outlet_Size"].isnull()

In [None]:
# We locate (.loc) all the missing values in the outlet_size column, 
# and replace them with the corresponding mode values of the outlet_type 

In [67]:
# Look up each missing row's Outlet_Type in the mode table and write the
# matching size back. Selecting the "Outlet_Size" row first gives a Series
# keyed by Outlet_Type, so .map() returns one scalar per row (indexing the
# table as mode_of_Outlet_size[x] hands back a whole column object per row).
# NOTE(review): an Outlet_Type absent from the mode table maps to NaN here
# rather than raising KeyError -- the null check that follows would reveal it.
size_by_outlet_type = mode_of_Outlet_size.loc["Outlet_Size"]
big_data.loc[missing_values, "Outlet_Size"] = (
    big_data.loc[missing_values, "Outlet_Type"].map(size_by_outlet_type)
)

In [69]:
big_data.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [None]:
# NOTE: 
#     - In the Item_Fat_Content column the labels are inconsistent: "Low Fat", "LF" and "low fat"
#     all belong to the same category
#     - "reg" and "Regular" are likewise the same category
    
#     The data appears to have been entered inconsistently, so we need to unify these labels
    

In [70]:
big_data['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64

In [71]:
# Collapse the alternative spellings onto the two canonical labels.
fat_content_aliases = {
    "low fat": "Low Fat",
    "LF": "Low Fat",
    "reg": "Regular",
}
big_data["Item_Fat_Content"] = big_data["Item_Fat_Content"].replace(fat_content_aliases)

In [72]:
big_data['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5517
Regular    3006
Name: count, dtype: int64

##### Label Encoding - Take all the string categorical values and encode into numerical values 

### CATEGORICAL COLUMNS 

1. Item_Identifier
2. Item_Fat_Content
3. Item_Type
4. Outlet_Identifier
5. Outlet_Size
6. Outlet_Location_Type
7. Outlet_Type

In [73]:
encoder = LabelEncoder()

In [75]:
# Turn the first group of text columns into integer codes. fit_transform
# refits the shared encoder on every call, so each column gets its own mapping.
for column_name in (
    "Item_Identifier",
    "Item_Fat_Content",
    "Item_Type",
    "Outlet_Identifier",
):
    big_data[column_name] = encoder.fit_transform(big_data[column_name])


In [77]:
# Turn the remaining outlet text columns into integer codes, refitting the
# shared encoder for each column in turn.
for column_name in ("Outlet_Size", "Outlet_Location_Type", "Outlet_Type"):
    big_data[column_name] = encoder.fit_transform(big_data[column_name])

In [78]:
big_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,156,9.3,0,0.016047,4,249.8092,9,1999,1,0,1,3735.138
1,8,5.92,1,0.019278,14,48.2692,3,2009,1,2,2,443.4228
2,662,17.5,0,0.01676,10,141.618,9,1999,1,0,1,2097.27
3,1121,19.2,1,0.0,6,182.095,0,1998,2,2,0,732.38
4,1297,8.93,0,0.0,9,53.8614,1,1987,0,2,1,994.7052


In [80]:
# Separate the prediction target (y) from the predictor columns (X).
y = big_data["Item_Outlet_Sales"]
X = big_data.drop(columns=["Item_Outlet_Sales"])

In [81]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [82]:
# Model Training 

regressor = XGBRegressor()

In [83]:
regressor.fit(X_train, Y_train)

In [84]:
# Evaluation 

In [89]:
# Predict on the same rows the model was fitted on.
training_data_prediction = regressor.predict(X_train)

# R-squared value - Train Data
r2_train = metrics.r2_score(Y_train, training_data_prediction)

# R-squared is the share of the target's variance explained by the predictions:
# 1.0 is a perfect fit, 0.0 is no better than always predicting the mean, and it
# can be negative for a model that does worse than that. It is not an accuracy
# percentage. A score on training rows is an optimistic estimate.

In [90]:
r2_train

0.8762174618111388

In [91]:
# Predict on the held-out rows.
test_data_prediction = regressor.predict(X_test)

# R-squared value - Test Data
r2_test = metrics.r2_score(Y_test, test_data_prediction)

# Same reading as for the training score: 1.0 is a perfect fit, 0.0 matches
# always predicting the mean, and negative values are possible. Compare with
# r2_train -- a test score well below the training score points to overfitting.
r2_test

0.5017253991620692