### Data Exploration and Preprocessing

In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Read the CSV, then discard its leading column before any analysis.
# NOTE(review): the first column is presumably a saved index/identifier — confirm against Data.csv.
raw_frame = pd.read_csv('Data.csv')
leading_column = raw_frame.columns[0]
train_data = raw_frame.drop(columns=leading_column)

train_data.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [85]:
# Column dtypes and non-null counts; columns with fewer non-null rows than the index have gaps.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                7060 non-null   float64
 1   Item_Fat_Content           8523 non-null   object 
 2   Item_Visibility            8523 non-null   float64
 3   Item_Type                  8523 non-null   object 
 4   Item_MRP                   8523 non-null   float64
 5   Outlet_Identifier          8523 non-null   object 
 6   Outlet_Establishment_Year  8523 non-null   int64  
 7   Outlet_Size                6113 non-null   object 
 8   Outlet_Location_Type       8523 non-null   object 
 9   Outlet_Type                8523 non-null   object 
 10  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 732.6+ KB


In [86]:

# Count the missing values in every column.
train_data.isna().sum()

Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [87]:
# Fill the two incomplete columns: the numeric weight gets the column mean,
# the categorical outlet size gets its most frequent label.
mean_item_weight = train_data['Item_Weight'].mean()
most_common_size = train_data['Outlet_Size'].mode()[0]

train_data['Item_Weight'] = train_data['Item_Weight'].fillna(mean_item_weight)
train_data['Outlet_Size'] = train_data['Outlet_Size'].fillna(most_common_size)

In [88]:
# Re-check the non-null counts now that Item_Weight and Outlet_Size have been imputed.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                8523 non-null   float64
 1   Item_Fat_Content           8523 non-null   object 
 2   Item_Visibility            8523 non-null   float64
 3   Item_Type                  8523 non-null   object 
 4   Item_MRP                   8523 non-null   float64
 5   Outlet_Identifier          8523 non-null   object 
 6   Outlet_Establishment_Year  8523 non-null   int64  
 7   Outlet_Size                8523 non-null   object 
 8   Outlet_Location_Type       8523 non-null   object 
 9   Outlet_Type                8523 non-null   object 
 10  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 732.6+ KB


In [89]:
# Preview the first rows after imputation.
train_data.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


### Encode the categorical variables

#### Fix Item_Fat_Content Column

In [90]:
# Frequency of each raw fat-content label, to reveal inconsistent spellings.
train_data['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64

In [91]:
# Canonicalize the labels: trim whitespace, lowercase, then expand the abbreviations.
train_data['Item_Fat_Content'] = (
    train_data['Item_Fat_Content']
    .str.strip()
    .str.lower()
    .replace({'lf': 'low fat', 'reg': 'regular'})
)

In [92]:
# Frequency of each fat-content label after cleaning.
train_data['Item_Fat_Content'].value_counts()

Item_Fat_Content
low fat    5517
regular    3006
Name: count, dtype: int64

### Conclusion = Apply Ordinal Encoding
Order = (low fat, regular)

#### Fix Item_Type column

In [93]:
# Frequency of each item type.
train_data['Item_Type'].value_counts()

Item_Type
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: count, dtype: int64

### Conclusion = Apply One Hot Encoding

#### Fix Outlet_Identifier column

In [94]:
# Frequency of each outlet identifier. (A bare `.unique()` call used to precede
# this line; its result was never displayed because only the last expression of
# a cell is rendered, so it has been dropped.)
train_data['Outlet_Identifier'].value_counts()

Outlet_Identifier
OUT027    935
OUT013    932
OUT035    930
OUT049    930
OUT046    930
OUT045    929
OUT018    928
OUT017    926
OUT010    555
OUT019    528
Name: count, dtype: int64

### Conclusion = Apply One Hot Encoding

#### Fix Outlet_Size column

In [95]:
# Frequency of each outlet size label.
train_data['Outlet_Size'].value_counts()

Outlet_Size
Medium    5203
Small     2388
High       932
Name: count, dtype: int64

### Conclusion = Apply Ordinal Encoding
order = (Small, Medium, High)

#### Fix Outlet_Location_Type column

In [96]:
# Frequency of each outlet location tier.
train_data['Outlet_Location_Type'].value_counts()

Outlet_Location_Type
Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: count, dtype: int64

### Conclusion = Apply Ordinal Encoding
order = (Tier 3, Tier 2, Tier 1)

#### Fix Outlet_Type column

In [97]:
# Frequency of each outlet type.
train_data['Outlet_Type'].value_counts()

Outlet_Type
Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: count, dtype: int64

### Conclusion = Apply Ordinal Encoding
order = (Grocery Store, Supermarket Type3, Supermarket Type2, Supermarket Type1)

### Apply column Transformer

In [98]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [99]:

# Ordered levels for every ordinally-encoded column; the first level maps to 0.
ordinal_levels = {
    'Item_Fat_Content': ['low fat', 'regular'],
    'Outlet_Size': ['Small', 'Medium', 'High'],
    'Outlet_Location_Type': ['Tier 3', 'Tier 2', 'Tier 1'],
    'Outlet_Type': ['Grocery Store', 'Supermarket Type3', 'Supermarket Type2', 'Supermarket Type1'],
}
ordinal_encoder = OrdinalEncoder(categories=list(ordinal_levels.values()))

# Ordinal columns first, then one-hot columns, then scaled numerics; every other
# column (including the target) is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('trnf1', ordinal_encoder, list(ordinal_levels)),
        ('trnf2', OneHotEncoder(sparse_output=False), ['Item_Type', 'Outlet_Identifier']),
        ('scaler', StandardScaler(), ['Item_MRP', 'Outlet_Establishment_Year']),
    ],
    remainder='passthrough',
)

# NOTE(review): the transformer is fitted on the full frame before the train/test
# split, so the scaler statistics also include rows that later land in the test set.
transform_data = transformer.fit_transform(train_data)

In [100]:
# Wrap the transformer output in a DataFrame labelled with the original column
# names (the "<step>__" prefix added by ColumnTransformer is stripped).
feature_names = transformer.get_feature_names_out()
short_names = [name.split('__')[-1] for name in feature_names]

# np.asarray keeps this cell safe to re-run: if transform_data is already a
# DataFrame, passing it straight to pd.DataFrame with the prefixed names would
# reindex on labels that no longer exist and return an all-NaN frame.
transform_data = pd.DataFrame(np.asarray(transform_data), columns=short_names)
transform_data.head()

Unnamed: 0,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,...,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Item_MRP,Outlet_Establishment_Year,Item_Weight,Item_Visibility,Item_Outlet_Sales
0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.747454,0.139541,9.3,0.016047,3735.138
1,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.489023,1.334103,5.92,0.019278,443.4228
2,0.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.01004,0.139541,17.5,0.01676,2097.27
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.66005,0.020085,19.2,0.0,732.38
4,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.39922,-1.293934,8.93,0.0,994.7052


## Model Building

### Train Test Split

In [101]:
from sklearn.model_selection import train_test_split

# Pick the target by name instead of by position: the column order comes from
# the ColumnTransformer (remainder columns last), so positional slicing would
# silently pick the wrong target if the preprocessing step changes.
target_column = 'Item_Outlet_Sales'
X = transform_data.drop(columns=target_column)
y = transform_data[target_column]

# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6818, 34), (1705, 34), (6818,), (1705,))

### Implement Linear Regression

In [102]:
from sklearn.linear_model import LinearRegression

# Baseline: least-squares fit on the training split, then predict the hold-out rows.
model_LR = LinearRegression().fit(X_train, y_train)
y_pred_LR = model_LR.predict(X_test)


In [103]:
from sklearn.metrics import r2_score, root_mean_squared_error

# Score the linear model's predictions on the hold-out rows.
rmse_LR = root_mean_squared_error(y_test, y_pred_LR)
r2_LR = r2_score(y_test, y_pred_LR)
print("root_mean_squared_error = ", rmse_LR)
print("R2 Score = ", r2_LR)


root_mean_squared_error =  1185.5814410162104
R2 Score =  0.5446811350184797


### Implement RandomForestRegressor

In [104]:
from sklearn.ensemble import RandomForestRegressor

# Depth-limited random forest. random_state is fixed (same seed as the
# train/test split) so the bootstrap samples and feature draws, and therefore
# the reported metrics, are reproducible across kernel restarts.
model_RFR = RandomForestRegressor(max_depth=10, random_state=2)
model_RFR.fit(X_train, y_train)
y_pred_RFR = model_RFR.predict(X_test)

In [105]:
from sklearn.metrics import r2_score, root_mean_squared_error

# Score the random forest's predictions on the hold-out rows.
rmse_RFR = root_mean_squared_error(y_test, y_pred_RFR)
r2_RFR = r2_score(y_test, y_pred_RFR)
print("root_mean_squared_error = ", rmse_RFR)
print("R2 Score = ", r2_RFR)

root_mean_squared_error =  1142.9610371889494
R2 Score =  0.5768291813270469
