In [7]:
## Typical Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
## Modeling & preprocessing import
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer,make_column_transformer,make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_squared_error

In [2]:
## Load dataset from published web view link
# pandas is already imported as `pd` in the imports cell at the top of the notebook.
fpath ="https://docs.google.com/spreadsheets/d/e/2PACX-1vS6Sn9LaMSc_E1EHQpuRK6BTpKp6h27obTP_dTpAVu_xtoqsge30jBGh9vYlO4DYe-utRKMgMqYChU_/pub?output=csv"
df = pd.read_csv(fpath)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df.info()
## Preview the first rows
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB
None


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
## Standardize the inconsistent Item_Fat_Content labels
# Alternate spellings (keys) mapped to the canonical label (values)
fat_content_map = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
fat_content_clean = df['Item_Fat_Content'].replace(fat_content_map)
df['Item_Fat_Content'] = fat_content_clean
## Check the remaining label counts
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [4]:
## Separate the target column from the features
target = 'Item_Outlet_Sales'
y = df[target].copy()

## Columns excluded from the feature matrix (identifiers and establishment year)
bad_cols = ['Item_Identifier','Outlet_Identifier','Outlet_Establishment_Year']
# One drop removes the target and the excluded columns; drop() returns a new
# frame, so df itself is left untouched.
X = df.drop(columns=[target, *bad_cols])

## Train/test split with a fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [5]:
## Create categorical pipeline
cat_selector = make_column_selector(dtype_include='object')
# categorical columns: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit are ignored at transform time)
impute_most_freq = SimpleImputer(strategy='most_frequent')
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4
# removed the old keyword -- switch to `sparse_output=False` when upgrading
# (confirm the installed version first).
encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)
cat_pipe = make_pipeline(impute_most_freq,encoder)
## Create numeric pipeline
num_selector = make_column_selector(dtype_include='number')
# numeric columns: fill missing values with the column mean, then standardize
impute_mean = SimpleImputer(strategy='mean')
scaler = StandardScaler()
num_pipe = make_pipeline(impute_mean, scaler)
## Combine into 1 column transformer
# verbose_feature_names_out=False keeps the output feature names free of the
# transformer-name prefix.
preprocessor = make_column_transformer((cat_pipe, cat_selector),
                                       (num_pipe, num_selector),
                                       verbose_feature_names_out=False)
preprocessor

# Modeling


## Model 1 - LinearRegression


In [6]:
## Make & Fit the modeling pipeline
# The preprocessor is fitted on the training data only, inside the pipeline.
pipe = make_pipeline(preprocessor, LinearRegression())
pipe.fit(X_train, y_train)
## Quick peek at the R^2 value for training and test data
# Both lines use the same ".3f" spec; the previous " .3f" on the test line
# carried a space sign-flag that printed an extra blank before the number.
print(f"Training R2: {pipe.score(X_train,y_train):.3f}")
print(f"Test R2: {pipe.score(X_test,y_test):.3f}")

Training R2: 0.561
Test R2:  0.566




# Extracting Values from a Modeling Pipeline


In [9]:
# Display the fitted modeling pipeline (preprocessor followed by LinearRegression)
pipe

In [10]:
# Step 0 of the pipeline is the fitted preprocessor; ask it for the names
# of the columns it outputs
fitted_preprocessor = pipe[0]
feature_names = fitted_preprocessor.get_feature_names_out()
feature_names

array(['Item_Fat_Content_Low Fat', 'Item_Fat_Content_Regular',
       'Item_Type_Baking Goods', 'Item_Type_Breads',
       'Item_Type_Breakfast', 'Item_Type_Canned', 'Item_Type_Dairy',
       'Item_Type_Frozen Foods', 'Item_Type_Fruits and Vegetables',
       'Item_Type_Hard Drinks', 'Item_Type_Health and Hygiene',
       'Item_Type_Household', 'Item_Type_Meat', 'Item_Type_Others',
       'Item_Type_Seafood', 'Item_Type_Snack Foods',
       'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods',
       'Outlet_Size_High', 'Outlet_Size_Medium', 'Outlet_Size_Small',
       'Outlet_Location_Type_Tier 1', 'Outlet_Location_Type_Tier 2',
       'Outlet_Location_Type_Tier 3', 'Outlet_Type_Grocery Store',
       'Outlet_Type_Supermarket Type1', 'Outlet_Type_Supermarket Type2',
       'Outlet_Type_Supermarket Type3', 'Item_Weight', 'Item_Visibility',
       'Item_MRP'], dtype=object)

In [11]:
# The last step of the pipeline is the fitted LinearRegression; show its
# learned coefficients
fitted_linreg = pipe[-1]
fitted_linreg.coef_

array([ 1.01112944e+12,  1.01112944e+12, -5.25543052e+14, -5.25543052e+14,
       -5.25543052e+14, -5.25543052e+14, -5.25543052e+14, -5.25543052e+14,
       -5.25543052e+14, -5.25543052e+14, -5.25543052e+14, -5.25543052e+14,
       -5.25543052e+14, -5.25543052e+14, -5.25543052e+14, -5.25543052e+14,
       -5.25543052e+14, -5.25543052e+14, -2.45322399e+14, -2.45322399e+14,
       -2.45322399e+14, -2.60780480e+15, -2.60780480e+15, -2.60780480e+15,
        3.14103994e+14,  3.14103994e+14,  3.14103994e+14,  3.14103994e+14,
       -7.55025889e+00, -2.19612136e+01,  9.84193130e+02])

In [12]:
# Pair every coefficient with the name of the feature it belongs to
feature_names = pipe[0].get_feature_names_out()
linreg_coefs = pipe[-1].coef_
# NOTE(review): in the recorded output the one-hot coefficients are ~1e12-1e15
# and identical within each category group. That looks like the dummy-variable
# trap (the encoder keeps every level while the model also fits an intercept)
# -- confirm before interpreting these values.
coeffs = pd.Series(data=linreg_coefs, index=feature_names)
coeffs

Item_Fat_Content_Low Fat           1.011129e+12
Item_Fat_Content_Regular           1.011129e+12
Item_Type_Baking Goods            -5.255431e+14
Item_Type_Breads                  -5.255431e+14
Item_Type_Breakfast               -5.255431e+14
Item_Type_Canned                  -5.255431e+14
Item_Type_Dairy                   -5.255431e+14
Item_Type_Frozen Foods            -5.255431e+14
Item_Type_Fruits and Vegetables   -5.255431e+14
Item_Type_Hard Drinks             -5.255431e+14
Item_Type_Health and Hygiene      -5.255431e+14
Item_Type_Household               -5.255431e+14
Item_Type_Meat                    -5.255431e+14
Item_Type_Others                  -5.255431e+14
Item_Type_Seafood                 -5.255431e+14
Item_Type_Snack Foods             -5.255431e+14
Item_Type_Soft Drinks             -5.255431e+14
Item_Type_Starchy Foods           -5.255431e+14
Outlet_Size_High                  -2.453224e+14
Outlet_Size_Medium                -2.453224e+14
Outlet_Size_Small                 -2.453

In [13]:
# Extract the decision tree's feature importances, labelled by feature name
# NOTE: `tree_pipe` is created and fitted in the "Model 2 - Decision Tree
# Regressor" cell, which appears further down in this notebook. That cell
# must be run before this one, otherwise this cell raises a NameError.
tree_feature_names = tree_pipe[0].get_feature_names_out()
feature_names = tree_feature_names
fitted_tree = tree_pipe[-1]
importances = pd.Series(data=fitted_tree.feature_importances_, index=feature_names)
importances

Item_Fat_Content_Low Fat           0.005371
Item_Fat_Content_Regular           0.002609
Item_Type_Baking Goods             0.003825
Item_Type_Breads                   0.003084
Item_Type_Breakfast                0.002322
Item_Type_Canned                   0.003760
Item_Type_Dairy                    0.006378
Item_Type_Frozen Foods             0.006462
Item_Type_Fruits and Vegetables    0.007398
Item_Type_Hard Drinks              0.002644
Item_Type_Health and Hygiene       0.004196
Item_Type_Household                0.005554
Item_Type_Meat                     0.002009
Item_Type_Others                   0.002091
Item_Type_Seafood                  0.001981
Item_Type_Snack Foods              0.007275
Item_Type_Soft Drinks              0.005513
Item_Type_Starchy Foods            0.002233
Outlet_Size_High                   0.005518
Outlet_Size_Medium                 0.008678
Outlet_Size_Small                  0.009739
Outlet_Location_Type_Tier 1        0.005699
Outlet_Location_Type_Tier 2     

## Model 2 - Decision Tree Regressor


In [8]:
## Make and fit model
# random_state pins the tree's internal randomness so the fitted tree, its
# scores and its feature importances are the same on every run (42 matches
# the seed used for the train/test split).
tree_pipe = make_pipeline(preprocessor, DecisionTreeRegressor(random_state=42))
tree_pipe.fit(X_train, y_train)
## Quick peek at the R^2 value for training and test data
# The previously recorded run scored 1.000 on training vs 0.118 on test data,
# i.e. the default (depth-unlimited) tree overfits heavily.
# Both lines use the same ".3f" spec; the previous " .3f" on the test line
# carried a space sign-flag that printed an extra blank before the number.
print(f"Training R2: {tree_pipe.score(X_train,y_train):.3f}")
print(f"Test R2: {tree_pipe.score(X_test,y_test):.3f}")

Training R2: 1.000
Test R2:  0.118


