# Sales Predictions - Linear Regression

## Imports

In [1]:
#Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm

#Machine learning imports
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression

#Set sklearn output to pandas
from sklearn import set_config
set_config(transform_output = 'pandas')

## Custom Functions

In [15]:
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  """Compute MAE, MSE, RMSE and R^2 for a set of predictions.

  Parameters
  ----------
  y_true : array-like
      Observed target values.
  y_pred : array-like
      Predicted target values, aligned with ``y_true``.
  label : str, default ''
      Name shown in the printed header and stored under the 'Label' key.
  verbose : bool, default True
      When truthy, print a formatted report of the four metrics.
  output_dict : bool, default False
      When truthy, return the metrics as a dictionary.

  Returns
  -------
  dict or None
      ``{'Label', 'MAE', 'MSE', 'RMSE', 'R^2'}`` when ``output_dict`` is
      truthy, otherwise ``None``.
  """
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # Derive RMSE from MSE instead of passing `squared=False`: that keyword was
  # deprecated in scikit-learn 1.4 and later removed, so it fails on current
  # releases. Identical for a single-target y.
  # NOTE(review): for multi-output targets this is the root of the averaged
  # MSE, not the average of per-output RMSEs.
  rmse = np.sqrt(mse)
  r_squared = r2_score(y_true, y_pred)

  if verbose:
    # Print result with label and header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")

  if output_dict:
    metrics = {'Label':label, 'MAE':mae,
               'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
    return metrics

In [21]:
def evaluate_regression(reg, X_train_tf, y_train, X_test_tf, y_test, verbose = True,
                        output_frame=False):
  """Report regression metrics for a fitted model on training and test data.

  Parameters
  ----------
  reg : estimator
      Fitted object exposing ``predict`` (a bare model or a full pipeline).
  X_train_tf, X_test_tf : array-like or DataFrame
      Training / test features, in whatever form ``reg.predict`` expects.
  y_train, y_test : array-like
      True target values for the training / test features.
  verbose : bool, default True
      When truthy, print the metric reports for both splits.
  output_frame : bool, default False
      When truthy, return the metrics as a DataFrame.

  Returns
  -------
  pandas.DataFrame or None
      One row per split ('Training Data', 'Test Data'), rounded to three
      decimals, when ``output_frame`` is truthy; otherwise ``None``.
  """
  # Get predictions for training data
  y_train_pred = reg.predict(X_train_tf)

  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  # Blank separator between the two reports; skipped when nothing is printed
  # so that verbose=False is truly silent
  if verbose:
    print()

  # Get predictions for test data
  y_test_pred = reg.predict(X_test_tf)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                    output_dict=output_frame,
                                    label='Test Data')

  # Store results in a dataframe if output_frame is True
  if output_frame:
    results_df = pd.DataFrame([results_train, results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to None to get a cleaner looking result
    results_df.index.name = None
    # Return the dataframe
    return results_df.round(3)

## Load and Inspect Data

In [2]:
#Load the sales dataset from the project's Data folder
df = pd.read_csv(filepath_or_buffer='Data/sales_predictions_2023.csv')
#Show the first rows as a quick sanity check
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
#Summarize column names, non-null counts and dtypes to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


## Clean Data

- The cleaning steps taken here fix inconsistent values in the categorical columns and remove columns that are not needed for modeling. These columns were identified during EDA. Missing values are explored in this section but are not filled until preprocessing.

In [4]:
#Mappings from inconsistent labels to their canonical spelling
fat_content_fixes = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
outlet_size_fixes = {'High': 'Large'}

#Apply the mappings to each affected column
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_fixes)
df['Outlet_Size'] = df['Outlet_Size'].replace(outlet_size_fixes)

#Confirm the remaining labels, separated by blank lines
print(df['Item_Fat_Content'].value_counts())
print('\n')
print(df['Outlet_Size'].value_counts())

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64


Medium    2793
Small     2388
Large      932
Name: Outlet_Size, dtype: int64


In [5]:
#Columns not needed for modeling
unneeded_cols = ['Item_Identifier', 'Outlet_Establishment_Year', 'Item_Weight']
df = df.drop(columns=unneeded_cols)
#Confirm that only the remaining columns are listed
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Fat_Content      8523 non-null   object 
 1   Item_Visibility       8523 non-null   float64
 2   Item_Type             8523 non-null   object 
 3   Item_MRP              8523 non-null   float64
 4   Outlet_Identifier     8523 non-null   object 
 5   Outlet_Size           6113 non-null   object 
 6   Outlet_Location_Type  8523 non-null   object 
 7   Outlet_Type           8523 non-null   object 
 8   Item_Outlet_Sales     8523 non-null   float64
dtypes: float64(3), object(6)
memory usage: 599.4+ KB


In [6]:
#Count missing values in each column
df.isna().sum()

Item_Fat_Content           0
Item_Visibility            0
Item_Type                  0
Item_MRP                   0
Outlet_Identifier          0
Outlet_Size             2410
Outlet_Location_Type       0
Outlet_Type                0
Item_Outlet_Sales          0
dtype: int64

- Now the only column with missing values is `Outlet_Size`. I will fill these nulls with the constant value 'Missing' during the preprocessing stage.

## Preprocessing

In [8]:
#Separate the prediction target from the feature columns
target = 'Item_Outlet_Sales'
y = df[target]
X = df.drop(columns=target)

#Hold out a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#Preview the training features
X_train.head()

Unnamed: 0,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,Low Fat,0.029565,Household,256.4646,OUT018,Medium,Tier 3,Supermarket Type2
7510,Regular,0.0,Snack Foods,179.766,OUT018,Medium,Tier 3,Supermarket Type2
5828,Regular,0.158716,Meat,157.2946,OUT049,Medium,Tier 1,Supermarket Type1
5327,Low Fat,0.014628,Baking Goods,82.325,OUT035,Small,Tier 2,Supermarket Type1
4810,Low Fat,0.016645,Frozen Foods,120.9098,OUT045,,Tier 2,Supermarket Type1


In [10]:
##Numeric branch of the preprocessor
#Select every numeric feature column
num_cols = X_train.select_dtypes(include='number').columns

#The only numeric step is standardization
scaler = StandardScaler()
num_pipe = make_pipeline(scaler)

#(name, transformer, columns) entry for the column transformer
num_tuple = ('Numeric', num_pipe, num_cols)

In [11]:
##Nominal (one-hot) branch of the preprocessor
#All object columns except Outlet_Size, which is encoded ordinally instead
cat_cols = (X_train
            .drop(columns='Outlet_Size')
            .select_dtypes(include='object')
            .columns)

#Dense one-hot output; categories unseen during fit are ignored at transform time
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
cat_pipe = make_pipeline(ohe)

#(name, transformer, columns) entry for the column transformer
cat_tuple = ('Categorical', cat_pipe, cat_cols)

In [12]:
##Ordinal branch of the preprocessor
#Only Outlet_Size is treated as ordered
ord_cols = ['Outlet_Size']

#Category order, lowest to highest; the imputed 'Missing' label comes first
size_order = ['Missing', 'Small', 'Medium', 'Large']
#OrdinalEncoder expects one list of categories per encoded column
ordinal_order = [size_order]

#Steps: fill nulls with 'Missing', encode by rank, then standardize
impute_missing = SimpleImputer(fill_value='Missing', strategy='constant')
ord_encoder = OrdinalEncoder(categories=ordinal_order)
scaler = StandardScaler()
ord_pipe = make_pipeline(impute_missing, ord_encoder, scaler)

#(name, transformer, columns) entry for the column transformer
ord_tuple = ('Ordinal', ord_pipe, ord_cols)

In [13]:
#Combine the three branches; output columns keep their names without branch prefixes
branches = [num_tuple, cat_tuple, ord_tuple]
preprocessor = ColumnTransformer(transformers=branches,
                                 verbose_feature_names_out=False)

#Display the assembled preprocessor
preprocessor

In [18]:
#Fit on the training split only, then apply the fitted preprocessor to both splits
X_train_tf = preprocessor.fit(X_train).transform(X_train)
X_test_tf = preprocessor.transform(X_test)

#Preview the transformed training features
X_train_tf.head()

Unnamed: 0,Item_Visibility,Item_MRP,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,...,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Outlet_Size
4776,-0.712775,1.828109,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.748125
7510,-1.291052,0.603369,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.748125
5828,1.813319,0.244541,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.748125
5327,-1.004931,-0.952591,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.26437
4810,-0.965484,-0.33646,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-1.276865


## Default Linear Regression Model

In [17]:
#Linear regression with default settings
lin_reg = LinearRegression()

#Chain preprocessing and the model so raw features can be passed directly
lin_reg_pipe = make_pipeline(preprocessor, lin_reg)
lin_reg_pipe.fit(X_train, y_train)

#Report training and test metrics using the raw (untransformed) features
evaluate_regression(lin_reg_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 847.114
- MSE = 1,297,613.079
- RMSE = 1,139.128
- R^2 = 0.562

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 803.999
- MSE = 1,194,053.573
- RMSE = 1,092.728
- R^2 = 0.567


In [23]:
#Second linear regression, trained directly on the already-transformed features
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_train_tf, y_train)

#Print the metrics and also return them as a DataFrame
evaluate_regression(lin_reg_2, X_train_tf, y_train, X_test_tf, y_test,
                    output_frame=True)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 847.114
- MSE = 1,297,613.079
- RMSE = 1,139.128
- R^2 = 0.562

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 803.999
- MSE = 1,194,053.573
- RMSE = 1,092.728
- R^2 = 0.567


Unnamed: 0,MAE,MSE,RMSE,R^2
Training Data,847.114,1297613.079,1139.128,0.562
Test Data,803.999,1194053.573,1092.728,0.567


## Tuned Hyperparameters

## Check Assumptions

### Linearity

### Multicollinearity

### Normality

### Homoscedasticity