<a href="https://colab.research.google.com/github/jdjones91/Sales_Predictions/blob/main/Sales_Predictions_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# James Jones
### 09-15-2022

## We will begin pre-processing a dataset to develop a machine learning model that can help predict product sales for a grocery retailer

In [12]:
# Load in necessary libraries

import pandas as pd # To load and manipulate our dataframe
import numpy as np # To check the processed arrays for missing values (np.isnan)
from sklearn.model_selection import train_test_split # To split our data into training and testing sets
from sklearn.compose import make_column_transformer, make_column_selector # To create our transformers
from sklearn.preprocessing import StandardScaler, OneHotEncoder # To scale our numeric data and OneHotEncode our nominal data
from sklearn.pipeline import make_pipeline # To create our pipelines
from sklearn.impute import SimpleImputer # To impute missing values
from sklearn import set_config # To create simple diagrams showing our processing steps
# Render estimators and pipelines as diagrams when they are displayed as cell output
set_config(display='diagram')

In [13]:
# Read the raw sales data into a dataframe and preview the first rows
# NOTE: the path points into Google Drive, so it only resolves when Drive is
# mounted at /content/drive (e.g. in Colab)
DATA_PATH = '/content/drive/MyDrive/Data Sets (CD)/sales_predictions.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


### We will give this dataframe a once-over before moving to pre-processing, just to ensure that it is as clean as we can make it

In [14]:
# Work on an independent copy for machine learning so the original `df` stays untouched
DF = df.copy(deep=True)

In [15]:
# Inspect column dtypes and non-null counts to see what needs cleaning prior to pre-processing

DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [16]:
# Item_Weight and Outlet_Size appear to be missing values. This will be addressed later
  # Let's check for duplicates
DF.duplicated().sum()

0

In [17]:
# No duplicates. We can move on to inconsistent labels in our ORDINAL data (Item_Fat_Content)
DF['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [18]:
# Item_Fat_Content is treated as ordinal because it travels from low -> regular
#   Super market type and location are different, but not necessarily ordered
# "0" is used as our lowest value to stay consistent with Python's 0 indexing
fat_content_codes = {'Low Fat': 0, 'LF': 0, 'low fat': 0,
                     'Regular': 1, 'reg': 1}

# Apply the mapping to Item_Fat_Content only: a DataFrame-wide replace would also
# rewrite any matching label that happened to appear in another column.
# The explicit cast guarantees an integer column regardless of how replace infers
# the result dtype, and fails loudly if an unmapped label is still present.
DF = DF.assign(
    Item_Fat_Content=DF['Item_Fat_Content'].replace(fat_content_codes).astype('int64')
)

In [19]:
# We could have done this one of two ways.
  # 1) Re-name all values to "Low Fat" and "Regular", then replace them with the corresponding number
  # 2) As we did above: set all values corresponding to "Low Fat" to 0, and all corresponding to "Regular" to 1
# Confirm that only the two encoded values remain
DF['Item_Fat_Content'].value_counts()

0    5517
1    3006
Name: Item_Fat_Content, dtype: int64

In [20]:
# Remove the columns we will not feed to the model.
# We keep the "Item_Identifier" and "Outlet_Identifier" columns: they correspond to our
# specific items and specific stores, so the rest of the identifying information is accessory.
# NOTE(review): Item_Identifier is presumably very high-cardinality, which would explain the
# well over a thousand columns produced by one-hot encoding later -- confirm this is intended.
columns_to_drop = [
    'Outlet_Establishment_Year',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Type',
]
DF = DF.drop(columns=columns_to_drop)
DF.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Item_Outlet_Sales
0,FDA15,9.3,0,0.016047,249.8092,OUT049,3735.138
1,DRC01,5.92,1,0.019278,48.2692,OUT018,443.4228
2,FDN15,17.5,0,0.01676,141.618,OUT049,2097.27
3,FDX07,19.2,1,0.0,182.095,OUT010,732.38
4,NCD19,8.93,0,0.0,53.8614,OUT013,994.7052


# Now, we want to identify our Features (denoted by 'X') and our Target (denoted by 'y')
  - Then, we will train/test split our dataset

In [21]:
# Separate the target (what we're trying to predict) from the features (all other columns)
target_column = 'Item_Outlet_Sales'
X = DF.drop(columns=[target_column])
y = DF[target_column]

In [22]:
# Train Test Split separates our data, at random, into a training set and a testing set;
# fixing random_state makes the split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [24]:
X_train.info() # Optional sanity check: only here to demonstrate a successful split

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Item_Identifier    6392 non-null   object 
 1   Item_Weight        5285 non-null   float64
 2   Item_Fat_Content   6392 non-null   int64  
 3   Item_Visibility    6392 non-null   float64
 4   Item_MRP           6392 non-null   float64
 5   Outlet_Identifier  6392 non-null   object 
dtypes: float64(3), int64(1), object(2)
memory usage: 349.6+ KB


# Create a Pre-processing object to prepare our data for machine learning
  - Since we dropped those extra columns, we now only have one remaining column that is missing data, "Item_Weight"
    - The "Item_Weight" column is a float. For this reason we can use "mean" when building our simple imputer 

In [49]:
# Scaler used to standardize the numeric features
scaler = StandardScaler()

# Imputer that fills missing numeric values using the "mean" strategy
mean_imputer = SimpleImputer(strategy="mean")

In [50]:
# Column selectors that route columns to a pipeline based on their dtype
num_selector = make_column_selector(dtype_include="number")
cat_selector = make_column_selector(dtype_include="object")

In [51]:
# Instantiate One Hot Encoder
# handle_unknown='ignore': categories that weren't encountered during the fit are encoded
#   as all zeros instead of throwing an error
# Dense (non-sparse) output returns a regular NumPy array, which is more legible to inspect
try:
    # scikit-learn >= 1.2 renamed the `sparse` keyword to `sparse_output`
    # (the old keyword was removed in a later release)
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [52]:
# Instantiate numeric pipeline: impute missing values first, then scale
num_pipeline = make_pipeline(mean_imputer, scaler)
num_pipeline

In [53]:
# Instantiate categorical pipeline: one-hot encode the categorical columns
cat_pipeline = make_pipeline(ohe)
cat_pipeline

In [54]:
# Create (pipeline, selector) tuples for the column transformer
num_tuple = (num_pipeline, num_selector)
cat_tuple = (cat_pipeline, cat_selector)

In [55]:
# Combine both pipelines into one column transformer (we'll call it 'preprocessor').
# remainder='passthrough' keeps any column that neither selector matched, if any
preprocessor = make_column_transformer(
    num_tuple,
    cat_tuple,
    remainder="passthrough",
)
preprocessor

In [56]:
# Fit transformer on the training data only, so nothing is learned from the test set
preprocessor.fit(X_train)

## Now, we can actually transform our data

In [57]:
# Apply the fitted preprocessor to both the training and the testing features
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [65]:
# Sanity-check the processed training data for any inconsistencies

# X_train first
train_missing = np.isnan(X_train_processed).sum()  # total number of NaNs left in the array
print(train_missing, 'missing values in training data')
print('\n')
print('All data in X_train_processed are', X_train_processed.dtype)  # should all be numeric
print('\n')
print('Shape of data is', X_train_processed.shape)  # column count is indicative of One Hot Encoding
print('\n')
X_train_processed  # display our processed data as a NumPy array

0 missing values in training data


All data in X_train_processed are float64


Shape of data is (6392, 1564)




array([[ 0.81724868, -0.7403206 , -0.71277507, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.5563395 ,  1.35076614, -1.29105225, ...,  0.        ,
         0.        ,  0.        ],
       [-0.13151196,  1.35076614,  1.81331864, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.11373638, -0.7403206 , -0.92052713, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.76600931, -0.7403206 , -0.2277552 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.81724868, -0.7403206 , -0.95867683, ...,  0.        ,
         1.        ,  0.        ]])

In [67]:
# For X_test: run the same checks on the processed testing data
print(np.isnan(X_test_processed).sum().sum(), 'missing values in testing data') # Check for missing values
print('\n')
print('All data in X_test_processed are', X_test_processed.dtype) # Check data type (should all be numeric)
print('\n')
print('Shape of data is', X_test_processed.shape) # Note, the number of columns should match the training data
print('\n')
X_test_processed

0 missing values in training data


All data in X_train_processed are float64


Shape of data is (2131, 1564)




array([[ 0.33100885, -0.7403206 , -0.77664625, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17989246, -0.7403206 ,  0.1003166 , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.37844688,  1.35076614, -0.48299432, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-1.13957013, -0.7403206 ,  1.21832428, ...,  1.        ,
         0.        ,  0.        ],
       [-1.49772727, -0.7403206 , -0.77809567, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.52076098, -0.7403206 , -0.77976293, ...,  0.        ,
         0.        ,  0.        ]])

# Pause, more to come later