<a href="https://colab.research.google.com/github/krheams60/Sales-predictions/blob/main/Project_1_Part_5_Ken_Rheams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, \
OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config
set_config(display='diagram')

In [2]:
df = pd.read_csv('/content/sales_predictions (1).csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
# copy of original df to avoid any manipulations
eda_ml = df.copy()

In [4]:
# Check for missing or erroneous data
eda_ml.isna().sum()


Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [5]:
# check for any duplicates
eda_ml.duplicated().sum()

0

In [6]:
# show summary stats for all numeric columns
eda_ml.describe(include='all')

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
count,8523,7060.0,8523,8523.0,8523,8523.0,8523,8523.0,6113,8523,8523,8523.0
unique,1559,,5,,16,,10,,3,3,4,
top,FDW13,,Low Fat,,Fruits and Vegetables,,OUT027,,Medium,Tier 3,Supermarket Type1,
freq,10,,5089,,1232,,935,,2793,3350,5577,
mean,,12.857645,,0.066132,,140.992782,,1997.831867,,,,2181.288914
std,,4.643456,,0.051598,,62.275067,,8.37176,,,,1706.499616
min,,4.555,,0.0,,31.29,,1985.0,,,,33.29
25%,,8.77375,,0.026989,,93.8265,,1987.0,,,,834.2474
50%,,12.6,,0.053931,,143.0128,,1999.0,,,,1794.331
75%,,16.85,,0.094585,,185.6437,,2004.0,,,,3101.2964


In [7]:
# replace and correct inconsistent categories of data
df.replace(['LF', 'low fat'], ['Low Fat', 'Low Fat'], inplace = True)

In [8]:
# replace and correct inconsistent categories of data
df.replace(['reg'], ['Regular'], inplace = True)

In [9]:
# this is a ordinal variable column
df['Outlet_Size'].value_counts()


Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [10]:
# used ordinal encoding to replace dictionary with numeric
replacement_dictionary = {'Small':0, 'Medium':1, 'High':2}
df['Outlet_Size'].replace(replacement_dictionary, inplace=True)
df['Outlet_Size']


0       1.0
1       1.0
2       1.0
3       NaN
4       2.0
       ... 
8518    2.0
8519    NaN
8520    0.0
8521    1.0
8522    0.0
Name: Outlet_Size, Length: 8523, dtype: float64

In [11]:
# This is to split X and y for features and the target
X = eda_ml.drop(columns=['Item_Outlet_Sales'])
y = eda_ml['Item_Outlet_Sales']


In [12]:
# This splits training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)



In [13]:
# instantiate SimpleImputers with most_frequent and mean strategies
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')
missing_imputer = SimpleImputer(strategy='constant', fill_value='missing')
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)




In [14]:
# instantiate the selectors to for numeric and categorical data types
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [15]:
# select the numeric columns of each type
num_columns = num_selector(X_train)
num_columns = num_selector(X_train)


In [16]:
# create tuples of (imputer, selector) for each datatype then instantiate ColumnTransformer
num_tuple = (mean_imputer, num_selector)
cat_tuple = (freq_imputer, cat_selector)
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder='passthrough')
col_transformer


In [17]:
# fit ColumnTransformer on the training data then transform both the training and testing data
col_transformer.fit(X_train)
X_train_imputed = col_transformer.transform(X_train)
X_test_imputed = col_transformer.transform(X_test)
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)
X_train_imputed.isna().any()



Item_Identifier              False
Item_Weight                  False
Item_Fat_Content             False
Item_Visibility              False
Item_Type                    False
Item_MRP                     False
Outlet_Identifier            False
Outlet_Establishment_Year    False
Outlet_Size                  False
Outlet_Location_Type         False
Outlet_Type                  False
dtype: bool

In [18]:
# returns a list of column names that are objects
cat_selector(X_train)


['Item_Identifier',
 'Item_Fat_Content',
 'Item_Type',
 'Outlet_Identifier',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']

In [19]:
# create a subset of data for only categorical columns
train_cat_data = X_train[cat_selector(X_train)]
test_cat_data = X_test[cat_selector(X_test)]
train_cat_data


Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,NCG06,Low Fat,Household,OUT018,Medium,Tier 3,Supermarket Type2
7510,FDV57,Regular,Snack Foods,OUT018,Medium,Tier 3,Supermarket Type2
5828,FDM27,Regular,Meat,OUT049,Medium,Tier 1,Supermarket Type1
5327,FDG24,Low Fat,Baking Goods,OUT035,Small,Tier 2,Supermarket Type1
4810,FDD05,Low Fat,Frozen Foods,OUT045,,Tier 2,Supermarket Type1
...,...,...,...,...,...,...,...
5734,FDY08,Regular,Fruits and Vegetables,OUT010,,Tier 3,Grocery Store
5191,FDC41,Low Fat,Frozen Foods,OUT017,,Tier 2,Supermarket Type1
5390,NCQ53,Low Fat,Health and Hygiene,OUT045,,Tier 2,Supermarket Type1
860,FDL46,low fat,Snack Foods,OUT017,,Tier 2,Supermarket Type1


In [20]:
# fit the OneHotEncoder on the training data then transform both the training and the testing data
ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')



In [21]:
# Creating scaler object to scale the data and imputer
scaler = StandardScaler()
mean_imputer = SimpleImputer(strategy='mean')



In [22]:
# combines the imputer and the scaler into a pipeline
preprocessing_pipeline = make_pipeline(mean_imputer, scaler)
preprocessing_pipeline

In [23]:
# creates a list of ordinal labels in order of least to most
size_labels = ['Small','Medium','High']
ordered_labels = [size_labels]
ordinal = OrdinalEncoder(categories = ordered_labels)


In [24]:
# Setup the pipelines for the numeric and categorical data
num_pipeline = make_pipeline(mean_imputer, scaler)
ord_pipeline = make_pipeline(freq_imputer, ordinal)
nom_pipeline = make_pipeline(missing_imputer, ohe)

In [25]:
# Create column lists for objects and a number selector
ordinal_cols = ['Outlet_Size']
nominal_cols = ['Item_Identifier', 'Item_Fat_Content', 'Item_Type','Outlet_Identifier','Outlet_Location_Type','Outlet_Type']
num_selector = make_column_selector(dtype_include='number')


In [26]:
# Setup the tuples to pair the processors with the make column selectors
numeric_tuple = (num_pipeline, num_selector)
ordinal_tuple = (ord_pipeline, ordinal_cols)
nominal_tuple = (nom_pipeline, nominal_cols)

In [27]:
# Instantiate the make column transformer
preprocessor = make_column_transformer(ordinal_tuple, 
                                          numeric_tuple, 
                                          nominal_tuple, 
                                          remainder='drop')

In [28]:
# Fit the column transformer on the X_train
preprocessor.fit(X_train)

In [29]:
# Transform the X_train and the X_test
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
X_train_transformed.shape


(6392, 1593)

In [30]:
# Check the results
X_train_transformed[:5]

array([[ 1.        ,  0.81724868, -0.71277507, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  0.5563395 , -1.29105225, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        , -0.13151196,  1.81331864, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        , -1.1692189 , -1.00493112, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.        ,  1.52881915, -0.96548425, ...,  1.        ,
         0.        ,  0.        ]])

In [31]:
np.isnan(X_train_transformed).sum()

0