<a href="https://colab.research.google.com/github/israa252/Prediction-of-Product-Sales/blob/main/Project_1_Part_5_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Project 1 - Part 5

 - By: Israa Rasheed

#Import libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn import set_config

# Return pandas DataFrames instead of numpy arrays
set_config(transform_output='pandas')


#Load data

In [5]:
# Load the raw dataset (unmodified)
path = "path/to/sales_dataset.csv"
df = pd.read_csv("/content/drive/MyDrive/AXSOSACADEMY/01-Fundamentals/Week02/Data/sales_predictions_2023.csv")

# Quick inspection
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


#Clean before split

In [3]:
# Remove duplicates
df = df.drop_duplicates()

# Fix inconsistent categories for Item_Fat_Content
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular'
})

# Drop Item_Identifier (too many unique IDs)
df = df.drop(columns=['Item_Identifier'])


# Explanation:
 We standardized the categories in "Item_Fat_Content" because the same category
 was written in different ways (e.g., "LF", "low fat", "Low Fat").
 Without fixing this, the OneHotEncoder would treat them as separate categories.
 We dropped "Item_Identifier" because it is a unique product ID with very high
 cardinality. It does not provide useful predictive information but would create
 many unnecessary dummy columns after encoding.

#Separate features (X) and target (y)

In [6]:
target = "Item_Outlet_Sales"
y = df[target]
X = df.drop(columns=[target])


#Train-test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


#Identify column types

In [8]:
num_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']
cat_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
            'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


#Build preprocessing pipelines

In [9]:
# Numeric pipeline
num_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

# Categorical pipeline
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
)


#Create ColumnTransformer

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", num_pipe, num_cols),
        ("categorical", cat_pipe, cat_cols)
    ],
    verbose_feature_names_out=False
)


#Fit and transform

In [11]:
preprocessor.fit(X_train)

X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

X_train_processed.head()


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Fat_Content_LF,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Fat_Content_low fat,Item_Fat_Content_reg,Item_Type_Baking Goods,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
549,-0.801383,-0.600703,0.470709,0.136169,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
7757,1.210152,-0.362159,0.457877,0.493521,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
764,1.115491,0.194933,-0.482625,-0.102066,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
6867,-1.079448,-0.704944,-1.603553,0.493521,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2716,-0.008602,1.383177,0.218375,-0.102066,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


#Validate processed data

In [12]:
# Ensure all columns numeric
print(all(X_train_processed.dtypes == "float64"))

# Confirm scaling worked on numeric features
X_train_processed[num_cols].describe().round(2)


True


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year
count,6818.0,6818.0,6818.0,6818.0
mean,-0.0,-0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0
min,-1.97,-1.29,-1.77,-1.53
25%,-0.83,-0.76,-0.76,-1.29
50%,0.0,-0.23,0.04,0.14
75%,0.76,0.56,0.72,0.73
max,2.0,5.1,2.0,1.33
