<a href="https://colab.research.google.com/github/israa252/Prediction-of-Product-Sales/blob/main/Project_1_Part_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Project 1 - Part 5

 - By: Israa Rasheed

# Import libraries

In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn import set_config

# Make every scikit-learn transformer hand back a pandas DataFrame
# (keeps column names) rather than a bare numpy array.
set_config(transform_output="pandas")


# Load data

In [20]:
# Load the raw dataset (unmodified).
# The location is kept in a single variable so it is the only thing to edit
# when the notebook is run somewhere else.
# NOTE(review): this is a Google Drive path under /content/drive, so it
# presumably requires Drive to be mounted in Colab first — confirm.
path = "/content/drive/MyDrive/AXSOSACADEMY/01-Fundamentals/Week02/Data/sales_predictions_2023.csv"
df = pd.read_csv(path)

# Quick inspection: column dtypes / non-null counts, then the first rows
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Clean before split

In [21]:
# Remove exact duplicate rows
df = df.drop_duplicates()

# Fix inconsistent categories for Item_Fat_Content: map every alternate
# spelling onto one canonical label so each category is encoded once.
fat_content_aliases = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_aliases)

# Drop Item_Identifier (too many unique IDs).
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Item_Identifier'], errors='ignore')


# Explanation:
 We standardized the categories in "Item_Fat_Content" because the same category
 was written in different ways (e.g., "LF", "low fat", "Low Fat").
 Without fixing this, the OneHotEncoder would treat them as separate categories.
 We dropped "Item_Identifier" because it is a product ID with very high
 cardinality. It does not provide useful predictive information but would create
 many unnecessary dummy columns after encoding.

# Separate features (X) and target (y)

In [22]:
# Name of the column the models will learn to predict
target = "Item_Outlet_Sales"

# Features are every column except the target; the target is kept as a Series
X = df.drop(columns=[target])
y = df[target]


# Train-test split

In [23]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Identify column types

In [24]:
# Columns routed through the numeric pipeline
num_cols = [
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Outlet_Establishment_Year',
]

# Columns routed through the categorical pipeline
cat_cols = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]


# Build preprocessing pipelines

In [25]:
# Numeric pipeline: fill missing values with the column mean, then standardize
mean_imputer = SimpleImputer(strategy="mean")
num_pipe = make_pipeline(mean_imputer, StandardScaler())

# Categorical pipeline: fill missing values with the most frequent label,
# then one-hot encode into a dense matrix. Categories not seen during fit
# are ignored (handle_unknown="ignore") rather than raising.
mode_imputer = SimpleImputer(strategy="most_frequent")
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
cat_pipe = make_pipeline(mode_imputer, one_hot)


# Create ColumnTransformer

In [26]:
# Each entry is (label, pipeline, columns that pipeline applies to)
column_specs = [
    ("numeric", num_pipe, num_cols),
    ("categorical", cat_pipe, cat_cols),
]

# verbose_feature_names_out=False keeps output column names free of the
# "numeric__" / "categorical__" prefixes.
preprocessor = ColumnTransformer(transformers=column_specs, verbose_feature_names_out=False)


# Fit and transform

In [27]:
# Learn imputation values, scaling statistics and categories from the
# training rows only, so nothing from the test set leaks into preprocessing.
preprocessor.fit(X_train)

# Apply the already-fitted preprocessor to both splits
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

X_train_processed.head()


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
549,-0.801383,-0.600703,0.470709,0.136169,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
7757,1.210152,-0.362159,0.457877,0.493521,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
764,1.115491,0.194933,-0.482625,-0.102066,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
6867,-1.079448,-0.704944,-1.603553,0.493521,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2716,-0.008602,1.383177,0.218375,-0.102066,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [28]:

# List the feature names produced by the preprocessor
processed_feature_names = X_train_processed.columns
print(processed_feature_names)

Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Item_Fat_Content_Low Fat',
       'Item_Fat_Content_Regular', 'Item_Type_Baking Goods',
       'Item_Type_Breads', 'Item_Type_Breakfast', 'Item_Type_Canned',
       'Item_Type_Dairy', 'Item_Type_Frozen Foods',
       'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks',
       'Item_Type_Health and Hygiene', 'Item_Type_Household', 'Item_Type_Meat',
       'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods',
       'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods',
       'Outlet_Identifier_OUT010', 'Outlet_Identifier_OUT013',
       'Outlet_Identifier_OUT017', 'Outlet_Identifier_OUT018',
       'Outlet_Identifier_OUT019', 'Outlet_Identifier_OUT027',
       'Outlet_Identifier_OUT035', 'Outlet_Identifier_OUT045',
       'Outlet_Identifier_OUT046', 'Outlet_Identifier_OUT049',
       'Outlet_Size_High', 'Outlet_Size_Medium', 'Outlet_Size_Small',
       'Outlet_Location_Type_Tier 1'

In [29]:
# Total NaN count across every processed training column
missing_total = X_train_processed.isna().sum().sum()
print("Number of missing values:", missing_total)

Number of missing values: 0


# Validate processed data

In [30]:
# Every processed column should be float64 (prints True when that holds)
is_float64 = X_train_processed.dtypes == "float64"
print(all(is_float64))

# Summary statistics of the scaled numeric features, rounded for readability
scaled_numeric = X_train_processed[num_cols]
scaled_numeric.describe().round(2)


True


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year
count,6818.0,6818.0,6818.0,6818.0
mean,-0.0,-0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0
min,-1.97,-1.29,-1.77,-1.53
25%,-0.83,-0.76,-0.76,-1.29
50%,0.0,-0.23,0.04,0.14
75%,0.76,0.56,0.72,0.73
max,2.0,5.1,2.0,1.33


# Modeling

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


# Evaluation function

In [37]:

def evaluate_model(model, X_train, y_train, X_test, y_test, name="Model"):
    """Print and return regression metrics for an already-fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator; only its ``predict`` method is called.
    X_train, y_train
        Features and true target values of the training split.
    X_test, y_test
        Features and true target values of the test split.
    name : str, default "Model"
        Label printed above the metrics.

    Returns
    -------
    dict
        Keys ``R2_Train``, ``R2_Test``, ``RMSE_Train``, ``RMSE_Test`` and
        ``MAE_Test`` mapped to the corresponding metric values.
    """
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)
    # RMSE is the square root of the mean squared error
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mae_test = mean_absolute_error(y_test, y_test_pred)

    # The superscript two is written as an escape (\u00b2) so the label cannot
    # be corrupted into "RÂ²" when the file is read with the wrong encoding.
    print(f"\n {name}")
    print(f"Train -> R\u00b2: {r2_train:.3f}, RMSE: {rmse_train:.2f}")
    print(f"Test  -> R\u00b2: {r2_test:.3f}, RMSE: {rmse_test:.2f}, MAE: {mae_test:.2f}")

    return {
        "R2_Train": r2_train, "R2_Test": r2_test,
        "RMSE_Train": rmse_train, "RMSE_Test": rmse_test,
        "MAE_Test": mae_test
    }



# Linear Regression

In [35]:
# Baseline: ordinary least squares on the preprocessed training features
lin_reg = LinearRegression().fit(X_train_processed, y_train)

lin_results = evaluate_model(
    lin_reg,
    X_train_processed, y_train,
    X_test_processed, y_test,
    "Linear Regression",
)


📊 Linear Regression
Train -> R²: 0.559, RMSE: 1141.53
Test  -> R²: 0.579, RMSE: 1069.36, MAE: 792.02


# Random Forest (Default)

In [36]:
# Random forest with library-default hyperparameters; seed fixed for repeatability
rf_default = RandomForestRegressor(random_state=42).fit(X_train_processed, y_train)

rf_default_results = evaluate_model(
    rf_default,
    X_train_processed, y_train,
    X_test_processed, y_test,
    "Random Forest (Default)",
)



📊 Random Forest (Default)
Train -> R²: 0.937, RMSE: 430.05
Test  -> R²: 0.568, RMSE: 1083.22, MAE: 757.00


# Tuned Random Forest

In [38]:
# Candidate hyperparameters: 2 forest sizes x 3 depth limits = 6 combinations
param_grid = {"n_estimators": [100, 200], "max_depth": [None, 10, 20]}

rf = RandomForestRegressor(random_state=42)

# 3-fold cross-validated search scored by R², using all available CPU cores
grid = GridSearchCV(estimator=rf, param_grid=param_grid, scoring="r2", cv=3, n_jobs=-1)
grid.fit(X_train_processed, y_train)

print("Best Parameters:", grid.best_params_)

# best_estimator_ is the winning configuration as exposed by the fitted search
rf_tuned = grid.best_estimator_
rf_tuned_results = evaluate_model(
    rf_tuned,
    X_train_processed, y_train,
    X_test_processed, y_test,
    "Random Forest (Tuned)",
)

Best Parameters: {'max_depth': 10, 'n_estimators': 200}

 Random Forest (Tuned)
Train -> R²: 0.716, RMSE: 915.85
Test  -> R²: 0.602, RMSE: 1040.57, MAE: 728.20


In [40]:
# Print each model's metrics dictionary, one line per model
labelled_results = (
    ("Linear Regression", lin_results),
    ("Random Forest (Default)", rf_default_results),
    ("Random Forest (Tuned)", rf_tuned_results),
)
for model_label, metrics in labelled_results:
    print(model_label + ":", metrics)

Linear Regression: {'R2_Train': 0.5594752325556047, 'R2_Test': 0.579266411441941, 'RMSE_Train': np.float64(1141.5315993248257), 'RMSE_Test': np.float64(1069.3649048482189), 'MAE_Test': 792.0247141085217}
Random Forest (Default): {'R2_Train': 0.9374767741359172, 'R2_Test': 0.5682937068552985, 'RMSE_Train': np.float64(430.05446517418375), 'RMSE_Test': np.float64(1083.2196371638406), 'MAE_Test': 756.995692060997}
Random Forest (Tuned): {'R2_Train': 0.716444114327162, 'R2_Test': 0.6016184642637891, 'RMSE_Train': np.float64(915.8452677741735), 'RMSE_Test': np.float64(1040.5715204750807), 'MAE_Test': 728.1994161379126}


## Model Recommendation

After testing several models, I recommend using the **Tuned Random Forest**.  

### Why?
- It achieves the **best test performance** among all models.  
- The gap between training (R² = 0.716) and testing (R² = 0.602) is moderate, and far smaller than the default Random Forest's gap (0.937 vs. 0.568), meaning the model is **not severely overfitting**.  
- Test R² = 0.602 means the model explains about **60% of the variation in sales**. In simple terms: roughly 60% of the ups and downs in sales across items and outlets are captured by the model, while the remaining 40% comes from factors it does not account for.  

### Additional Metric
We also evaluate the model using **Root Mean Squared Error (RMSE)**, which measures the average prediction error in the same units as sales.  
- The RMSE ≈ **1040** → our predictions are typically off by about **1,040**, measured in the same units as `Item_Outlet_Sales`.  
- RMSE is chosen because it is intuitive: it shows how far our predictions deviate from actual sales in real-world numbers.  

### Overfitting/Underfitting Check
- Training R² = 0.716  
- Testing R² = 0.602  
- The difference is moderate, which shows the model generalizes well and does not memorize the training data.  

**Final Decision:** Implement the **Tuned Random Forest** as it offers the best balance between accuracy and generalization.  
