# Feature Engineering & Modeling Preparation

In [1]:
# Import the libraries used in this notebook

import pandas as pd
import numpy as np

In [2]:
# Read the cleaned sales CSV from the processed-data folder into a DataFrame

df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_sales.csv")

In [3]:
# Print a summary of the loaded frame: column names, non-null counts, dtypes
# and memory usage. Used here as a quick sanity check before any transforms.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185916 entries, 0 to 185915
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Order ID          185916 non-null  int64  
 1   Product           185916 non-null  object 
 2   Quantity Ordered  185916 non-null  float64
 3   Price Each        185916 non-null  float64
 4   Order Date        185916 non-null  object 
 5   Purchase Address  185916 non-null  object 
 6   Sales             185916 non-null  float64
 7   Year              185916 non-null  int64  
 8   Month             185916 non-null  int64  
 9   Month_Name        185916 non-null  object 
 10  Day               185916 non-null  int64  
 11  Weekday           185916 non-null  object 
 12  Hour              185916 non-null  int64  
 13  City              185916 non-null  object 
 14  State             185916 non-null  object 
dtypes: float64(3), int64(5), object(7)
memory usage: 21.3+ MB


### Problem Framing for Modeling

**2.1 Business Goal**

The goal of this model is to forecast future monthly sales so the business can plan inventory, staffing, and marketing campaigns more effectively.

**2.2 Why This Model Matters**

Sales forecasting is a vital strategic tool that transforms historical data into actionable insights, allowing a business to transition from reactive to proactive management. By anticipating market fluctuations, forecasting supports critical business decisions such as optimizing inventory to prevent stockouts, aligning staffing levels with predicted demand, and timing marketing campaigns for maximum ROI.

Success for this model is defined as reasonable accuracy — typically a forecast error within 10 to 20 percent over a 3-month horizon — so that the forecast can serve as a reliable single source of truth that enables leadership to manage cash flow and resources with confidence.

### Time-Based Aggregation

Here we convert the order date to a datetime type, sort the orders chronologically, and aggregate sales and quantity to one row per product per month.

In [4]:
# Parse 'Order Date' into datetime values, then order the rows chronologically

df = (
    df.assign(**{'Order Date': pd.to_datetime(df['Order Date'])})
      .sort_values(by='Order Date')
)


In [5]:
# Aggregate order lines into one row per product per calendar month.
#
# The month key is attached to a temporary working frame via .assign() instead
# of being written into df['Month'], so the integer Month column that was
# loaded from the CSV is left untouched in df.
# Each order date is truncated to its month (Period 'M') and converted back to
# a timestamp, which yields the first day of that month.

month_start = df['Order Date'].dt.to_period('M').dt.to_timestamp()

product_monthly = (
    df[['Product', 'Sales', 'Quantity Ordered']]
    .assign(Month=month_start)
    .groupby(['Product', 'Month'])
    .agg({'Sales': 'sum', 'Quantity Ordered': 'sum'})
    .reset_index()
)

product_monthly.head()

Unnamed: 0,Product,Month,Sales,Quantity Ordered
0,20in Monitor,2019-01-01,23647.85,215.0
1,20in Monitor,2019-02-01,27057.54,246.0
2,20in Monitor,2019-03-01,35856.74,326.0
3,20in Monitor,2019-04-01,43226.07,393.0
4,20in Monitor,2019-05-01,37506.59,341.0


### Feature Engineering

In [6]:
# Calendar features derived from the month timestamp:
# month number (1-12) and quarter (1-4)

product_monthly = product_monthly.assign(
    Month_Num=lambda frame: frame['Month'].dt.month,
    Quarter=lambda frame: frame['Month'].dt.quarter,
)


In [7]:
# Lag and rolling features, computed independently for each product.
#
# All three features use only months strictly before the current row, so the
# target month's own Sales never leaks into its features.

# Rows must be in chronological order within each product for shift/rolling
# to mean "previous month(s)".
product_monthly = product_monthly.sort_values(['Product', 'Month'])

sales_by_product = product_monthly.groupby('Product')['Sales']

# Sales of the previous month and of two months ago, within the same product.
product_monthly['Lag_1'] = sales_by_product.shift(1)
product_monthly['Lag_2'] = sales_by_product.shift(2)

# Mean Sales of the three preceding months. Both the shift and the rolling
# window run inside transform(), i.e. per product, so a window can never span
# two different products. transform() also returns a Series carrying the
# frame's own index, so the assignment aligns by label whatever the index is.
product_monthly['Rolling_3'] = sales_by_product.transform(
    lambda sales: sales.shift(1).rolling(3).mean()
)

# Drop each product's leading months, which lack a full lag/rolling history.
# Restricting to the engineered columns keeps the intent explicit.
product_monthly = product_monthly.dropna(subset=['Lag_1', 'Lag_2', 'Rolling_3'])
product_monthly.head()

Unnamed: 0,Product,Month,Sales,Quantity Ordered,Month_Num,Quarter,Lag_1,Lag_2,Rolling_3
3,20in Monitor,2019-04-01,43226.07,393.0,4,2,35856.74,27057.54,28854.043333
4,20in Monitor,2019-05-01,37506.59,341.0,5,2,43226.07,35856.74,35380.116667
5,20in Monitor,2019-06-01,35416.78,322.0,6,2,37506.59,43226.07,38863.133333
6,20in Monitor,2019-07-01,35966.73,327.0,7,3,35416.78,37506.59,38716.48
7,20in Monitor,2019-08-01,28707.39,261.0,8,3,35966.73,35416.78,36296.7


In [8]:
# Write the feature-engineered table to the processed-data folder as CSV,
# leaving the DataFrame index out of the file

product_monthly.to_csv(
    path_or_buf="../data/processed/feature_engineered_sales.csv",
    index=False,
)