In [8]:
# Import Required Libraries

# Data manipulation
import pandas as pd
import numpy as np

# For encoding and scaling
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Never truncate the column list when a wide DataFrame is displayed
pd.options.display.max_columns = None

In [9]:
# Load the cleaned training data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so no column ends up with mixed Python types.
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# is how pandas reports a warning at this call (presumably a mixed-type
# DtypeWarning) — confirm it is gone after this change.
df = pd.read_csv("../data/processed_data/train_clean.csv", low_memory=False)

# Check shape and preview data
print(df.shape)
df.head()

  df = pd.read_csv("../data/processed_data/train_clean.csv")


(844392, 27)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Week,Day,CompetitionDuration,Promo2Duration,CustomerBin,CustomerGroup,CompetitionGroup
0,1,4,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,,2015,7,4,31,7.0,0.0,"(536.0, 609.0]",501–1000,1001–2000 m
1,2,4,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,4,31,8.0,5.0,"(609.0, 688.0]",501–1000,501–1000 m
2,3,4,2015-07-31,8314,821,1,1,0,1,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2015,7,4,31,9.0,4.0,"(780.0, 903.0]",501–1000,5000+ m
3,4,4,2015-07-31,13995,1498,1,1,0,1,c,c,620.0,9.0,2009.0,0,0.0,0.0,,2015,7,4,31,6.0,0.0,"(1116.0, 7388.0]",1000+,501–1000 m
4,5,4,2015-07-31,4822,559,1,1,0,1,a,a,29910.0,4.0,2015.0,0,0.0,0.0,,2015,7,4,31,0.0,0.0,"(536.0, 609.0]",501–1000,5000+ m


In [10]:
# Parse the Date column into datetime values (needed for the .dt accessor)
df["Date"] = df["Date"].pipe(pd.to_datetime)

In [11]:
# Derive calendar features from the parsed Date column
date_parts = df["Date"].dt

# Calendar year
df["Year"] = date_parts.year

# Calendar month (captures seasonality)
df["Month"] = date_parts.month

# Day of the month
df["Day"] = date_parts.day

# Day of the week, 0 = Monday ... 6 = Sunday
df["WeekDay"] = date_parts.weekday

# Weekend flag: 1 for Saturday (5) or Sunday (6), otherwise 0
df["IsWeekend"] = (df["WeekDay"] >= 5).astype(int)

In [12]:
# The raw Date column is no longer needed once its parts have been extracted
df = df.drop(columns="Date")

In [13]:
# Columns holding categorical values that are label-encoded further down
categorical_cols = ["StoreType", "Assortment", "StateHoliday", "PromoInterval"]

categorical_cols

['StoreType', 'Assortment', 'StateHoliday', 'PromoInterval']

In [14]:
# Cast every categorical column to str in one assignment so each column holds
# a single Python type before encoding (missing values become the text "nan")
df[categorical_cols] = df[categorical_cols].astype(str)

In [15]:
# Label-encode each categorical column with its OWN encoder.
# Re-fitting one shared LabelEncoder in the loop overwrites its learned classes
# on every iteration, so only the last column's mapping would survive. Keeping
# one fitted encoder per column in `label_encoders` lets each category -> integer
# mapping be inspected (`.classes_`) or re-applied to other data (`.transform`).
label_encoders = {}

for col in categorical_cols:
    # Fresh encoder per column; after the loop `label_encoder` still refers to
    # the encoder fitted on the last column, as it did before this change
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

In [16]:
# Separate the target from the features

# Feature set: every column except Sales
X = df.drop(columns="Sales")

# Target variable: Sales as a flat 1-D NumPy array
y = np.ravel(df["Sales"].to_numpy())

In [17]:
# Feature columns currently in X (every column of df except Sales).
# At this point X still contains the EDA helper columns that are dropped later.

X.columns

Index(['Store', 'DayOfWeek', 'Customers', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Year', 'Month',
       'Week', 'Day', 'CompetitionDuration', 'Promo2Duration', 'CustomerBin',
       'CustomerGroup', 'CompetitionGroup', 'WeekDay', 'IsWeekend'],
      dtype='object')

In [18]:
# Check the dtype of each feature column (object columns are still non-numeric)
X.dtypes

Store                          int64
DayOfWeek                      int64
Customers                      int64
Open                           int64
Promo                          int64
StateHoliday                   int64
SchoolHoliday                  int64
StoreType                      int64
Assortment                     int64
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                  int64
Year                           int32
Month                          int32
Week                           int64
Day                            int32
CompetitionDuration          float64
Promo2Duration               float64
CustomerBin                   object
CustomerGroup                 object
CompetitionGroup              object
WeekDay                        int32
IsWeekend                      int64
dtype: object

In [19]:
# Count missing values per feature column

X.isna().sum()

Store                        0
DayOfWeek                    0
Customers                    0
Open                         0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
Year                         0
Month                        0
Week                         0
Day                          0
CompetitionDuration          0
Promo2Duration               0
CustomerBin                  0
CustomerGroup                0
CompetitionGroup             0
WeekDay                      0
IsWeekend                    0
dtype: int64

In [20]:
# Count feature rows that are exact duplicates of an earlier row

X.duplicated().sum()

np.int64(0)

In [21]:
# Inspect the target vector (y was already built from df["Sales"])

# Target shape: expected to be 1-D with one entry per row of df
y.shape

(844392,)

In [22]:
# Summary statistics of the target (count, mean, std, min, quartiles, max)

pd.Series(y).describe()

count    844392.000000
mean       6955.514291
std        3104.214680
min           0.000000
25%        4859.000000
50%        6369.000000
75%        8360.000000
max       41551.000000
dtype: float64

In [25]:
# Helper columns kept only for EDA; they are not used for modeling
eda_cols_to_drop = ["CustomerBin", "CustomerGroup", "CompetitionGroup"]

# Remove them from the working frame
df = df.drop(columns=eda_cols_to_drop)

In [26]:
# Rebuild the features and target from the current df

# Features: every remaining column except Sales
X = df.drop(columns="Sales")

# Target: Sales as a flat 1-D NumPy array
y = np.ravel(df["Sales"].to_numpy())

In [27]:
# Re-check dtypes of the rebuilt feature set before scaling
X.dtypes

Store                          int64
DayOfWeek                      int64
Customers                      int64
Open                           int64
Promo                          int64
StateHoliday                   int64
SchoolHoliday                  int64
StoreType                      int64
Assortment                     int64
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                  int64
Year                           int32
Month                          int32
Week                           int64
Day                            int32
CompetitionDuration          float64
Promo2Duration               float64
WeekDay                        int32
IsWeekend                      int64
dtype: object

In [28]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fit on the entire feature frame; if a validation
# split is later carved out of the saved features, its statistics are already
# baked into the scaling — confirm this is intended.
scaler = StandardScaler()

# fit_transform returns a bare NumPy array (no column names, no index)
X_scaled = scaler.fit_transform(X)

# Re-wrap as a DataFrame, restoring the column labels AND X's row index so
# X_scaled stays row-aligned with X even when the index is not the default 0..n-1
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [29]:
# Final validation after scaling: preview the first rows of the scaled features

X_scaled.head()

Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Week,Day,CompetitionDuration,Promo2Duration,WeekDay,IsWeekend
0,-1.732571,0.858414,-0.517732,0.0,1.113726,-0.029796,2.041038,0.582814,-0.942988,-0.535816,0.950941,0.682279,-0.997372,-0.757527,-0.997371,0.909585,1.502796,0.347258,0.858414,1.746347,0.633683,-0.622407,0.858414,-0.460344
1,-1.729462,0.858414,-0.343268,0.0,1.113726,-0.029796,2.041038,-0.884146,-0.942988,-0.625549,1.417831,0.68121,1.002635,0.09171,1.00089,-0.877596,1.502796,0.347258,0.858414,1.746347,0.814779,2.387273,0.858414,-0.460344
2,-1.726354,0.858414,0.145233,0.0,1.113726,-0.029796,2.041038,-0.884146,-0.942988,1.112703,1.651277,0.680142,1.002635,0.157036,1.001885,-0.877596,1.502796,0.347258,0.858414,1.746347,0.995875,1.785337,0.858414,-0.460344
3,-1.723246,0.858414,1.832556,0.0,1.113726,-0.029796,2.041038,0.582814,1.070916,-0.619139,0.950941,0.683348,-0.997372,-0.757527,-0.997371,0.909585,1.502796,0.347258,0.858414,1.746347,0.452588,-0.622407,0.858414,-0.460344
4,-1.720138,0.858414,-0.507763,0.0,1.113726,-0.029796,2.041038,-0.884146,-0.942988,3.135536,-0.216285,0.689761,-0.997372,-0.757527,-0.997371,0.909585,1.502796,0.347258,0.858414,1.746347,-0.633987,-0.622407,0.858414,-0.460344


In [30]:
# Shape of the scaled feature matrix (rows, columns)
X_scaled.shape

(844392, 24)

In [32]:
# Persist the scaled feature matrix and the target vector as CSV files
features_path = "../data/processed_data/train_features.csv"
target_path = "../data/processed_data/train_target.csv"

# Feature matrix, written without the index column
X_scaled.to_csv(features_path, index=False)

# Target vector, wrapped in a Series so it can be written with to_csv
pd.Series(y).to_csv(target_path, index=False)

# Completion message plus the shapes of what was saved
print("Notebook 3 completed successfully.")
print("Feature matrix shape:", X_scaled.shape)
print("Target vector shape:", y.shape)

Notebook 3 completed successfully.
Feature matrix shape: (844392, 24)
Target vector shape: (844392,)
