## Load Cleaned Data

In [1]:
import pandas as pd
import os

# Relative path of the directory that holds the project's CSV files
DATA_DIR = "../data/"

# Load cleaned dataset
df_train = pd.read_csv(os.path.join(DATA_DIR, "train_cleaned.csv"))

print(" Cleaned dataset loaded!")
# Check data structure. DataFrame.info() writes its report to stdout itself
# and returns None, so it must not be wrapped in print() -- doing so appends
# a stray "None" line after the summary.
df_train.info()


 Cleaned dataset loaded!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128585 entries, 0 to 128584
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   record_ID        128585 non-null  int64  
 1   week             128585 non-null  object 
 2   store_id         128585 non-null  int64  
 3   sku_id           128585 non-null  int64  
 4   total_price      128585 non-null  float64
 5   base_price       128585 non-null  float64
 6   is_featured_sku  128585 non-null  int64  
 7   is_display_sku   128585 non-null  int64  
 8   units_sold       128585 non-null  int64  
 9   year             128585 non-null  int64  
 10  month            128585 non-null  int64  
 11  week_num         128585 non-null  int64  
 12  quarter          128585 non-null  int64  
 13  day_of_week      128585 non-null  int64  
dtypes: float64(2), int64(11), object(1)
memory usage: 13.7+ MB
None


In [2]:
# Collect the names of all object-dtype (text) columns so they can be
# inspected before any numeric processing happens.
categorical_cols = (
    df_train
    .select_dtypes(include=["object"])
    .columns
)
print("Categorical columns:", categorical_cols)


Categorical columns: Index(['week'], dtype='object')


In [3]:
# Remove the raw `week` column, the only object-dtype column in the frame.
# Calendar columns (year, month, week_num, quarter, day_of_week) are already
# present -- presumably derived from `week`; confirm in the cleaning notebook.
#
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError on the already-missing column.
df_train = df_train.drop(columns=["week"], errors="ignore")
print("Dropped 'week' column!")


Dropped 'week' column!


In [4]:
# Re-run the object-dtype scan to confirm that no text columns are left
# once `week` has been removed.
remaining_object_cols = df_train.select_dtypes(include=["object"])
categorical_cols = remaining_object_cols.columns
print("Categorical columns after dropping 'week':", categorical_cols)


Categorical columns after dropping 'week': Index([], dtype='object')


In [5]:
# Check data structure. info() prints its own report and returns None, so
# calling it directly avoids the stray "None" line that print() would add.
df_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128585 entries, 0 to 128584
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   record_ID        128585 non-null  int64  
 1   store_id         128585 non-null  int64  
 2   sku_id           128585 non-null  int64  
 3   total_price      128585 non-null  float64
 4   base_price       128585 non-null  float64
 5   is_featured_sku  128585 non-null  int64  
 6   is_display_sku   128585 non-null  int64  
 7   units_sold       128585 non-null  int64  
 8   year             128585 non-null  int64  
 9   month            128585 non-null  int64  
 10  week_num         128585 non-null  int64  
 11  quarter          128585 non-null  int64  
 12  day_of_week      128585 non-null  int64  
dtypes: float64(2), int64(11)
memory usage: 12.8 MB
None


In [8]:
## Scale Numerical Features
from sklearn.preprocessing import StandardScaler

In [9]:
# Columns that must keep their original values and are therefore NOT scaled:
#   - record_ID, store_id, sku_id are identifiers; standardizing them turns
#     labels into meaningless floats and breaks joins/group-bys on those keys.
#   - units_sold looks like the prediction target (TODO confirm); scaling it
#     here would put model outputs and error metrics in z-score units.
UNSCALED_COLS = ["record_ID", "store_id", "sku_id", "units_sold"]

scaler = StandardScaler()
# `numeric_cols` still lists every numeric column, as before.
numeric_cols = df_train.select_dtypes(include=["number"]).columns
# Only the remaining feature columns are standardized; order is preserved.
scale_cols = [col for col in numeric_cols if col not in UNSCALED_COLS]
df_train[scale_cols] = scaler.fit_transform(df_train[scale_cols])
# NOTE(review): `scaler` is fitted on the training data only and is not saved
# in this section; the same fitted instance is needed to transform test data.


In [10]:
# Write the processed frame to DATA_DIR as a CSV.
# index=False keeps the RangeIndex out of the file.
processed_path = os.path.join(DATA_DIR, "train_processed.csv")
df_train.to_csv(processed_path, index=False)