From the merged Master Weather + Store Subcluster + Holiday + Sales data, keep only the rows where Days From Holiday is 0 (let the model learn the ±7 days itself) and remove everything else. Then drop the Days From Holiday column. Also categorise the holidays: race-related celebrations, sultan birthdays, etc.

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import re

# Read the two input tables
df_phclusterweather = pd.read_csv(r" ... csv")
daily_sales = pd.read_csv(r" ... csv")

# ───────────────────────────────────────────────────────────────────────────────
# 1) Preparing Data for Time Series
# ───────────────────────────────────────────────────────────────────────────────
# Keep only the rows whose "Days From Holiday" is 0; once filtered, that column
# is constant, so it is dropped in the same chain.
on_holiday = df_phclusterweather["Days From Holiday"].eq(0.0)
df_holiday_days = df_phclusterweather.loc[on_holiday].drop(columns=["Days From Holiday"])

# Category labels returned by categorize_holiday.
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated by Python (e.g. "Puasa" "Others" -> "PuasaOthers").
PH_categories = [
    "International Celebrations",
    "Race/Religion-related Celebrations",
    "Leader's Birthdays",
    "Regional Independence Holidays",
    "Puasa",
    "Others",
]

# Ordered (pattern, category) rules. The first pattern found in the lower-cased
# holiday name decides the category, so earlier rules take precedence over
# later ones (e.g. a name containing both "prophet" and "birthday" is a
# Leader's Birthday, and "chinese new year" is matched before "new year").
_HOLIDAY_RULES = (
    # Leader's Birthdays (includes Prophet's Birthday and Almarhum Sultan)
    (
        re.compile(r"birthday|coronation|almarhum sultan iskandar"),
        "Leader's Birthdays",
    ),
    # Race/Religion-related Celebrations (merged group)
    (
        re.compile(
            r"chinese|thaipusam|gawai|harvest|deepavali|isra|ramadan|good friday"
            r"|quran|aidilfitri|arafat|haji|muharram|wesak|prophet"
        ),
        "Race/Religion-related Celebrations",
    ),
    # International Celebrations
    (re.compile(r"new year|valentine|labour|christmas"), "International Celebrations"),
    # Region Independence Holidays
    (
        re.compile(r"national day|federal territory|independence"),
        "Regional Independence Holidays",
    ),
    # Puasa
    (re.compile(r"puasa"), "Puasa"),
)


def categorize_holiday(name):
    """Map a holiday name to its category label.

    Matching is case-insensitive and substring-based: the name is lower-cased
    and tested against each rule in ``_HOLIDAY_RULES`` in order; the first
    rule that matches wins.

    Args:
        name: Holiday name. Non-string values (e.g. NaN or None coming from a
            DataFrame column with missing names) are accepted.

    Returns:
        One of "Leader's Birthdays", "Race/Religion-related Celebrations",
        "International Celebrations", "Regional Independence Holidays",
        "Puasa" or "Others". "Others" is returned when no rule matches or
        when ``name`` is not a string.
    """
    # Missing names arrive as float NaN / None, which have no .lower().
    if not isinstance(name, str):
        return "Others"

    name_lower = name.lower()
    for pattern, category in _HOLIDAY_RULES:
        if pattern.search(name_lower):
            return category

    # Others
    return "Others"

# Replace each holiday name with its category label
df_holiday_days["Name"] = df_holiday_days["Name"].apply(categorize_holiday)

# Dropping Puasa duplicates - 520 rows
# The explicit .copy() makes this an independent frame, so the column
# assignments below cannot raise pandas' SettingWithCopyWarning.
df_holiday_nodupe = df_holiday_days.drop_duplicates().copy()

# Puasa column: 1 where "Puasa Count" is non-zero, otherwise 0
df_holiday_nodupe["Puasa"] = (df_holiday_nodupe["Puasa Count"] != 0).astype(int)

# Making sure all rows with the same date have 1 if any row of that date is during Puasa
df_holiday_nodupe["Puasa"] = (
    df_holiday_nodupe.groupby("Date")["Puasa"].transform("max")
)

# Step 1: number of distinct holiday categories for every (Date, Store_No) pair
holiday_counts = df_holiday_nodupe.groupby(["Date", "Store_No"])["Name"].nunique()

# Step 2: the (Date, Store_No) pairs that carry more than one category
multi_holiday_keys = holiday_counts.loc[holiday_counts.gt(1)].index

# Step 3: flag "Puasa" rows whose (Date, Store_No) pair also has another
# holiday, then keep everything that is not flagged
row_keys = pd.MultiIndex.from_frame(df_holiday_nodupe[["Date", "Store_No"]])
mask = df_holiday_nodupe["Name"].eq("Puasa") & row_keys.isin(multi_holiday_keys)
df_cleaned = df_holiday_nodupe.loc[~mask].copy()

### To check which dates still have duplicate rows (rows with 2 different public holidays, neither of which is Puasa)
# # 1. Group by Date and Store_No, count unique holiday Names
# multi_name_counts = df_cleaned.groupby(["Date", "Store_No"])["Name"].nunique()

# # 2. Filter for groups with more than 1 holiday
# multi_holiday_days = multi_name_counts[multi_name_counts > 1]

# # 3. Reset index to view as DataFrame
# multi_holiday_days = multi_holiday_days.reset_index().rename(columns={"Name": "Holiday_Count"})

# unique_dates = sorted(multi_holiday_days["Date"].unique())
# print(unique_dates)

# "Puasa Count" has already been turned into the 0/1 "Puasa" flag
df_cleaned2 = df_cleaned.drop(columns=["Puasa Count"])

# Columns for which the first value of each (Date, Store_No) group is kept
first_value_columns = [
    "Net_Amount",
    "TC",
    "State",
    "Opening_Date",
    "CODE (subcluster 1)",
    "CODE FY26 1 (subcluster 2)",
    "CODE FY26 2 (subcluster 3)",
    "Days_after_Opening",
    "Average Daily Temperature (°C)",
    "Rain?",
    "Puasa",
]

# One row per (Date, Store_No): the distinct holiday categories are joined
# into a single sorted, comma-separated string.
agg_spec = {"Name": lambda names: ", ".join(sorted(set(names)))}
agg_spec.update({column: "first" for column in first_value_columns})
agg_spec["Public Holiday"] = "max"  # since PH values are 0s and 1s

df_group = df_cleaned2.groupby(["Date", "Store_No"]).agg(agg_spec).reset_index()

# Dropping Net_Amount and TC to repopulate them with daily sales data (currently on sales data for PH, need include non-PH sales data too)
df_no_net_no_tc = df_group.drop(columns=["Net_Amount", "TC"])

### Cleaning daily sales data
# Renaming columns so it matches the other merged data
daily_sales.rename(columns={
    "Store No": "Store_No",
    "Net Amount": "Net_Amount"
}, inplace=True)

## Keep only rows where Store_No is purely numeric - V is for vending machines
# Clean first (cast to str, trim whitespace), then filter
daily_sales["Store_No"] = daily_sales["Store_No"].astype(str).str.strip()

# Filter purely numeric store numbers; .copy() so the assignments below act on
# an independent frame
sales_stores = daily_sales[daily_sales["Store_No"].str.fullmatch(r"\d+")].copy()

# Convert to integer after filtering
sales_stores["Store_No"] = sales_stores["Store_No"].astype(int)

# Parse day-first dates in a single pass: unparseable values become NaT and
# normalize() sets the time component to midnight, leaving date-only
# datetime64 values (no string round-trip needed).
sales_stores["Date"] = pd.to_datetime(
    sales_stores["Date"], dayfirst=True, errors="coerce"
).dt.normalize()

# Change columns to desired object types
sales_stores["Net_Amount"] = sales_stores["Net_Amount"].astype(float)
sales_stores["TC"] = sales_stores["TC"].astype(int)

# Keep only dates from 2022-12-25 up to, but not including, 2027-01-01
# (rows with NaT dates fail both comparisons and are dropped here)
sales_stores = sales_stores[
    (sales_stores["Date"] >= "2022-12-25") &
    (sales_stores["Date"] <  "2027-01-01")
]

# Dropping columns we don't need
sales_stores_cleaned = sales_stores.drop(columns=["Date.Month", "Discount", "Sales Day"])

# Cast every column to its target dtype, grouped by kind of conversion
for datetime_col in ("Date", "Opening_Date"):
    df_no_net_no_tc[datetime_col] = pd.to_datetime(df_no_net_no_tc[datetime_col])

for text_col in (
    "Name",
    "State",
    "CODE (subcluster 1)",
    "CODE FY26 1 (subcluster 2)",
    "CODE FY26 2 (subcluster 3)",
    "Rain?",
):
    df_no_net_no_tc[text_col] = df_no_net_no_tc[text_col].astype(str)

for int_col in ("Days_after_Opening", "Public Holiday"):
    df_no_net_no_tc[int_col] = df_no_net_no_tc[int_col].astype(int)

# Merging final dataframe
# The two tables are joined on the store/date keys only. Daily sales is the
# left table so that every sales day is kept, including days that have no row
# in the holiday table.
# NOTE(review): this assumes the goal is "all sales days, annotated with
# holiday info" and that "Store_No" has the same dtype in both tables - confirm.
# NOTE(review): columns that come only from the holiday table (e.g. "Name",
# "State", the subcluster codes, temperature) will be NaN on non-holiday days.
df_final = pd.merge(
    sales_stores_cleaned,
    df_no_net_no_tc,
    on=["Store_No", "Date"],
    how="left",
)

# Sales days without a matching holiday row are ordinary days: set both flags to 0
for flag_col in ("Puasa", "Public Holiday"):
    df_final[flag_col] = df_final[flag_col].fillna(0).astype(int)

df_final.to_excel(r" ... xlsx", index=False)