To create PH data - final.

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re

################# 2023 Data
# 1. Parse the saved HTML page
html_file = "Holidays and Observances in Malaysia in 2023.html"   # adjust path if needed
with open(html_file, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

# 2. Keep only the <tr class="showrow"> rows inside the body of table#holidays-table
rows = soup.select("table#holidays-table tbody tr.showrow")

# 3. Turn every row that has at least five cells into a record.
#    Cell layout assumed by the indices below: Date | (blank) | Name | Type | Details
row_cells = (row.find_all(["th", "td"]) for row in rows)
records = [
    {
        "Date":    cells[0].get_text(strip=True),
        "Name":    cells[2].get_text(strip=True),
        "Type":    cells[3].get_text(strip=True),
        "Details": cells[4].get_text(" ", strip=True),   # join nested text with spaces
    }
    for cells in row_cells
    if len(cells) >= 5
]

# 4. Build a DataFrame and take a quick look at it
df = pd.DataFrame(records, columns=["Date", "Name", "Type", "Details"])
print(df.head())
print(len(df), "rows")    # original author's note: 63 rows expected for 2023

# NOTE(review): the scraped frame above is only printed. `df` is rebound to the
# CSV below, and nothing visible in this notebook writes that CSV from the
# scrape -- confirm how "malaysia_holidays_2023.csv" is produced.
df = pd.read_csv("malaysia_holidays_2023.csv")

# --- 1. Convert 'Date' to dd/mm/yyyy format ----------------------------------
def convert_date(date_str, year=2023):
    """Convert a ``"<day> <abbreviated month>"`` string to ``dd/mm/yyyy``.

    The year is appended before parsing, so the day/month combination is
    validated against that specific calendar year.

    Args:
        date_str: Day and abbreviated month name, e.g. ``"1 Jan"``.
        year: Calendar year to attach to the date (defaults to 2023).

    Returns:
        The date formatted as ``dd/mm/yyyy``. When ``date_str`` is not a
        string or does not match the ``"%d %b"`` layout it is returned
        unchanged, so unparseable cells stay visible in the output.
    """
    try:
        # TypeError: non-string cell (e.g. NaN); ValueError: layout mismatch.
        # NOTE: "%b" follows the current locale's month abbreviations.
        dt = datetime.strptime(date_str + f" {year}", "%d %b %Y")
    except (ValueError, TypeError):
        return date_str  # leave as-is if parse fails
    return dt.strftime("%d/%m/%Y")

# Rewrite every "Date" cell element-wise; cells that fail to parse are kept as-is
df["Date"] = df["Date"].map(convert_date)

# --- 2. Replace state codes with full names ----------------------------------
# Three-letter codes as they appear in the "Details" text, mapped to full names
state_map = {
    "JHR": "Johor", "KDH": "Kedah", "KTN": "Kelantan", "MLK": "Melaka", "NSN": "Negeri Sembilan",
    "PHG": "Pahang", "PRK": "Perak", "PLS": "Perlis", "PNG": "Penang", "SGR": "Selangor",
    "TRG": "Terengganu", "SBH": "Sabah", "SWK": "Sarawak", "KUL": "Kuala Lumpur",
    "LBN": "Labuan", "PJY": "Putrajaya"
}

# One pattern that matches any code as a whole word. It is compiled once here
# rather than once per code for every row.
_state_code_pattern = re.compile(
    r"\b(?:" + "|".join(re.escape(code) for code in state_map) + r")\b"
)


def replace_codes(text):
    """Expand state codes in ``text`` to their full names.

    Args:
        text: A "Details" cell; may be missing (NaN/None).

    Returns:
        ``text`` with every whole-word code from ``state_map`` replaced by
        its full name, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    return _state_code_pattern.sub(lambda match: state_map[match.group(0)], text)

df["Details"] = df["Details"].map(replace_codes)

# --- 3. Count empty/missing "Details" values ---------------------------------
# replace_codes returns "" for missing cells, so one comparison on the stripped
# text covers both empty and missing entries.
missing_details_count = (df["Details"].map(str.strip) == "").sum()
print("Number of empty 'Details' entries:", missing_details_count)

# Save the cleaned table: the combining step further down reads this file, and
# the 2026 cell already writes its counterpart.
df.to_csv("malaysia_holidays_2023_cleaned.csv", index=False)

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re

# --- 1. Parse the saved HTML page --------------------------------------------
html_file = "Holidays and Observances in Malaysia in 2024.html"   # adjust path if needed
with open(html_file, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

# --- 2. Keep only the <tr class="showrow"> rows of table#holidays-table -------
rows = soup.select("table#holidays-table tbody tr.showrow")

# --- 3. Turn every row that has at least five cells into a record -------------
#     Cell layout assumed by the indices below: Date | (blank) | Name | Type | Details
row_cells = (row.find_all(["th", "td"]) for row in rows)
records = [
    {
        "Date":    cells[0].get_text(strip=True),
        "Name":    cells[2].get_text(strip=True),
        "Type":    cells[3].get_text(strip=True),
        "Details": cells[4].get_text(" ", strip=True),   # join nested text with spaces
    }
    for cells in row_cells
    if len(cells) >= 5
]

# --- 4. Build a DataFrame and take a quick look at it --------------------------
df = pd.DataFrame(records, columns=["Date", "Name", "Type", "Details"])
print(df.head())
print(len(df), "rows")    # number of rows scraped for 2024

# NOTE(review): the scraped frame above is only printed. `df` is rebound to the
# CSV below, and nothing visible in this notebook writes that CSV from the
# scrape -- confirm how "malaysia_holidays_2024.csv" is produced.
df = pd.read_csv("malaysia_holidays_2024.csv")

# --- 1. Convert 'Date' to dd/mm/yyyy format ----------------------------------
def convert_date(date_str, year=2024):
    """Convert a ``"<day> <abbreviated month>"`` string to ``dd/mm/yyyy``.

    The year is appended before parsing, so the day/month combination is
    validated against that specific calendar year (e.g. ``"29 Feb"`` is
    valid for 2024).

    Args:
        date_str: Day and abbreviated month name, e.g. ``"1 Jan"``.
        year: Calendar year to attach to the date (defaults to 2024).

    Returns:
        The date formatted as ``dd/mm/yyyy``. When ``date_str`` is not a
        string or does not match the ``"%d %b"`` layout it is returned
        unchanged, so unparseable cells stay visible in the output.
    """
    try:
        # TypeError: non-string cell (e.g. NaN); ValueError: layout mismatch.
        # NOTE: "%b" follows the current locale's month abbreviations.
        dt = datetime.strptime(date_str + f" {year}", "%d %b %Y")
    except (ValueError, TypeError):
        return date_str  # leave as-is if parse fails
    return dt.strftime("%d/%m/%Y")

# Rewrite every "Date" cell element-wise; cells that fail to parse are kept as-is
df["Date"] = df["Date"].map(convert_date)

# --- 2. Replace state codes with full names ----------------------------------
# Three-letter codes as they appear in the "Details" text, mapped to full names
state_map = {
    "JHR": "Johor", "KDH": "Kedah", "KTN": "Kelantan", "MLK": "Melaka", "NSN": "Negeri Sembilan",
    "PHG": "Pahang", "PRK": "Perak", "PLS": "Perlis", "PNG": "Penang", "SGR": "Selangor",
    "TRG": "Terengganu", "SBH": "Sabah", "SWK": "Sarawak", "KUL": "Kuala Lumpur",
    "LBN": "Labuan", "PJY": "Putrajaya"
}

# One pattern that matches any code as a whole word. It is compiled once here
# rather than once per code for every row.
_state_code_pattern = re.compile(
    r"\b(?:" + "|".join(re.escape(code) for code in state_map) + r")\b"
)


def replace_codes(text):
    """Expand state codes in ``text`` to their full names.

    Args:
        text: A "Details" cell; may be missing (NaN/None).

    Returns:
        ``text`` with every whole-word code from ``state_map`` replaced by
        its full name, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    return _state_code_pattern.sub(lambda match: state_map[match.group(0)], text)

df["Details"] = df["Details"].map(replace_codes)

# --- 3. Count empty/missing "Details" values ---------------------------------
# replace_codes returns "" for missing cells, so one comparison on the stripped
# text covers both empty and missing entries.
missing_details_count = (df["Details"].map(str.strip) == "").sum()
print("Number of empty 'Details' entries:", missing_details_count)

# Save the cleaned table: the combining step further down reads this file, and
# the 2026 cell already writes its counterpart.
df.to_csv("malaysia_holidays_2024_cleaned.csv", index=False)

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re

# --- 1. Parse the saved HTML page --------------------------------------------
html_file = "Holidays and Observances in Malaysia in 2025.html"   # adjust path if needed
with open(html_file, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

# --- 2. Keep only the <tr class="showrow"> rows of table#holidays-table -------
rows = soup.select("table#holidays-table tbody tr.showrow")

# --- 3. Turn every row that has at least five cells into a record -------------
#     Cell layout assumed by the indices below: Date | (blank) | Name | Type | Details
row_cells = (row.find_all(["th", "td"]) for row in rows)
records = [
    {
        "Date":    cells[0].get_text(strip=True),
        "Name":    cells[2].get_text(strip=True),
        "Type":    cells[3].get_text(strip=True),
        "Details": cells[4].get_text(" ", strip=True),   # join nested text with spaces
    }
    for cells in row_cells
    if len(cells) >= 5
]

# --- 4. Build a DataFrame and take a quick look at it --------------------------
df = pd.DataFrame(records, columns=["Date", "Name", "Type", "Details"])
print(df.head())
print(len(df), "rows")    # number of rows scraped for 2025

# NOTE(review): the scraped frame above is only printed. `df` is rebound to the
# CSV below, and nothing visible in this notebook writes that CSV from the
# scrape -- confirm how "malaysia_holidays_2025.csv" is produced.
df = pd.read_csv("malaysia_holidays_2025.csv")

# --- 1. Convert 'Date' to dd/mm/yyyy format ----------------------------------
def convert_date(date_str, year=2025):
    """Convert a ``"<day> <abbreviated month>"`` string to ``dd/mm/yyyy``.

    The year is appended before parsing, so the day/month combination is
    validated against that specific calendar year.

    Args:
        date_str: Day and abbreviated month name, e.g. ``"1 Jan"``.
        year: Calendar year to attach to the date (defaults to 2025).

    Returns:
        The date formatted as ``dd/mm/yyyy``. When ``date_str`` is not a
        string or does not match the ``"%d %b"`` layout it is returned
        unchanged, so unparseable cells stay visible in the output.
    """
    try:
        # TypeError: non-string cell (e.g. NaN); ValueError: layout mismatch.
        # NOTE: "%b" follows the current locale's month abbreviations.
        dt = datetime.strptime(date_str + f" {year}", "%d %b %Y")
    except (ValueError, TypeError):
        return date_str  # leave as-is if parse fails
    return dt.strftime("%d/%m/%Y")

# Rewrite every "Date" cell element-wise; cells that fail to parse are kept as-is
df["Date"] = df["Date"].map(convert_date)

# --- 2. Replace state codes with full names ----------------------------------
# Three-letter codes as they appear in the "Details" text, mapped to full names
state_map = {
    "JHR": "Johor", "KDH": "Kedah", "KTN": "Kelantan", "MLK": "Melaka", "NSN": "Negeri Sembilan",
    "PHG": "Pahang", "PRK": "Perak", "PLS": "Perlis", "PNG": "Penang", "SGR": "Selangor",
    "TRG": "Terengganu", "SBH": "Sabah", "SWK": "Sarawak", "KUL": "Kuala Lumpur",
    "LBN": "Labuan", "PJY": "Putrajaya"
}

# One pattern that matches any code as a whole word. It is compiled once here
# rather than once per code for every row.
_state_code_pattern = re.compile(
    r"\b(?:" + "|".join(re.escape(code) for code in state_map) + r")\b"
)


def replace_codes(text):
    """Expand state codes in ``text`` to their full names.

    Args:
        text: A "Details" cell; may be missing (NaN/None).

    Returns:
        ``text`` with every whole-word code from ``state_map`` replaced by
        its full name, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    return _state_code_pattern.sub(lambda match: state_map[match.group(0)], text)

df["Details"] = df["Details"].map(replace_codes)

# --- 3. Count empty/missing "Details" values ---------------------------------
# replace_codes returns "" for missing cells, so one comparison on the stripped
# text covers both empty and missing entries.
missing_details_count = (df["Details"].map(str.strip) == "").sum()
print("Number of empty 'Details' entries:", missing_details_count)

# Save the cleaned table: the combining step further down reads this file, and
# the 2026 cell already writes its counterpart.
df.to_csv("malaysia_holidays_2025_cleaned.csv", index=False)

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re

# --- 1. Parse the saved HTML page --------------------------------------------
html_file = "Holidays and Observances in Malaysia in 2026.html"   # adjust path if needed
with open(html_file, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

# --- 2. Keep only the <tr class="showrow"> rows of table#holidays-table -------
rows = soup.select("table#holidays-table tbody tr.showrow")

# --- 3. Turn every row that has at least five cells into a record -------------
#     Cell layout assumed by the indices below: Date | (blank) | Name | Type | Details
row_cells = (row.find_all(["th", "td"]) for row in rows)
records = [
    {
        "Date":    cells[0].get_text(strip=True),
        "Name":    cells[2].get_text(strip=True),
        "Type":    cells[3].get_text(strip=True),
        "Details": cells[4].get_text(" ", strip=True),   # join nested text with spaces
    }
    for cells in row_cells
    if len(cells) >= 5
]

# --- 4. Build a DataFrame and take a quick look at it --------------------------
df = pd.DataFrame(records, columns=["Date", "Name", "Type", "Details"])
print(df.head())
print(len(df), "rows")    # number of rows scraped for 2026

# NOTE(review): the scraped frame above is only printed. `df` is rebound to the
# CSV below, and nothing visible in this notebook writes that CSV from the
# scrape -- confirm how "malaysia_holidays_2026.csv" is produced.
df = pd.read_csv("malaysia_holidays_2026.csv")

# --- 1. Convert 'Date' to dd/mm/yyyy format ----------------------------------
def convert_date(date_str, year=2026):
    """Convert a ``"<day> <abbreviated month>"`` string to ``dd/mm/yyyy``.

    The year is appended before parsing, so the day/month combination is
    validated against that specific calendar year.

    Args:
        date_str: Day and abbreviated month name, e.g. ``"1 Jan"``.
        year: Calendar year to attach to the date (defaults to 2026).

    Returns:
        The date formatted as ``dd/mm/yyyy``. When ``date_str`` is not a
        string or does not match the ``"%d %b"`` layout it is returned
        unchanged, so unparseable cells stay visible in the output.
    """
    try:
        # TypeError: non-string cell (e.g. NaN); ValueError: layout mismatch.
        # NOTE: "%b" follows the current locale's month abbreviations.
        dt = datetime.strptime(date_str + f" {year}", "%d %b %Y")
    except (ValueError, TypeError):
        return date_str  # leave as-is if parse fails
    return dt.strftime("%d/%m/%Y")

# Rewrite every "Date" cell element-wise; cells that fail to parse are kept as-is
df["Date"] = df["Date"].map(convert_date)

# --- 2. Replace state codes with full names ----------------------------------
# Three-letter codes as they appear in the "Details" text, mapped to full names.
# "PNG" maps to "Penang" (not "Pulau Pinang") so the 2026 data uses the same
# spelling as the 2023-2025 cells and the "Penang" entry in `state_cols`.
state_map = {
    "JHR": "Johor", "KDH": "Kedah", "KTN": "Kelantan", "MLK": "Melaka", "NSN": "Negeri Sembilan",
    "PHG": "Pahang", "PRK": "Perak", "PLS": "Perlis", "PNG": "Penang", "SGR": "Selangor",
    "TRG": "Terengganu", "SBH": "Sabah", "SWK": "Sarawak", "KUL": "Kuala Lumpur",
    "LBN": "Labuan", "PJY": "Putrajaya"
}

# One pattern that matches any code as a whole word. It is compiled once here
# rather than once per code for every row.
_state_code_pattern = re.compile(
    r"\b(?:" + "|".join(re.escape(code) for code in state_map) + r")\b"
)


def replace_codes(text):
    """Expand state codes in ``text`` to their full names.

    Args:
        text: A "Details" cell; may be missing (NaN/None).

    Returns:
        ``text`` with every whole-word code from ``state_map`` replaced by
        its full name, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    return _state_code_pattern.sub(lambda match: state_map[match.group(0)], text)

df["Details"] = df["Details"].map(replace_codes)

# --- 3. Count empty/missing "Details" values ---------------------------------
# replace_codes returns "" for missing cells, so one comparison on the stripped
# text covers both empty and missing entries.
missing_details_count = (df["Details"].map(str.strip) == "").sum()
print("Number of empty 'Details' entries:", missing_details_count)

# Save the cleaned table (without the index column)
df.to_csv("malaysia_holidays_2026_cleaned.csv", index=False)

In [None]:
import pandas as pd

# Read each year's cleaned CSV and tag every row with its calendar year
df_2023 = pd.read_csv("malaysia_holidays_2023_cleaned.csv").assign(Year=2023)
df_2024 = pd.read_csv("malaysia_holidays_2024_cleaned.csv").assign(Year=2024)
df_2025 = pd.read_csv("malaysia_holidays_2025_cleaned.csv").assign(Year=2025)
df_2026 = pd.read_csv("malaysia_holidays_2026_cleaned.csv").assign(Year=2026)

# Stack the four years into a single frame with a fresh 0..n-1 index
yearly_frames = [df_2023, df_2024, df_2025, df_2026]
combined_df = pd.concat(yearly_frames, ignore_index=True)

# Parse the dd/mm/yyyy strings. With errors="coerce", any cell that does not
# match the format becomes NaT instead of raising.
combined_df["Date"] = pd.to_datetime(
    combined_df["Date"],
    format="%d/%m/%Y",
    errors="coerce",
)

In [None]:
# Load the second source; its dates are ISO (yyyy-mm-dd) strings.
# Unparseable cells become NaT because of errors="coerce".
df_other = pd.read_csv(r" ... csv")
df_other["Date"] = pd.to_datetime(df_other["Date"], format="%Y-%m-%d", errors="coerce")

# Keep only rows dated 1 Jan 2023 .. 31 Dec 2026 (both ends inclusive).
# NaT dates fail the comparison and are dropped.
start_date = pd.to_datetime("01/01/2023", format="%d/%m/%Y")
end_date   = pd.to_datetime("31/12/2026", format="%d/%m/%Y")
in_window = df_other["Date"].between(start_date, end_date)
df_other_filtered = df_other[in_window]

# Outer join on Date: dates present in only one of the two sources are kept.
merged_df = combined_df.merge(df_other_filtered, how="outer", on="Date")

# 1. New Year's Day (& Holiday)
# 2. Birthday of Yang di-Pertuan Besar (& Holiday)
# 3. Chinese New Year Holiday
# 4. Federal Territory Day (& Holiday)
# 5. Thaipusam (& Holiday)
# 6. Valentine's Day
# 7. Isra and Mi-raj
# 8. Anniversary of the Coronation of the Sultan of Terengganu
# 9. Awal Ramadan
# 10. Birthday of the Sultan of Johor
# 11. Good Friday
# 12. Nuzul Al-Quran
# 13. Declaration of Melaka as Historical City
# 14. Hari Raya Aidilfitri Holiday
# 15. Birthday of the Sultan of Terengganu
# 16. Labour Day
# 17. Wesak Day
# 18. Pahang State Holiday
# 19. Harvest Festival
# 20. Gawai Dayak (& Holiday)
# 21. The Yang di-Pertuan Agong's Birthday
# 22. Sultan of Kedah's Birthday
# 23. Day of Arafat
# 24. Hari Raya Haji
# 25. George Town World Heritage City Day
# 26. Penang Governor's Birthday
# 27. Birthday of the Raja of Perlis
# 28. Awal Muharram (& Holiday)
# 29. Sarawak Independence Day
# 30. Birthday of the Sultan of Pahang (& Holiday)
# 31. Special Public Holiday
# 32. Almarhum Sultan Iskandar Holiday
# 33. Birthday of the Governor of Melaka
# 34. Malaysia's National Day (& Holiday)
# 35. Malaysia Day
# 36. The Prophet Muhammad's Birthday
# 37. Birthday of the Governor of Sabah (& Holiday)
# 38. Birthday of the Governor of Sarawak
# 39. Birthday of the Sultan of Perak
# 40. Birthday of the Sultan of Kelantan (& Holiday)
# 41. Deepavali Holiday
# 42. Birthday of the Sultan of Selangor
# 43. Football Win Holiday
# 44. Christmas Day
# 45. FA Cup Final Holiday
# 46. Independence Day Declaration Day 

In [None]:
df_manually_filtered = pd.read_excel(r" ... xlsx ")

# 1. Normalise the raw "States Affected" text so that each comma-separated
#    entry carries no surrounding whitespace:
#    - collapse any whitespace around commas
#    - strip whitespace at both ends of the cell
#    Vectorised string methods leave missing cells as NaN, whereas splitting
#    and re-joining each cell's list by hand raises TypeError on a missing cell.
df_manually_filtered["States Affected"] = (
    df_manually_filtered["States Affected"]
      .str.replace(r"\s*,\s*", ",", regex=True)
      .str.strip()
)

# 2. One 0/1 indicator column per distinct state name
state_dummies = df_manually_filtered["States Affected"].str.get_dummies(sep=",")

# 3. Append the indicator columns to the right of the original table
df_manually_filtered = pd.concat([df_manually_filtered, state_dummies], axis=1)

In [None]:
import pandas as pd
import numpy as np

# -------------------------------------------------
# (A)  Starting table BEFORE any date-shifting
#      ──  one row per holiday  ──
# -------------------------------------------------
# Columns used below: Date | Name | Day | one 0/1 column per state
base = df_manually_filtered.copy()          # author's note: 335 rows in the latest workbook

# Holiday names that are split off in step (C) and never expanded to ±7 days
special_names = ["Puasa", "Valentine's Day"]

# The 16 state columns that get melted into rows, in alphabetical order
state_cols = [
    "Johor", "Kedah", "Kelantan", "Kuala Lumpur",
    "Labuan", "Melaka", "Negeri Sembilan", "Pahang",
    "Penang", "Perak", "Perlis", "Putrajaya",
    "Sabah", "Sarawak", "Selangor", "Terengganu",
]

# -------------------------------------------------
# (B)  MELT   →  one row per (holiday, state) pair  (16 rows per holiday)
# -------------------------------------------------
long = base.melt(
    id_vars=["Date", "Name", "Day"],
    value_vars=state_cols,
    var_name="States Affected",     # state name taken from the column header
    value_name="dummy",             # 0/1 value from the one-hot columns
)
# Keep a copy of the unshifted date, since "Date" is overwritten in step (D)
long["CentralDate"] = long["Date"]

# -------------------------------------------------
# (C)  SPLIT  special vs expandable holidays
# -------------------------------------------------
is_special = long["Name"].isin(special_names)
to_expand   = long.loc[~is_special].copy()
no_expand   = long.loc[is_special].copy()

# -------------------------------------------------
# (D)  EXPAND dates  (-7 … +7)  **ONLY** for to_expand
# -------------------------------------------------
shifts = pd.DataFrame({"shift": np.arange(-7, 8)})      # 15 offsets: -7, …, 0, …, +7

# Cartesian product: every to_expand row is repeated once per offset
to_expand = to_expand.merge(shifts, how="cross")

# Move each copy's Date by its offset in days
day_offsets = pd.to_timedelta(to_expand["shift"], unit="D")
to_expand["Date"] = to_expand["Date"] + day_offsets

# -------------------------------------------------
# (E)  Public Holiday  &  Days From Holiday
# -------------------------------------------------
# Public Holiday is 1 only on the unshifted copy (shift == 0) of an expanded
# holiday. Every other row, including all special-name rows, gets 0.
to_expand["Public Holiday"] = (to_expand["shift"] == 0).astype("int64")
no_expand["Public Holiday"] = 0

# Days From Holiday is the signed offset. Special-name rows stay at zero.
to_expand["Days From Holiday"] = to_expand["shift"]
no_expand["Days From Holiday"] = 0

# NOTE(review): `dummy` (the 0/1 "state is affected" value from the melt) is
# dropped here without ever being read, so the flag above is set for every
# state row of a holiday -- confirm this is intended.
to_expand = to_expand.drop(columns=["shift", "dummy"])
no_expand = no_expand.drop(columns="dummy")

# -------------------------------------------------
# (F)  FINAL tidy table (author's note: 52 848 rows)
# -------------------------------------------------
# Stack expanded and special rows, keep the six output columns, and order by
# holiday name, then state, then date. Built once here.
final = (
    pd.concat([to_expand, no_expand], ignore_index=True)
      .loc[:, ["Date", "Name", "States Affected", "Day", "Public Holiday", "Days From Holiday"]]
      .sort_values(["Name", "States Affected", "Date"])
      .reset_index(drop=True)
)

# -------------------------------------------------
# Make Day follow the (possibly shifted) Date: the "Day" carried over from the
# workbook belongs to the unshifted date, so recompute the weekday name.
# -------------------------------------------------
final["Day"] = final["Date"].dt.day_name()

# -------------------------------------------------
# Puasa Count  –– per-year counter over distinct Puasa dates
#   • only filled on rows where Name == "Puasa"; all other rows keep 0
#   • rows sharing a date share a number (dense rank)
#   • numbering restarts at 1 for every calendar year
# -------------------------------------------------
final["Puasa Count"] = 0

puasa_mask = final["Name"] == "Puasa"

# Rank the Puasa dates within their own calendar year
puasa_dates = final.loc[puasa_mask, "Date"]
puasa_rank = (
    puasa_dates
        .groupby(puasa_dates.dt.year)
        .rank(method="dense")
        .astype(int)
)
final.loc[puasa_mask, "Puasa Count"] = puasa_rank

# Sanity check 1: first 20 distinct (Date, State, Puasa Count) combinations
puasa_preview = final.loc[puasa_mask, ["Date", "States Affected", "Puasa Count"]]
print(puasa_preview.drop_duplicates().head(20))

# Sanity check 2: highest Puasa Count reached in 2023 and in 2024
row_year = final["Date"].dt.year
for label, year in (("2023 max :", 2023), ("2024 max :", 2024)):
    print(label, final.loc[puasa_mask & (row_year == year), "Puasa Count"].max())


# Sanity check 3: overall size and number of flagged rows.
# Author's notes expected 52 848 rows and 212 holidays; the sum below counts
# flagged rows across all state rows.
print("Rows :", len(final))
print("Public Holiday == 1 :", final["Public Holiday"].sum())
print(final.head())

# Write the finished table to Excel without the index column
final.to_excel(" ... xlsx", index=False)