In [1]:
import pandas as pd
import unicodedata

# Load dataset
df = pd.read_csv("AbsenseReport_Cleaned_Final.csv")

# 1. Standardise column names
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

# 2. Fix encoding / special characters
def fix_text(x):
    if isinstance(x, str):
        return unicodedata.normalize("NFKC", x)
    return x

text_cols = df.select_dtypes(include="object").columns
df[text_cols] = df[text_cols].applymap(fix_text)

# 3. Parse dates (adjust column names if needed)
date_cols = [c for c in df.columns if "date" in c]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# 4. Time features for Power BI
if "start_date" in df.columns:
    df["year"] = df["start_date"].dt.year
    df["month"] = df["start_date"].dt.month
    df["month_year"] = df["start_date"].dt.strftime("%Y-%m")

# 5. Absence category mapping
def map_absence(x):
    x = str(x).lower()
    if "annual" in x:
        return "Annual"
    if any(k in x for k in ["sick", "medical", "dental", "maternity"]):
        return "Medical"
    if any(k in x for k in ["wfh", "home"]):
        return "Work From Home"
    if "travel" in x:
        return "Travel"
    return "Other"

if "absence_type" in df.columns:
    df["absence_category"] = df["absence_type"].apply(map_absence)

# 6. Numeric cleanup
if "absence_days" in df.columns:
    df["absence_days"] = pd.to_numeric(df["absence_days"], errors="coerce").fillna(0)

# 7. Final trim
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# 8. Export Power BI–ready file
output_path = "Absence_PowerBI_Ready.csv"
df.to_csv(output_path, index=False)

output_path


  df[text_cols] = df[text_cols].applymap(fix_text)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


'Absence_PowerBI_Ready.csv'

In [1]:
pip install tzdata

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import re
import unicodedata

INPUT_CSV  = "AbsenseReport_Cleaned_Final.csv"
OUTPUT_CSV = "Absence_PowerBI_Ready.csv"

# -----------------------------
# Helpers
# -----------------------------
def clean_colname(c: str) -> str:
    c = str(c).strip().lower()
    c = re.sub(r"[^\w\s]", "", c)          # remove punctuation
    c = re.sub(r"\s+", "_", c)             # spaces -> underscore
    c = re.sub(r"_+", "_", c).strip("_")   # collapse underscores
    return c

def fix_mojibake(s: str) -> str:
    """
    Fix common UTF-8-in-Latin1 mojibake like:
    'BarÄ±ÅŸ AkyÃ¼z' -> 'Barış Akyüz'
    """
    if not isinstance(s, str):
        return s
    s = s.strip()
    if not s:
        return s

    # First: normalize unicode form
    s_norm = unicodedata.normalize("NFKC", s)

    # Try: latin1 -> utf8 decode (common corruption)
    try:
        repaired = s_norm.encode("latin1", errors="strict").decode("utf-8", errors="strict")
        # If repair looks better (contains more non-ASCII letters in a reasonable way), keep it
        # (Very simple heuristic: repaired has fewer 'Ã'/'Å' artifacts)
        if ("Ã" in s_norm or "Å" in s_norm or "Ä" in s_norm) and not ("Ã" in repaired or "Å" in repaired or "Ä" in repaired):
            s_norm = repaired
    except Exception:
        pass

    return unicodedata.normalize("NFKC", s_norm).strip()

def map_absence_category(x: str) -> str:
    x = str(x).lower().strip()

    # annual
    if "annual" in x:
        return "Annual"

    # medical bucket
    if any(k in x for k in ["sick", "sickness", "medical", "dental", "maternity", "bereave", "compassion", "appointment"]):
        return "Medical"

    # WFH bucket (also catches "work from home", "home working" etc.)
    if any(k in x for k in ["wfh", "work from home", "working from home", "home working", "remote"]):
        return "Work From Home"

    # travel bucket
    if "travel" in x:
        return "Travel"

    return "Other"

def robust_to_date(series: pd.Series) -> pd.Series:
    """
    
    Parse to pandas datetime then drop time (Option 1: keep as date type).
    Uses dayfirst=True to match UK dd/mm formats.
    """
    dt = pd.to_datetime(series, errors="coerce", dayfirst=True)
    return dt.dt.date  # <- removes time; stays date-like for Power BI


# -----------------------------
# Load
# -----------------------------
df = pd.read_csv(INPUT_CSV)

# -----------------------------
# 1) Standardise column names
# -----------------------------
df.columns = [clean_colname(c) for c in df.columns]

# -----------------------------
# 2) Fix text columns (trim + encoding repair)
# -----------------------------
obj_cols = df.select_dtypes(include="object").columns
for c in obj_cols:
    df[c] = df[c].map(fix_mojibake)

# -----------------------------
# 3) Identify likely date columns and convert to DATE-ONLY (Option 1)
# -----------------------------
# Common date column names we might see
candidate_date_cols = []
for c in df.columns:
    if any(tok in c for tok in ["date", "start", "end", "from", "to"]):
        # avoid false positives like "updated_by" etc.
        if "name" not in c and "type" not in c and "reason" not in c:
            candidate_date_cols.append(c)

# Apply conversion to date-only for columns that actually parse like dates
for c in candidate_date_cols:
    parsed = pd.to_datetime(df[c], errors="coerce", dayfirst=True)
    if parsed.notna().mean() > 0.6:  # only convert if most values look like dates
        df[c] = parsed.dt.date

# Prefer a primary "start_date" for time slicing; create it if needed
# Adjust these aliases based on your real columns:
start_aliases = ["start_date", "start", "from_date", "date_from", "absence_start", "startdate"]
end_aliases   = ["end_date", "end", "to_date", "date_to", "absence_end", "enddate"]

def first_existing(cols):
    for c in cols:
        if c in df.columns:
            return c
    return None

start_col = first_existing(start_aliases)
end_col   = first_existing(end_aliases)

# If you have a single date column (e.g., "date"), treat it as start_date
if start_col is None and "date" in df.columns:
    start_col = "date"

# -----------------------------
# 4) Add Power BI-friendly time features
# -----------------------------
if start_col is not None:
    # Convert the date-only back to datetime temporarily for easy feature extraction
    start_dt = pd.to_datetime(df[start_col], errors="coerce")
    df["year"] = start_dt.dt.year
    df["month"] = start_dt.dt.month
    df["month_name"] = start_dt.dt.strftime("%b")         # Jan, Feb...
    df["month_year"] = start_dt.dt.strftime("%Y-%m")      # 2026-01 (sortable)
    df["week_start"] = (start_dt - pd.to_timedelta(start_dt.dt.weekday, unit="D")).dt.date  # Monday week start

# -----------------------------
# 5) Create a clean absence_category
# -----------------------------
# Try to find the absence type column
type_aliases = ["absence_type", "type", "absence", "leave_type", "category", "reason"]
type_col = first_existing(type_aliases)

if type_col is not None:
    df["absence_category"] = df[type_col].map(map_absence_category)

# -----------------------------
# 6) Numeric cleanup (days/hours)
# -----------------------------
numeric_aliases = ["absence_days", "days", "duration_days", "total_days", "days_absent", "duration"]
num_col = first_existing(numeric_aliases)
if num_col is not None:
    df[num_col] = pd.to_numeric(df[num_col], errors="coerce")

# If there is no numeric duration, try deriving it from start/end (inclusive)
if num_col is None and start_col is not None and end_col is not None:
    s = pd.to_datetime(df[start_col], errors="coerce")
    e = pd.to_datetime(df[end_col], errors="coerce")
    df["absence_days"] = (e - s).dt.days + 1
    df["absence_days"] = df["absence_days"].where(df["absence_days"].notna(), 0).clip(lower=0)

# Fill numeric NaNs to 0 only where that makes sense (duration measures)
for c in ["absence_days"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)

# -----------------------------
# 7) Final string trim (again, safe)
# -----------------------------
obj_cols = df.select_dtypes(include="object").columns
for c in obj_cols:
    df[c] = df[c].astype(str).str.strip().replace({"nan": None, "None": None, "": None})

# -----------------------------
# 8) Export Power BI-ready CSV
# -----------------------------
df.to_csv(OUTPUT_CSV, index=False)

print("Saved Power BI-ready file to:", OUTPUT_CSV)
print("Columns:", list(df.columns))


Saved Power BI-ready file to: Absence_PowerBI_Ready.csv
Columns: ['first_name', 'last_name', 'team_names', 'leave_entitlement', 'entitlement_unit', 'absence_type', 'absence_duration_total_in_days', 'absence_duration_for_period_in_days', 'absence_description', 'absence_start_date', 'absence_end_date', 'organisation', 'suborganisation', 'absence_category']


  parsed = pd.to_datetime(df[c], errors="coerce", dayfirst=True)
  parsed = pd.to_datetime(df[c], errors="coerce", dayfirst=True)
