# BT4103 Data Cleaning
Please read through this notebook and let me know if you spot any issues with the cleaning steps.

## Import packages and datasets
All packages used in this notebook are imported here for ease of reference. Please substitute your own file paths below so that the data can be loaded correctly.

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
import unicodedata
import datetime
import numpy as np
import string

### Importing dataset
Add filepath below

In [None]:
# Main dataset that the rest of the notebook cleans.
data = pd.read_excel("oots-cleaned2.xlsx")
# NOTE(review): if this whole span is one notebook cell, this bare expression is
# not the last statement, so it presumably displays nothing — confirm.
data

# Second workbook; attach_constant_dates looks rows up here by OPERATION_ID when
# a row's own start date is missing or outside the accepted range.
validation_data = pd.read_excel("oots-cleaned-unlocked.xlsx")

## Define Helper Functions
Here are the helper functions that I have created for easier readability in the actual code below.

In [None]:
def OHE(df, col, drop_first=False):
    """One-hot encode column ``col`` of ``df``.

    Parameters:
        df: source DataFrame; it is not modified.
        col: name of the column to encode.
        drop_first: forwarded to ``pd.get_dummies``.

    Returns:
        (new_df, legend): ``new_df`` is ``df`` without ``col`` and with the
        dummy columns appended; ``legend`` maps each dummy column name to the
        category it stands for.
    """
    dummies = pd.get_dummies(df[col], prefix=col, drop_first=drop_first)
    prefix = f"{col}_"

    def _category_of(dummy_name):
        # Strip the prefix once, from the front only: a category that itself
        # contains "<col>_" (e.g. "A_x" in column "A") must stay intact.
        name = str(dummy_name)
        return name[len(prefix):] if name.startswith(prefix) else name

    legend = {new_col: _category_of(new_col) for new_col in dummies.columns}
    df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
    return df, legend


def LabelEncode(df, col):
    """Label-encode column ``col`` of ``df`` in place.

    Values are cast to ``str`` before encoding. Returns ``(df, legend)`` where
    ``legend`` maps each class label to its integer code.
    """
    encoder = LabelEncoder()
    as_text = df[col].astype(str)
    df[col] = encoder.fit_transform(as_text)
    labels = encoder.classes_
    codes = encoder.transform(labels)
    return df, dict(zip(labels, codes))

def inspect_column(df, col, top_n=20):
    """Print the distinct-value count and the ``top_n`` most frequent values of ``df[col]``.

    Missing values are counted as a value of their own in both figures.
    """
    print(f"Column: {col}")
    column = df[col]
    distinct = column.nunique(dropna=False)
    print(f"Unique values: {distinct}")
    print("\nTop value counts:")
    frequencies = column.value_counts(dropna=False)
    print(frequencies.head(top_n))

def apply_mapping(df, col, mapping, new_col_suffix="_clean"):
    """Write ``df[col]`` with ``mapping`` applied into a new column.

    The new column is named ``col + new_col_suffix`` and is added to ``df``
    in place; values without an entry in ``mapping`` are kept unchanged.
    Returns the same ``df``.
    """
    remapped = df[col].replace(mapping)
    df[f"{col}{new_col_suffix}"] = remapped
    return df

def normalize_text(df, col, to_lower=True, replace_symbols=True, unknown_vals=None):
    """Normalise the text of column ``col`` of ``df`` in place.

    Steps: cast to ``str``, strip, optionally lower-case, collapse runs of
    whitespace, optionally replace ``-``, ``_`` and ``/`` with a space, and
    finally replace whole-cell unknown tokens with ``"0"``.

    Parameters:
        df: DataFrame to modify.
        col: column name.
        to_lower: lower-case the text when True.
        replace_symbols: replace ``-``, ``_`` and ``/`` with spaces when True.
        unknown_vals: a token or iterable of tokens; a cell equal to one of
            them becomes ``"0"``.

    Returns:
        The same ``df``.
    """
    s = df[col].astype(str).str.strip()
    if to_lower:
        s = s.str.lower()
    s = s.str.replace(r"\s+", " ", regex=True)

    tokens = []
    is_unknown = None
    if unknown_vals:
        tokens = [unknown_vals] if isinstance(unknown_vals, str) else list(unknown_vals)
        # Match before symbols are replaced: tokens such as "n/a" or "-" no
        # longer exist in the text once "/" and "-" have become spaces.
        is_unknown = s.isin(tokens)

    if replace_symbols:
        s = s.str.replace(r"[-_/]", " ", regex=True)

    if is_unknown is not None:
        # Also match after symbol replacement, as the previous version did.
        is_unknown = is_unknown | s.isin(tokens)
        s = s.mask(is_unknown, "0")

    df[col] = s
    return df

def _normalize_text_series(s: pd.Series) -> pd.Series:
    """Return a normalised copy of the text Series ``s``.

    Each value is cast to ``str``, NFKC-normalised, stripped, lower-cased and
    whitespace-collapsed. Slashes are then spaced as `` / ``, commas as ``, ``,
    and leading/trailing spaces and commas are removed.
    """
    def _nfkc(text):
        return unicodedata.normalize("NFKC", text)

    cleaned = s.astype(str).map(_nfkc).str.strip().str.lower()
    # Order matters: collapse whitespace first, then re-space the separators.
    for pattern, replacement in (
        (r"\s+", " "),
        (r"\s*/\s*", " / "),
        (r"\s*,\s*", ", "),
    ):
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    return cleaned.str.strip(" ,")

def _remove_trailing_code_in_parens(name_s: pd.Series, code_s: pd.Series) -> pd.Series:
    """Strip a trailing ``(<code>)`` suffix from each name.

    For every row whose code (cast to ``str``, stripped, upper-cased) is
    non-empty, a case-insensitive ``(code)`` at the end of the name is removed.
    Leading/trailing spaces and commas are then stripped from every name.
    Returns a new Series; ``name_s`` is not modified.
    """
    codes = code_s.astype(str).str.strip().str.upper()
    has_code = codes.notna() & codes.ne("")
    result = name_s.copy()

    stripped = []
    for name, code in zip(result.loc[has_code].tolist(), codes.loc[has_code].tolist()):
        suffix_regex = r"\(\s*{}\s*\)\s*$".format(re.escape(code))
        stripped.append(re.sub(suffix_regex, "", name, flags=re.IGNORECASE))
    result.loc[has_code] = stripped

    return result.str.strip(" ,")

def _choose_canonical_name(name_series: pd.Series) -> str:
    """Pick one representative name from ``name_series``.

    Nulls, blank strings and the literal ``"unknown"`` are ignored. The most
    frequent remaining name wins; ties go to the longest name. Returns ``"0"``
    when nothing usable remains.
    """
    names = name_series.dropna().astype(str)
    for excluded in ("", "unknown"):
        keep = names.str.strip().ne(excluded).values
        names = names[keep]
    if names.empty:
        return "0"

    counts = names.value_counts()
    best = counts.iloc[0]
    tied = counts[counts.eq(best)].index.tolist()
    return max(tied, key=len)

def build_operation_legend_and_drop_nature( #Nature cleaning
    df: pd.DataFrame,
    code_col: str = "OPERATION_CODE",
    nature_col: str = "NATURE",
    drop_nature: bool = True,
    keep_title_case_copy: bool = False,
    unknown_tokens = ("0", "na", "n/a", "-", "null", "nan")
):
    """Build a code -> name legend from ``nature_col`` and optionally drop that column.

    Parameters:
        df: source DataFrame; it is not modified (a copy is returned).
        code_col: column holding the operation code.
        nature_col: column holding the free-text operation name.
        drop_nature: drop ``nature_col`` from the returned copy when True.
        keep_title_case_copy: add an ``operation_name_title`` column to the legend.
        unknown_tokens: normalised names treated as unknown.

    Returns:
        (df_out, legend): ``legend`` has one row per non-blank operation code
        with the columns ``operation_code`` and ``operation_name``.

    Raises:
        KeyError: if ``code_col`` or ``nature_col`` is not a column of ``df``.
    """
    if code_col not in df.columns or nature_col not in df.columns:
        raise KeyError(f"Expected columns '{code_col}' and '{nature_col}' in df.")

    # Blank out missing codes before the string cast. Otherwise NaN becomes the
    # literal "NAN", passes the blank-code filter below and gets a legend row.
    raw_codes = df[code_col]
    codes = raw_codes.where(raw_codes.notna(), "").astype(str).str.strip().str.upper()

    name_norm = _normalize_text_series(df[nature_col])
    name_norm = name_norm.replace(list(unknown_tokens), "unknown")
    name_clean = _remove_trailing_code_in_parens(name_norm, codes)

    tmp = pd.DataFrame({"operation_code": codes, "operation_name": name_clean})
    tmp = tmp[tmp["operation_code"].str.len() > 0]
    legend = (
        tmp.groupby("operation_code", as_index=False)["operation_name"]
           .apply(_choose_canonical_name)
    )
    if keep_title_case_copy:
        legend["operation_name_title"] = legend["operation_name"].str.title()

    df_out = df.copy()
    if drop_nature:
        df_out.drop(columns=[nature_col], inplace=True, errors="ignore")
    return df_out, legend

  
def clean_equipment( #EQUIPMENT cleaning
    df,
    col="EQUIPMENT",
    sep=";",
    tags_to_strip=(r"#nuh",),
    unknown_vals=("0","na","n/a","-","null","nan",""),
    synonym_map=None
):
    """Clean a ``sep``-separated multi-value text column in place.

    Tags in ``tags_to_strip`` (with an optional trailing ``_`` or ``-``) are
    removed case-insensitively, the column is passed through ``normalize_text``,
    and each cell is then split on ``sep``. Tokens are trimmed, mapped through
    ``synonym_map``, de-duplicated, sorted and re-joined with ``sep`` plus a
    space. A cell left with no tokens becomes ``"unknown"``.

    Returns the same ``df``.
    """
    synonyms = {} if synonym_map is None else synonym_map

    tag_regex = r"|".join(fr"{re.escape(tag)}[_-]?" for tag in tags_to_strip)
    as_text = df[col].astype(str)
    df[col] = as_text.str.replace(tag_regex, "", regex=True, flags=re.IGNORECASE)

    df = normalize_text(df, col, to_lower=True, replace_symbols=True, unknown_vals=unknown_vals)

    splitter = re.compile(rf"\s*{re.escape(sep)}\s*")
    joiner = f"{sep} "

    def _clean_token(tok: str) -> str:
        trimmed = tok.strip()
        if not trimmed:
            return ""
        collapsed = re.sub(r"\s+", " ", trimmed)
        mapped = synonyms.get(collapsed, collapsed)
        return "" if mapped in ("unknown",) else mapped

    def _process_cell(cell: str) -> str:
        if not cell:
            return "unknown"
        tokens = {tok for tok in map(_clean_token, splitter.split(cell)) if tok}
        if not tokens:
            return "unknown"
        return joiner.join(sorted(tokens))

    df[col] = df[col].apply(_process_cell)
    return df

## General Cleaning
We drop the index and case-number columns because they are row labels rather than features, and the patient-name column because its contents have already been completely removed from the dataset.

In [None]:
# Drop the leading INDEX column (addressed by position) together with the
# PATIENT_NAME and CASE_NUMBER columns in a single statement. If the cell is
# re-run, the KeyError for the already-removed columns is raised before `data`
# is reassigned, so a re-run can no longer silently drop whichever column has
# become the first one.
data = data.drop(columns=[data.columns[0], "PATIENT_NAME", "CASE_NUMBER"])
data

## Ensure correct data types
Not yet done; still waiting on an update from Joey regarding the date columns.

In [None]:
# Print a summary of the DataFrame for a manual check of the column types;
# no conversion is performed here.
data.info()

## Handle Missing Data
Handling missing data is important to ensure that we do not run into any issues with the EDA, as well as our AI/ML implementations. It is a vital step in data cleaning to ensure that the dataset can be used efficiently and properly.

### Handling optional columns
These columns may legitimately be blank because some operations have nothing to record in them, so we fill the blanks with 0.

In [None]:
# Fill blanks in the optional columns with 0.
# Assign the result back instead of calling fillna(..., inplace=True) on
# data[col]: that is chained assignment on a column selection, which pandas
# warns about and which leaves `data` unchanged once Copy-on-Write is active.
optional_cols = ["Delay_Reason", "Remarks", "IMPLANT", "EQUIPMENT", "EMERGENCY_PRIORITY"]
data[optional_cols] = data[optional_cols].fillna(0)

### Error Correction (By Eyeballing)

- Date for row OPERATION_ID == 582117 should be 2010-04-19.

In [None]:
# Planned-timeline timestamp columns, in the order the code scans them.
planned_cols = [
    "PLANNED_PATIENT_CALL_TIME",
    "PLANNED_PATIENT_FETCH_TIME",
    "PLANNED_RECEPTION_IN_TIME",
    "PLANNED_ENTER_OR_TIME",
    "PLANNED_SURGERY_PREP_TIME",
    "PLANNED_ANAESTHESIA_INDUCTION",
    "PLANNED_KNIFE_TO_SKIN_TIME",
    "PLANNED_SKIN_CLOSURE",
    "PLANNED_PATIENT_REVERSAL_TIME",
    "PLANNED_EXIT_OR_TIME",
    "PLANNED_OR_CLEANUP_TIME",
    "PLANNED_EXIT_RECOVERY_TIME",
]

# Actual-timeline counterparts, listed in the same event order.
actual_cols = [
    "PATIENT_CALL_TIME",
    "PATIENT_FETCH_TIME",
    "ACTUAL_RECEPTION_IN_TIME",
    "ACTUAL_ENTER_OR_TIME",
    "ACTUAL_SURGERY_PREP_TIME",
    "ACTUAL_ANAESTHESIA_INDUCTION",
    "ACTUAL_KNIFE_TO_SKIN_TIME",
    "ACTUAL_SKIN_CLOSURE",
    "ACTUAL_PATIENT_REVERSAL_TIME",
    "ACTUAL_EXIT_OR_TIME",
    "ACTUAL_OR_CLEANUP_TIME",
    "ACTUAL_EXIT_RECOVERY_TIME",
]

# target row
# OPERATION_ID of the row whose date is being corrected, and the date it
# should carry.
target_id = 582117
fix_date = pd.Timestamp("2010-04-19")

# Boolean selector for the target row(s) in `data`.
mask = data["OPERATION_ID"] == target_id

def force_date(val, base_date):
    """Return ``val`` moved onto ``base_date`` while keeping its time of day.

    Accepts a ``pd.Timestamp``, a ``datetime.time`` or anything that
    ``pd.to_datetime`` can parse. Missing values, unparseable values and values
    whose parsing raises are returned unchanged.
    """
    if pd.isna(val):
        return val

    # Values that already carry a usable time of day need no parsing.
    clock = None
    if isinstance(val, pd.Timestamp):
        clock = val.time()
    elif isinstance(val, datetime.time):
        clock = val
    if clock is not None:
        return pd.Timestamp.combine(base_date, clock)

    # Anything else (strings, other objects): best-effort parse.
    try:
        parsed = pd.to_datetime(val, errors="coerce")
        return val if pd.isna(parsed) else pd.Timestamp.combine(base_date, parsed.time())
    except Exception:
        return val

# Re-date every planned and then every actual timestamp of the target row,
# skipping columns that are absent from the dataset.
for col in planned_cols + actual_cols:
    if col not in data.columns:
        continue
    data.loc[mask, col] = data.loc[mask, col].apply(lambda v: force_date(v, fix_date))

data.iloc[:, 7:31]

### Date Handling and Imputation Rules

This section handles messy date/time data by enforcing **consistent start dates** and applying **domain-specific sync rules**.

---

#### 1. Detecting Date vs. Time-only Strings
- `_looks_like_date_string(s)` → checks if a string contains a date-like pattern (`YYYY-MM-DD`, `DD/MM/YYYY`, etc.).  
- `_is_time_only_string(s)` → checks if a string looks like a time-only entry (`08:15`, `8:15:00 AM`, etc.).

This distinction allows us to avoid misinterpreting time-only values as full datetimes.

---

#### 2. Determining Constant Start Dates
- `find_start_date_from_row(row, cols)` scans a list of columns in a row and finds the **first valid date**.  
  - A valid date is a `Timestamp` with `year > 1900` or a parseable date string.  
  - Returns the **normalized date** (time set to 00:00:00).  

- `attach_constant_dates(row, planned_cols, actual_cols)`:
  - Finds one **planned_start** and one **actual_start** per row.
  - If `actual_start` is missing but `planned_start` exists, use the planned date as fallback.
  - For any time-only strings, attach the corresponding start date to construct a full `Timestamp`.
  - Full datetime values are preserved as-is.

---

#### 2b. Date Sanitisation 
We discovered that some of the input data, while they had dates, had **corrupted or incorrectly manipulated dates**.  
To ensure all downstream imputations are built on reliable timelines, we enforce a **sanity check**:

1. **Valid date range**  
   - Earliest allowed date: **2016-12-31**  
   - Latest allowed date: **2022-02-25**

2. **Correction procedure**  
   - For each row, check if `planned_start` and `actual_start` fall within the valid range.  
   - If either date is outside this range, attempt to backfill the correct value from  
     `oots-cleaned-unlocked.xlsx` using `OPERATION_ID`.  
   - If no match is found in the validation file, replace the invalid date with `NaT`.

3. **Guarantees after cleaning**  
   - Every `planned_start` and `actual_start` is either:
     - Within the valid range, or  
     - Backfilled from the validation dataset, or  
     - Explicitly marked as `NaT` if no trusted source is available.  

This step ensures that **all subsequent imputations** (e.g., filling missing times)  
operate only on dates within the trusted window.

---

#### 3. Row-wise Imputation Rules
- `impute_with_rules(row, planned_cols, actual_cols)`:
  1. **Attach constant start dates** using `attach_constant_dates`.
  2. **Sync critical columns**:
     - `PLANNED_PATIENT_CALL_TIME` ↔ `PLANNED_PATIENT_FETCH_TIME`  
     - `PLANNED_OR_CLEANUP_TIME` ↔ `PLANNED_EXIT_OR_TIME`  
     Preference is given to whichever value exists.
  3. **Enforce ordering constraints**:
     - Knife-to-skin ≤ Skin closure ≤ Patient reversal ≤ Exit OR ≤ Exit recovery ≤ OR cleanup
     - If any step goes backwards, adjust forward.
  4. **Fill missing anchor values**:
     - If missing, `PLANNED_ANAESTHESIA_INDUCTION` and `PLANNED_SURGERY_PREP_TIME` are set to `PLANNED_KNIFE_TO_SKIN_TIME`.
     - If missing, `PLANNED_PATIENT_REVERSAL_TIME` is set to `PLANNED_SKIN_CLOSURE`.

In [None]:
# Bounds of the accepted date window used by attach_constant_dates.
MIN_DATE = pd.Timestamp("2016-12-31")
MAX_DATE = pd.Timestamp("2022-02-25")

# Log of date-replacement messages appended by attach_constant_dates.
# NOTE(review): this name matches the standard-library `warnings` module; the
# module is not imported in the visible code, but a later import would clash.
warnings = []

# Whole string is a clock time: H:MM or HH:MM, optional :SS, optional AM/PM.
_time_only_re = re.compile(r'^\s*\d{1,2}:\d{2}(:\d{2})?\s*(?:[AaPp][Mm])?\s*$')
# String contains a Y-M-D style date, or a D-M-Y / M-D-Y style date, with '-' or '/'.
_date_like_re = re.compile(r'\d{4}[-/]\d{1,2}[-/]\d{1,2}|\d{1,2}[-/]\d{1,2}[-/]\d{2,4}')

def _looks_like_date_string(s: str) -> bool:
    """Return True when ``s`` is a string containing a date-like pattern.

    Non-string input always yields False. The check is purely pattern based
    (see ``_date_like_re``); it does not validate the date.
    """
    return isinstance(s, str) and bool(_date_like_re.search(s.strip()))

def _is_time_only_string(s: str) -> bool:
    """Return True when ``s`` is a string holding only a clock time.

    Examples that match: ``'08:15'`` and ``'8:15:00 AM'``. Non-string input
    always yields False.
    """
    return isinstance(s, str) and bool(_time_only_re.match(s.strip()))

def find_start_date_from_row(row, cols):
    """Return the first real calendar date found in ``row`` among ``cols``.

    Columns are scanned in the given order; absent columns and missing values
    are skipped. A value carries a date when it is:
      - a ``pd.Timestamp`` or ``datetime.datetime`` with ``year > 1900``, or
      - something whose string form looks like a date and parses (month-first)
        to a timestamp with ``year > 1900``.

    Returns the date normalised to midnight, or ``None`` when no column
    qualifies.
    """
    def _date_in(val):
        # Timestamps are decided on their year alone; an early year is treated
        # as a parsed time-only value and is not re-examined as text.
        if isinstance(val, pd.Timestamp):
            return val.normalize() if val.year > 1900 else None
        if isinstance(val, datetime.datetime) and val.year > 1900:
            return pd.Timestamp(val).normalize()

        try:
            text = str(val).strip()
        except Exception:
            return None
        if not _looks_like_date_string(text):
            return None  # likely time-only

        parsed = pd.to_datetime(text, errors="coerce", dayfirst=False)
        if pd.isna(parsed) or parsed.year <= 1900:
            return None
        return parsed.normalize()

    for col in cols:
        if col not in row.index:
            continue
        val = row[col]
        if pd.isna(val):
            continue
        found = _date_in(val)
        if found is not None:
            return found
    return None


def attach_constant_dates(row, planned_cols, actual_cols):
    """Give every planned/actual time value in ``row`` a trusted calendar date.

    Steps:
      1. ``planned_start`` / ``actual_start`` = first real date found among
         ``planned_cols`` / ``actual_cols``.
      2. If a start is missing or its date lies outside MIN_DATE..MAX_DATE, try
         to read it from ``validation_data`` (matched on OPERATION_ID). Each
         replacement or failure is appended to the module-level ``warnings``
         list.
      3. If ``actual_start`` is still unknown, fall back to ``planned_start``.
      4. In each column: time-only strings get the matching start date
         attached; full datetimes whose date is in range are kept; datetimes
         out of range are moved onto the start date (or set to NaT when no
         start date is known).

    The range check compares calendar dates, so any time of day on MAX_DATE
    itself counts as in range.

    Parameters:
        row: one record as a Series; it is modified in place.
        planned_cols: ordered names of the planned-timeline columns.
        actual_cols: ordered names of the actual-timeline columns.

    Returns:
        The same ``row``.
    """
    low, high = MIN_DATE.normalize(), MAX_DATE.normalize()

    def _in_range(ts):
        # Compare on the calendar date; comparing the full timestamp against
        # midnight of MAX_DATE would reject every time on the last allowed day.
        if not isinstance(ts, pd.Timestamp) or pd.isna(ts):
            return False
        return low <= ts.normalize() <= high

    def _try_validation_lookup(opid, cols):
        """Return the start date of the validation row with this OPERATION_ID, or None."""
        if opid is None:
            return None
        try:
            ids = validation_data["OPERATION_ID"]
            matches = validation_data.loc[ids == opid]
            if matches.empty:
                # A type mismatch (e.g. int vs. str ids) gives an empty result
                # rather than an exception, so retry on the string form.
                matches = validation_data.loc[ids.astype(str) == str(opid)]
        except Exception:
            # Best effort, as before: any lookup failure means "no replacement".
            return None
        if matches.empty:
            return None
        return find_start_date_from_row(matches.iloc[0], cols)

    opid = row.get("OPERATION_ID", None)

    def _resolve_start(found, cols, label):
        """Return a trusted start date for one timeline, logging what happened."""
        if _in_range(found):
            return found
        replacement = _try_validation_lookup(opid, cols)
        if _in_range(replacement):
            warnings.append(f"OPERATION_ID={opid}: {label} replaced from validation file ({replacement.date()}).")
            return replacement
        warnings.append(f"OPERATION_ID={opid}: {label} {found} out of range or missing; no valid replacement found in validation file.")
        return None

    planned_start = _resolve_start(
        find_start_date_from_row(row, planned_cols), planned_cols, "planned_start"
    )
    actual_start = _resolve_start(
        find_start_date_from_row(row, actual_cols), actual_cols, "actual_start"
    )

    # Fallback: an unknown actual date is assumed to equal the planned date.
    if actual_start is None and planned_start is not None:
        actual_start = planned_start

    def _rebase(ts, start):
        """Keep an in-range timestamp; otherwise move its time of day onto ``start``."""
        if _in_range(ts):
            return ts
        if start is not None:
            return pd.Timestamp.combine(start, ts.time())
        return pd.NaT

    def _fill(cols, start):
        for col in cols:
            if col not in row.index:
                continue
            val = row[col]
            if pd.isna(val):
                continue

            if isinstance(val, (pd.Timestamp, datetime.datetime)):
                row[col] = _rebase(pd.Timestamp(val), start)
                continue

            text = str(val).strip()
            if _is_time_only_string(text):
                # Only the time of day is used; the date part of the parse
                # result is discarded. Left untouched when no start is known.
                if start is not None:
                    parsed = pd.to_datetime(text, errors="coerce")
                    if not pd.isna(parsed):
                        row[col] = pd.Timestamp.combine(start, parsed.time())
                continue

            parsed = pd.to_datetime(text, errors="coerce", dayfirst=False)
            if not pd.isna(parsed):
                row[col] = _rebase(parsed, start)

    _fill(planned_cols, planned_start)
    _fill(actual_cols, actual_start)

    return row


def impute_with_rules(row, planned_cols, actual_cols):
    """Apply the row-level imputation rules to the planned timeline.

    Step 1: attach constant planned/actual start dates (``attach_constant_dates``).
    Step 2: sync paired columns, copying whichever value exists:
            PLANNED_PATIENT_CALL_TIME <-> PLANNED_PATIENT_FETCH_TIME (fetch wins)
            PLANNED_OR_CLEANUP_TIME  <-> PLANNED_EXIT_OR_TIME (exit wins)
    Step 3: enforce event ordering; a later event that precedes the earlier one
            is moved forward to it.
    Step 4: fill missing anaesthesia-induction and surgery-prep times from
            knife-to-skin, and a missing reversal time from skin closure.

    Parameters:
        row: one record as a Series; it is modified in place.
        planned_cols, actual_cols: forwarded to ``attach_constant_dates``.

    Returns:
        The same ``row``.
    """
    # --- Step 1: attach constant dates ---
    row = attach_constant_dates(row, planned_cols, actual_cols)

    # --- Step 2: enforce logical sync rules ---
    def sync_cols(col_a, col_b, prefer="a"):
        """Make two columns equal; ``prefer`` ("a" or "b") wins when both are set."""
        # Skip pairs that are not both present, so that syncing never adds a
        # new column to the row (the other rules below guard the same way).
        if col_a not in row.index or col_b not in row.index:
            return
        a, b = row[col_a], row[col_b]
        if pd.isna(a) and pd.notna(b):
            row[col_a] = b
        elif pd.isna(b) and pd.notna(a):
            row[col_b] = a
        elif pd.notna(a) and pd.notna(b):
            if prefer == "a":
                row[col_b] = a
            else:
                row[col_a] = b

    # Rule 1: PLANNED_PATIENT_CALL_TIME == PLANNED_PATIENT_FETCH_TIME (fetch wins)
    sync_cols("PLANNED_PATIENT_CALL_TIME", "PLANNED_PATIENT_FETCH_TIME", prefer="b")

    # Rule 2: PLANNED_OR_CLEANUP_TIME == PLANNED_EXIT_OR_TIME (exit wins)
    sync_cols("PLANNED_OR_CLEANUP_TIME", "PLANNED_EXIT_OR_TIME", prefer="b")

    # --- Step 3: ordering constraints (only if both values are present) ---
    def enforce_order(before, after):
        if before not in row.index or after not in row.index:
            return
        if pd.isna(row[before]) or pd.isna(row[after]):
            return
        try:
            if row[after] < row[before]:
                row[after] = row[before]
        except (TypeError, ValueError):
            # Values of incomparable types (e.g. a leftover string next to a
            # Timestamp) are left untouched rather than aborting the row.
            pass

    enforce_order("PLANNED_KNIFE_TO_SKIN_TIME", "PLANNED_SKIN_CLOSURE")
    enforce_order("PLANNED_SKIN_CLOSURE", "PLANNED_PATIENT_REVERSAL_TIME")
    enforce_order("PLANNED_PATIENT_REVERSAL_TIME", "PLANNED_EXIT_OR_TIME")
    enforce_order("PLANNED_EXIT_OR_TIME", "PLANNED_EXIT_RECOVERY_TIME")
    enforce_order("PLANNED_EXIT_RECOVERY_TIME", "PLANNED_OR_CLEANUP_TIME")

    # --- Step 4: fill missing times from anchors ---
    knife = row.get("PLANNED_KNIFE_TO_SKIN_TIME", pd.NaT)
    closure = row.get("PLANNED_SKIN_CLOSURE", pd.NaT)

    if pd.notna(knife):
        for target in ("PLANNED_ANAESTHESIA_INDUCTION", "PLANNED_SURGERY_PREP_TIME"):
            if target in row.index and pd.isna(row[target]):
                row[target] = knife

    if pd.notna(closure):
        if "PLANNED_PATIENT_REVERSAL_TIME" in row.index and pd.isna(row["PLANNED_PATIENT_REVERSAL_TIME"]):
            row["PLANNED_PATIENT_REVERSAL_TIME"] = closure

    return row

# Apply row-wise
# Runs impute_with_rules on every row and replaces `data` with the result;
# attach_constant_dates appends its messages to `warnings` along the way.
data = data.apply(lambda r: impute_with_rules(r, planned_cols, actual_cols), axis=1)
# Show a slice of columns by position to inspect the result.
data.iloc[:, 7:31]

Date processing for ACTUAL columns

In [None]:
def impute_patient_times(row):
    """Fill missing PATIENT_CALL_TIME and PATIENT_FETCH_TIME in one row.

    Step 1: an empty call time is copied from ACTUAL_RECEPTION_IN_TIME.
    Step 2: an empty fetch time becomes the midpoint between call and
            reception, floored to the minute; if only one of the two is
            known, that value (floored to the minute) is used.

    A value that cannot be parsed as a timestamp is treated as missing.

    Parameters:
        row: one record as a Series; it is modified in place.

    Returns:
        The same ``row``.
    """
    fetch_col = "PATIENT_FETCH_TIME"
    call_col = "PATIENT_CALL_TIME"
    reception_col = "ACTUAL_RECEPTION_IN_TIME"

    def _to_ts(val):
        """Coerce ``val`` to a Timestamp; return None when missing or unparseable."""
        if pd.isna(val):
            return None
        if isinstance(val, pd.Timestamp):
            return val
        try:
            parsed = pd.to_datetime(val, errors="coerce")
        except (TypeError, ValueError, OverflowError):
            return None
        # errors="coerce" yields NaT for unparseable input. Returning it would
        # make the value look present and turn the midpoint below into NaT.
        return None if pd.isna(parsed) else parsed

    # --- Step 1: if CALL is empty, copy RECEPTION ---
    call_time = _to_ts(row.get(call_col, pd.NaT))
    reception_time = _to_ts(row.get(reception_col, pd.NaT))

    if call_time is None and reception_time is not None:
        row[call_col] = reception_time
        call_time = reception_time

    # --- Step 2: if FETCH is empty, fill it ---
    fetch_time = _to_ts(row.get(fetch_col, pd.NaT))
    if fetch_time is None:
        if call_time is not None and reception_time is not None:
            midpoint = call_time + (reception_time - call_time) / 2
            row[fetch_col] = midpoint.floor("min")  # round down
        elif call_time is not None:
            row[fetch_col] = call_time.floor("min")
        elif reception_time is not None:
            row[fetch_col] = reception_time.floor("min")

    return row

# Fill missing patient call/fetch times row by row.
data = data.apply(impute_patient_times, axis=1)

# Show a slice of columns by position to inspect the result.
data.iloc[:, 19:31]

### Imputing Missing Induction, Prep, and Reversal Times

For some rows, `ACTUAL_ANAESTHESIA_INDUCTION`, `ACTUAL_SURGERY_PREP_TIME`,   
`ACTUAL_PATIENT_REVERSAL_TIME`, and `ACTUAL_OR_CLEANUP_TIME` are missing.  
To fill these values in a consistent and data-driven way, we treat the **OR workflow as a timeline**.

---

#### Case A: Induction & Prep together (baseline method)
- `ACTUAL_ENTER_OR_TIME` → **0% mark**  
- `ACTUAL_KNIFE_TO_SKIN_TIME` → **100% mark**  

For rows where both induction and prep times are available, we compute their relative positions:
- **Induction mark** = (Induction − Enter OR) ÷ (Knife-to-skin − Enter OR)  
- **Prep mark** = (Prep − Enter OR) ÷ (Knife-to-skin − Enter OR)  

We then take the **average mark** across all valid rows.  
For rows with missing values:
- `ACTUAL_ANAESTHESIA_INDUCTION` is backfilled as  
  `Enter OR + (Knife-to-skin − Enter OR) × <avg induction mark>`, rounded to the nearest minute  
- `ACTUAL_SURGERY_PREP_TIME` is backfilled as  
  `Enter OR + (Knife-to-skin − Enter OR) × <avg prep mark>`, rounded to the nearest minute  

---

#### Case B: Prep missing, but induction & knife available
For rows with induction and knife-to-skin times but missing prep:  
- Compute average **prep-from-induction mark** = (Prep − Induction) ÷ (Knife − Induction)  
- Backfill missing prep as  
  `Induction + (Knife − Induction) × <avg prep-from-induction mark>`

---

#### Case C: Induction missing, but enter & prep available
For rows with enter and prep but missing induction:  
- Compute average **induction-from-enter mark** = (Induction − Enter OR) ÷ (Prep − Enter OR)  
- Backfill missing induction as  
  `Enter OR + (Prep − Enter OR) × <avg induction-from-enter mark>`

---

#### Case D: Reversal missing, but closure & exit available
For rows with closure and exit but missing reversal:  
- `ACTUAL_SKIN_CLOSURE` → **0% mark**  
- `ACTUAL_EXIT_OR_TIME` → **100% mark**  
- Compute average **reversal mark** = (Reversal − Closure) ÷ (Exit − Closure)  
- Backfill missing reversal as  
  `Closure + (Exit − Closure) × <avg reversal mark>`, rounded to the nearest minute  

---

#### Case E: Cleanup missing, but exit available
Unlike induction, prep, and reversal, cleanup is best modeled as a **fixed offset** after exit.  
- Compute average **cleanup offset** = (Cleanup − Exit) across rows with both values.  
- Backfill missing cleanup as  
  `Exit + <avg cleanup offset>`  
Rounded to the nearest minute.

---

This imputation strategy ensures that filled values preserve the natural ordering of OR events, are grounded in real observed distributions, and remain realistic within the surgical timeline.

In [None]:
def compute_marks(data):
    """Compute the average relative positions used to impute missing ACTUAL times.

    Only rows where all involved timestamps are present and in chronological
    order contribute. Keys of the returned dict (a key is omitted when no row
    can support it):
      - "induction", "prep": position of induction / prep between enter-OR (0)
        and knife-to-skin (1).
      - "prep_from_induction": position of prep between induction and knife.
      - "induction_from_enter": position of induction between enter-OR and prep.
      - "reversal": position of reversal between skin closure and exit-OR.
      - "cleanup_offset": mean cleanup-minus-exit gap in whole minutes, using
        only gaps between 0 and 12 hours.

    Parameters:
        data: DataFrame holding the ACTUAL_* timestamp columns.

    Returns:
        dict mapping mark name to its value.

    Raises:
        KeyError: if a required column is missing from ``data``.
    """
    ENTER = "ACTUAL_ENTER_OR_TIME"
    INDUCTION = "ACTUAL_ANAESTHESIA_INDUCTION"
    PREP = "ACTUAL_SURGERY_PREP_TIME"
    KNIFE = "ACTUAL_KNIFE_TO_SKIN_TIME"
    CLOSURE = "ACTUAL_SKIN_CLOSURE"
    REVERSAL = "ACTUAL_PATIENT_REVERSAL_TIME"
    EXIT = "ACTUAL_EXIT_OR_TIME"
    CLEANUP = "ACTUAL_OR_CLEANUP_TIME"

    def _complete_rows(cols):
        """Rows where every column in ``cols`` holds a timestamp."""
        # Coerce defensively: columns rebuilt by row-wise apply may be object dtype.
        frame = pd.concat(
            {c: pd.to_datetime(data[c], errors="coerce") for c in cols}, axis=1
        )
        return frame.dropna()

    def _ordered_rows(cols):
        """Complete rows whose timestamps are non-decreasing in ``cols`` order."""
        frame = _complete_rows(cols)
        in_order = None
        for earlier, later in zip(cols, cols[1:]):
            step = frame[earlier] <= frame[later]
            in_order = step if in_order is None else (in_order & step)
        return frame[in_order]

    def _mean_fraction(frame, start, event, end):
        """Mean of (event - start) / (end - start), or None if it is undefined."""
        if frame.empty:
            return None
        span = (frame[end] - frame[start]).dt.total_seconds()
        usable = span > 0
        # Zero-length intervals give 0/0. If every row is like that the mean is
        # NaN, which must not be stored: the imputer only tests `key in marks`.
        if not usable.any():
            return None
        elapsed = (frame.loc[usable, event] - frame.loc[usable, start]).dt.total_seconds()
        return (elapsed / span[usable]).mean()

    marks = {}

    # Case A: induction & prep relative to enter/knife
    pre_op = _ordered_rows([ENTER, INDUCTION, PREP, KNIFE])
    induction_mark = _mean_fraction(pre_op, ENTER, INDUCTION, KNIFE)
    prep_mark = _mean_fraction(pre_op, ENTER, PREP, KNIFE)
    if induction_mark is not None and prep_mark is not None:
        marks["induction"] = induction_mark
        marks["prep"] = prep_mark

    # Cases B, C, D: one event positioned between two neighbours
    single_event_cases = (
        ("prep_from_induction", INDUCTION, PREP, KNIFE),
        ("induction_from_enter", ENTER, INDUCTION, PREP),
        ("reversal", CLOSURE, REVERSAL, EXIT),
    )
    for key, start, event, end in single_event_cases:
        value = _mean_fraction(_ordered_rows([start, event, end]), start, event, end)
        if value is not None:
            marks[key] = value

    # Case E: cleanup as a fixed offset after exit (0 to 12 hours only)
    exit_cleanup = _complete_rows([EXIT, CLEANUP])
    gap = exit_cleanup[CLEANUP] - exit_cleanup[EXIT]
    gap = gap[(gap >= pd.Timedelta(0)) & (gap <= pd.Timedelta(hours=12))]
    if not gap.empty:
        marks["cleanup_offset"] = round(gap.dt.total_seconds().mean() / 60.0)  # minutes

    return marks


def impute_induction_prep_reversal_cleanup(row, marks):
    """Backfill missing induction, prep, reversal and cleanup timestamps for one row.

    Each missing timestamp is placed at a fixed position between two known
    neighbouring timestamps, using the average fractions stored in ``marks``
    (for cleanup: a fixed offset in minutes after OR exit). Imputed values are
    rounded to the nearest minute. A case is skipped when one of its anchor
    timestamps is missing or when ``marks`` lacks the key(s) it needs.

    Args:
        row: One record (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
            holding the eight ``ACTUAL_*`` timestamp fields read below.
        marks: Mapping that may contain ``"induction"``, ``"prep"``,
            ``"prep_from_induction"``, ``"induction_from_enter"``,
            ``"reversal"`` (fractions of the enclosing interval) and
            ``"cleanup_offset"`` (minutes after OR exit).

    Returns:
        A copy of ``row`` with the imputable gaps filled. The object passed in
        is never modified.
    """
    # pandas does not support UDFs that mutate the object handed to them by
    # apply(), so every write below goes to a private copy.
    row = row.copy()

    # Snapshot of the original values. All five cases are evaluated against
    # this snapshot, so a value imputed by one case never feeds another.
    enter, induction, prep, knife = row[["ACTUAL_ENTER_OR_TIME", "ACTUAL_ANAESTHESIA_INDUCTION", "ACTUAL_SURGERY_PREP_TIME", "ACTUAL_KNIFE_TO_SKIN_TIME"]]
    closure, reversal, exit_, cleanup = row[["ACTUAL_SKIN_CLOSURE", "ACTUAL_PATIENT_REVERSAL_TIME", "ACTUAL_EXIT_OR_TIME", "ACTUAL_OR_CLEANUP_TIME"]]

    # --- Case A: both induction & prep missing -> place both between enter and knife
    if pd.notna(enter) and pd.isna(induction) and pd.isna(prep) and pd.notna(knife):
        if "induction" in marks and "prep" in marks:
            total = knife - enter
            row["ACTUAL_ANAESTHESIA_INDUCTION"] = (enter + total * marks["induction"]).round("min")
            row["ACTUAL_SURGERY_PREP_TIME"] = (enter + total * marks["prep"]).round("min")

    # --- Case B: prep missing only -> place it between induction and knife
    if pd.notna(induction) and pd.isna(prep) and pd.notna(knife):
        if "prep_from_induction" in marks:
            total = knife - induction
            row["ACTUAL_SURGERY_PREP_TIME"] = (induction + total * marks["prep_from_induction"]).round("min")

    # --- Case C: induction missing only -> place it between enter and prep
    if pd.notna(enter) and pd.isna(induction) and pd.notna(prep):
        if "induction_from_enter" in marks:
            total = prep - enter
            row["ACTUAL_ANAESTHESIA_INDUCTION"] = (enter + total * marks["induction_from_enter"]).round("min")

    # --- Case D: reversal missing -> place it between closure and exit
    if pd.notna(closure) and pd.isna(reversal) and pd.notna(exit_):
        if "reversal" in marks:
            total = exit_ - closure
            row["ACTUAL_PATIENT_REVERSAL_TIME"] = (closure + total * marks["reversal"]).round("min")

    # --- Case E: cleanup missing -> fixed number of minutes after exit
    if pd.notna(exit_) and pd.isna(cleanup):
        if "cleanup_offset" in marks:
            row["ACTUAL_OR_CLEANUP_TIME"] = (exit_ + pd.Timedelta(minutes=marks["cleanup_offset"])).round("min")

    return row


# Step 1: derive the average position marks from the rows that are complete
marks = compute_marks(data)

# Step 2: backfill row by row; `marks` is forwarded to the imputer as its
# second positional argument
data = data.apply(impute_induction_prep_reversal_cleanup, axis=1, args=(marks,))

# Display the columns at positions 19-30 (presumably the timestamp columns - confirm)
data.iloc[:, 19:31]

### Handle Rows with no Actual Data

For the purpose of our project, actual date/time data is needed to track any delays with the planned time.
By observation, the rows with no actual data correspond to procedures marked with LOCATION == "OUT OF OT ROOMS".
These represent cases outside of operating theatres and should not be included in downstream time sequence analysis. We therefore remove them.


In [None]:
# Row count before filtering, so the number of removed rows can be reported
before = len(data)

# Flag the out-of-theatre procedures and keep everything else
is_out_of_ot = data["LOCATION"] == "OUT OF OT ROOMS"
data = data[~is_out_of_ot].copy()
after = len(data)

print(f"Removed {before - after} rows with LOCATION == 'OUT OF OT ROOMS' (kept {after}).")

### Convert Planned Columns to Datetime

To make sure all planned time columns are in a consistent `datetime64[ns]` format, we explicitly convert them using `pd.to_datetime`.

In [None]:
# Parse every planned column that exists in the frame as datetime;
# values that cannot be parsed become NaT (errors="coerce")
for col in planned_cols:
    if col not in data.columns:
        continue
    data[col] = pd.to_datetime(data[col], errors="coerce")

### Handle admission related columns
Some of these surgeries may be day surgeries or may have come from the A&E, and therefore have no admission data. We replace these blanks with "Not Admitted".

In [None]:
# Admission-related columns whose blanks are replaced by a placeholder
admission_cols = [
    "ADMISSION_STATUS",
    "ADMISSION_CLASS_TYPE",
    "ADMISSION_TYPE",
    "ADMISSION_WARD",
    "ADMISSION_BED",
]
for admission_col in admission_cols:
    data[admission_col] = data[admission_col].fillna("Not Admitted")

### Fill in missing staff data
Some surgeries are missing surgeon, anaesthetist, or diagnosis data. Since a surgery is unlikely to proceed without them, we treat these blanks as unrecorded rather than absent, and fill the staff columns with "Unknown" and the diagnosis with "Not Recorded".

In [None]:
# Staff columns get "Unknown"; the diagnosis gets its own placeholder
clinician_cols = ["SURGEON", "ANAESTHETIST_TEAM", "ANAESTHETIST_MCR_NO"]
for clinician_col in clinician_cols:
    data[clinician_col] = data[clinician_col].fillna("Unknown")

data["DIAGNOSIS"] = data["DIAGNOSIS"].fillna("Not Recorded")
data

### Drop remaining missing rows
After filling in the missing values that we are able to fill, some rows still contain missing data. We drop these rows, as they make up a very small portion of our overall data.

In [None]:
# Drop, in place, every row that still has a missing value in ANY column
# (dropna defaults to how="any" across all columns).
# NOTE(review): this also removes rows whose only gap is in a free-text or
# optional column - confirm that is intended and check how many rows are lost.
data.dropna(inplace=True)

### View current state of dataframe
Currently, the dataset no longer contains any missing data, and thus we are able to proceed with the next steps.

In [None]:
data.info()

## Handle Duplicate Data
Important to remove to prevent bias in our AI/ML solution

### Check for duplicate rows
This is to see if our dataset contains any rows that are completely identical. This means that the same surgery has been accidentally logged twice. We want to avoid having this in our dataset as it would cause our analysis in the future to skew.

In [None]:
data.duplicated().sum()

### Drop duplicate rows
We identified 3 duplicate rows, and hence we will want to drop them. 

In [None]:
data = data.drop_duplicates()

In [None]:
data.info()

## Deep cleaning each column
Looking into each individual column to clean up most of the free-text portions. Please add more cleaning as we go, as there is quite a lot to sieve through and I don't think I caught it all.

In [None]:
data.info()

### Inspect Location
No problems.

#### Unique values analysis

In [None]:
inspect_column(data, "LOCATION", top_n=30)

### Inspect Room
No problems.

#### Unique values analysis

In [None]:
inspect_column(data, "ROOM", top_n=30)

### Inspect case status
No problems.

#### Unique values analysis

In [None]:
inspect_column(data, "CASE_STATUS", top_n=30)

### Inspect OPERATION_TYPE
No problems.

#### UVA

In [None]:
inspect_column(data, "OPERATION_TYPE", top_n=30)

### Inspect Emergency Priority
No problems.

#### UVA

In [None]:
inspect_column(data, "EMERGENCY_PRIORITY", top_n=30)

### Inspect Patient Code
No problems.

#### UVA

In [None]:
inspect_column(data, "PATIENT_CODE", top_n=30)

### Inspect Nature
Removed this column entirely, and created a legend(can be found below) to map SURGICAL_CODE to NATURE, as they are the same thing.

#### UVA

In [None]:
inspect_column(data, "NATURE", top_n=30)

In [None]:
data, nature_legend = build_operation_legend_and_drop_nature(
    data,
    code_col="SURGICAL_CODE",
    nature_col="NATURE",
    drop_nature=True,           
    keep_title_case_copy=True    
)

In [None]:
print(data.columns)
# errors="ignore" keeps this cell re-runnable: on a second run the column has
# already been removed and a plain drop would raise KeyError.
nature_legend.drop(columns="operation_name_title", inplace=True, errors="ignore")
nature_legend

In [None]:
data.info()

### Inspect Surgical Code
Extension of NATURE.

#### UVA

In [None]:
inspect_column(data, "SURGICAL_CODE", top_n=30)

### Inspect discipline
No problems.

#### UVA

In [None]:
inspect_column(data, "DISCIPLINE", top_n=30)

### Inspect Surgeon
No problems.

#### UVA

In [None]:
inspect_column(data, "SURGEON", top_n=30)

### Inspect ANAESTHETIST_TEAM
No problems.

#### UVA

In [None]:
inspect_column(data, "ANAESTHETIST_TEAM", top_n=30)

### INSPECT ANAESTHETIST_MCR_NO
No problems.

#### UVA

In [None]:
inspect_column(data, "ANAESTHETIST_MCR_NO", top_n=30)

### INSPECT ANESTHESIA
No problems.

#### UVA

In [None]:
inspect_column(data, "ANESTHESIA", top_n=30)

### Inspect EQUIPMENT
Removed #NUH_ or #NUH from all entries, as well as alphabetically ordered the equipment such that even if they were in different orders, they would appear under the same unique value.

#### UVA

In [None]:
inspect_column(data, "EQUIPMENT", top_n=30)

In [None]:
# No equipment synonyms are defined yet; an empty map is passed through
synonyms = {}

# Options handed to clean_equipment, gathered in one place
equipment_options = {
    "col": "EQUIPMENT",
    "tags_to_strip": (r"#nuh",),
    "unknown_vals": ("0","na","n/a","-","null","nan",""),
    "synonym_map": synonyms,
}
data = clean_equipment(data, **equipment_options)

# Inspect results
inspect_column(data, "EQUIPMENT", top_n=30)

### Inspect ADMISSION_STATUS
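Mostly clean. Rows whose status is one of the numeric junk values `1518656227` or `1518637975` are removed below.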

#### UVA

In [None]:
inspect_column(data, "ADMISSION_STATUS", top_n=30)

In [None]:
# Numeric junk that appears in place of a real admission status.
# Compare on the string form so the rows are removed whether Excel delivered
# these values as text or as numbers (a plain != "..." never matches an int).
invalid_admission_status = ["1518656227", "1518637975"]
data = data[~data["ADMISSION_STATUS"].astype(str).isin(invalid_admission_status)]

### Inspect ADMISSION_CLASS_TYPE
No problems.

#### UVA

In [None]:
inspect_column(data, "ADMISSION_CLASS_TYPE", top_n=30)

### Inspect ADMISSION_TYPE
No problems.

#### UVA

In [None]:
inspect_column(data, "ADMISSION_TYPE", top_n=30)

### Inspect ADMISSION_WARD
No problems.

#### UVA

In [None]:
inspect_column(data, "ADMISSION_WARD", top_n=30)

### Inspect ADMISSION_BED
No problems.

#### UVA

In [None]:
inspect_column(data, "ADMISSION_BED", top_n=30)

### Inspect AOH
Fixed True False.

#### UVA

In [None]:
inspect_column(data, "AOH", top_n=30)

In [None]:
data = normalize_text(data, "AOH", unknown_vals=["0", "na", "n/a", "-", "null", "nan"])
inspect_column(data, "AOH", top_n=30)

### Inspect BLOOD
No problems.

#### UVA

In [None]:
inspect_column(data, "BLOOD", top_n=30)

### Inspect IMPLANT
Remove the word 'yes' and quantity markers such as 'x1', collapse repeated whitespace, and strip leading/trailing whitespace and symbols.

#### UVA

In [None]:
inspect_column(data, "IMPLANT", top_n=30)

In [None]:
data = normalize_text(data, "IMPLANT", unknown_vals=["0", "na", "n/a", "-", "null", "nan", "", "nil", "nil."])
# NOTE(review): astype(str) turns any missing value into the literal text
# "nan" - confirm what normalize_text leaves behind for unknown_vals.
data["IMPLANT"] = (
    data["IMPLANT"]
    .astype(str)
    # Drop quantity markers ("x1", "x2", ...) and the filler word "yes"
    .str.replace(r"\bx\d+\b", "", regex=True)
    .str.replace(r"\byes\b", "", regex=True)
    # Pad the replacement so "plate&screw" becomes "plate and screw",
    # not "plateandscrew"; the extra spaces are collapsed just below
    .str.replace("&", " and ", regex=False)
    .str.replace(r"\s+", " ", regex=True)
    # Strip symbols LAST: removing the tokens above can expose new
    # leading/trailing punctuation (e.g. "plate, yes" -> "plate, ")
    .str.strip(" ;,.-")
)
inspect_column(data, "IMPLANT", top_n=30)

### Inspect DIAGNOSIS
just normal standardisation

#### UVA

In [None]:
inspect_column(data, "DIAGNOSIS", top_n=30)

In [None]:
data = normalize_text(data, "DIAGNOSIS", unknown_vals=["0", "na", "n/a", "-", "null", "nan", "", "nil"])
inspect_column(data, "DIAGNOSIS", top_n=100)

### Inspect CANCER_INDICATOR
just normal standardisation

#### UVA

In [None]:
inspect_column(data, "CANCER_INDICATOR", top_n=30)

In [None]:
data = normalize_text(data, "CANCER_INDICATOR", unknown_vals=["0", "na", "n/a", "-", "null", "nan"])
data = data[data["CANCER_INDICATOR"].isin(["false", "true"])]
inspect_column(data, "CANCER_INDICATOR", top_n=30)

### Inspect TRAUMA_INDICATOR
just normal standardisation

#### UVA

In [None]:
inspect_column(data, "TRAUMA_INDICATOR", top_n=30)

In [None]:
data = normalize_text(data, "TRAUMA_INDICATOR", unknown_vals=["0", "na", "n/a", "-", "null", "nan"])
inspect_column(data, "TRAUMA_INDICATOR", top_n=30)


### Inspect Delay_Reason
No problems.

#### UVA

In [None]:
inspect_column(data, "Delay_Reason", top_n=30)

### Inspect Remarks
No problems.

#### UVA

In [None]:
inspect_column(data, "Remarks", top_n=30)

Save the output of the cleaning steps above into a separate file (change the file name if required).

In [None]:
output_path = "oots-data-cleaning-1.xlsx"
data.to_excel(output_path, index=False)

print(f"Saved to {output_path}")

## EMMAS PART

Find the highest frequency of words, bigrams, and trigrams to be used in taxonomy for categorisation

In [None]:
# Fail early, naming the column exactly as it is spelt in the data
if "Delay_Reason" not in data.columns:
    raise KeyError(f"'Delay_Reason' not found. Available columns: {list(data.columns)}")

data.head()

Normalize text in Delay_Reason (remove punctuation, standardise case, remove trailing spaces)

In [None]:
# Translation table that deletes every ASCII punctuation character
_punct_tbl = str.maketrans("", "", string.punctuation)


# NOTE(review): this rebinds the name `normalize_text`, replacing the
# DataFrame helper normalize_text(df, col, ...) defined earlier in the
# notebook. Cells that call the DataFrame version must not be re-run after
# this cell; consider renaming one of the two.
def normalize_text(s: str) -> str:
    """Return a lower-cased, punctuation-free, abbreviation-expanded copy of *s*.

    Steps: cast to str and lower-case; expand abbreviations that are written
    with punctuation; delete all punctuation; collapse whitespace; expand the
    remaining plain abbreviations.
    """
    s = str(s).lower()

    # These abbreviations contain punctuation, so they must be expanded BEFORE
    # punctuation is removed; afterwards "o.r" has already become "or" and
    # "pre-med" has become "premed", and the patterns can never match.
    s = re.sub(r"\bo\.t\b", "operating theater", s)
    s = re.sub(r"\bo\.r\b", "operating room", s)
    s = re.sub(r"\bpre-?med\b", "premedication", s)

    s = s.translate(_punct_tbl)
    s = re.sub(r"\s+", " ", s).strip()

    # Plain-word abbreviations
    s = re.sub(r"\bot\b", "operating theater", s)
    s = re.sub(r"\banaesth\b", "anaesthesia", s)
    s = re.sub(r"\banesth\b", "anaesthesia", s)
    s = re.sub(r"\bpt\b", "patient", s)
    s = re.sub(r"\bprev\b", "previous", s)
    s = re.sub(r"\bdr\b", "doctor", s)

    return s

# Apply normalization.
# Fill missing values BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", after which fillna("") has nothing left to fill.
data["_Delay_norm"] = data["Delay_Reason"].fillna("").astype(str).map(normalize_text)
data[["_Delay_norm"]].head(10)

In [None]:
# Filler words excluded from the frequency analysis
STOPWORDS = {
    "the","a","an","and","or","to","of","for","by","with","from",
    "is","are","was","were","be","been","being","due","because",
    "this","that","it","as","into","per","via", "eg", "etc"
}

# Accumulators for single words, adjacent pairs and adjacent triples
words, bigrams, trigrams = [], [], []

# Tokenize each normalized delay reason and collect its n-grams
for text in data["_Delay_norm"]:
    tokens = [t for t in text.split() if t not in STOPWORDS]
    if not tokens:
        continue

    words += tokens
    # zip() over shifted views yields each run of consecutive tokens, and
    # nothing when the reason is too short for that n-gram size
    bigrams += [" ".join(pair) for pair in zip(tokens, tokens[1:])]
    trigrams += [" ".join(triple) for triple in zip(tokens, tokens[1:], tokens[2:])]

In [None]:
# NOTE(review): these two paths are not used anywhere in this cell - confirm
# where (or whether) the flagged data is meant to be saved.
OUTPUT_FILE = "oots-data-cleaning-3-flagged.xlsx"
OUTPUT_CSV  = "oots-data-cleaning-3-flagged.csv"

COL = "Delay_Reason"
s = data[COL].astype(str)

# Lower-cased, punctuation-free, whitespace-collapsed copy used for phrase matching
clean = (
    s.str.lower()
     .str.replace(r"[^\w\s]", "", regex=True)
     .str.replace(r"\s+", " ", regex=True)
     .str.strip()
)

# Reasons that are non-empty but contain no letters (only digits/punctuation)
raw = s.str.strip()
only_punct_or_numbers = raw.str.match(r'^(?=.*\S)(?!.*[A-Za-z]).*$', na=False)

# Phrases that mean "there was no delay"
not_late_phrases = [
    "no delay", "not delay", "not delayed", "not late",
    "na", "0", "null", "nan"
]

def phrase_to_token_pattern(p: str) -> str:
    """Return a regex matching phrase *p* as whole tokens, with flexible inner whitespace."""
    p = p.strip().lower()
    esc = re.escape(p).replace(r"\ ", r"\s+")
    # Lookarounds stop e.g. "na" from matching inside a longer word
    return rf"(?<!\w){esc}(?!\w)"

pattern = r"(?:{})".format("|".join(phrase_to_token_pattern(p) for p in not_late_phrases))
regex = re.compile(pattern, flags=re.IGNORECASE)

phrase_hit = clean.str.contains(regex, na=False)

# 0 = the reason says there was no delay (or is not text at all), 1 = otherwise
data["Reason_Is_Late"] = np.where(only_punct_or_numbers | phrase_hit, 0, 1)

# The free-text column is replaced by the flag. The earlier writes of the
# cleaned text / "0" into this column were removed: they were overwritten by
# this drop and had no effect on the result.
# NOTE(review): the helper column "_Delay_norm" is not dropped in the visible
# code - confirm whether it should remain in the dataset.
data.drop(columns=COL, inplace=True)


## Adding target variables

In [None]:
# Each duration is (end event - start event):
#   SURGERY: knife to skin -> skin closure
#   USAGE:   enter OR      -> exit OR
duration_spans = {
    "SURGERY": ("KNIFE_TO_SKIN_TIME", "SKIN_CLOSURE"),
    "USAGE": ("ENTER_OR_TIME", "EXIT_OR_TIME"),
}

for span_label, (start_col, end_col) in duration_spans.items():
    # Actual and planned duration for this span
    for phase in ("ACTUAL", "PLANNED"):
        data[f"{phase}_{span_label}_DURATION"] = data[f"{phase}_{end_col}"] - data[f"{phase}_{start_col}"]
    # Overrun (positive) or underrun (negative) versus the plan
    data[f"DIFF_{span_label}_DURATION"] = (
        data[f"ACTUAL_{span_label}_DURATION"] - data[f"PLANNED_{span_label}_DURATION"]
    )



converting new target variables to minutes

In [None]:
def to_min(s):
    """Convert a timedelta Series to minutes (float)."""
    return s.dt.total_seconds() / 60


# The six duration targets created above, converted from timedelta to minutes
duration_cols = [
    "ACTUAL_SURGERY_DURATION",
    "PLANNED_SURGERY_DURATION",
    "DIFF_SURGERY_DURATION",
    "ACTUAL_USAGE_DURATION",
    "PLANNED_USAGE_DURATION",
    "DIFF_USAGE_DURATION",
]
for col in duration_cols:
    data[col] = to_min(data[col])

data.head()

In [None]:
data.info()