# BT4103 Data Cleaning
Please read through and let me know if there are any issues with regard to the cleaning.

## Import packages and datasets
I have imported all the packages that I used up here for ease of reference. Please add your own filepath so that you can import the data correctly.

In [85]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
import unicodedata
import datetime
import numpy as np
import string
import difflib


import warnings
warnings.filterwarnings('ignore')

### Importing dataset
Add filepath below

In [86]:
data = pd.read_excel("oots-cleaned2.xlsx")
data

validation_data = pd.read_excel("oots-cleaned-unlocked.xlsx")

## Define Helper Functions
Here are the helper functions that I have created for easier readability in the actual code below.

In [87]:
def OHE(df, col, drop_first=False): #One Hot Encode a column in a df
    dummies = pd.get_dummies(df[col], prefix=col, drop_first=drop_first)
    df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
    legend = {new_col: category 
              for new_col, category in zip(dummies.columns, dummies.columns.str.replace(f"{col}_", "", regex=False))}
    return df, legend


def LabelEncode(df, col): #Label Encode a column in a df
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    legend = dict(zip(le.classes_, le.transform(le.classes_)))
    return df, legend

def inspect_column(df, col, top_n=20): #Get Unique Value information on column in df
    print(f"Column: {col}")
    print(f"Unique values: {df[col].nunique(dropna=False)}")
    print("\nTop value counts:")
    print(df[col].value_counts(dropna=False).head(top_n))

def apply_mapping(df, col, mapping, new_col_suffix="_clean"): #Apply new mapping to column in df
    new_col = col + new_col_suffix
    df[new_col] = df[col].replace(mapping)
    return df

def normalize_text(df, col, to_lower=True, replace_symbols=True, unknown_vals=None): #Normalise text of a column in a df
    s = df[col].astype(str).str.strip()
    if to_lower:
        s = s.str.lower()
    s = s.str.replace(r"\s+", " ", regex=True)
    if replace_symbols:
        s = s.str.replace(r"[-_/]", " ", regex=True)
    if unknown_vals:
        s = s.replace(unknown_vals, "0")
    df[col] = s
    return df

def _normalize_text_series(s: pd.Series) -> pd.Series: #Normalise text in series
    s = s.astype(str).map(lambda x: unicodedata.normalize("NFKC", x))
    s = s.str.strip()
    s = s.str.lower()
    s = s.str.replace(r"\s+", " ", regex=True)         
    s = s.str.replace(r"\s*/\s*", " / ", regex=True)   
    s = s.str.replace(r"\s*,\s*", ", ", regex=True)    
    s = s.str.strip(" ,")                              
    return s

def _remove_trailing_code_in_parens(name_s: pd.Series, code_s: pd.Series) -> pd.Series: #remove white spaces in ()
    code_up = code_s.astype(str).str.strip().str.upper()
    pattern = r"\(\s*{}\s*\)\s*$"
    out = name_s.copy()
    mask = code_up.notna() & code_up.ne("")
    out.loc[mask] = [
        re.sub(pattern.format(re.escape(c)), "", n, flags=re.IGNORECASE)
        for n, c in zip(out.loc[mask].tolist(), code_up.loc[mask].tolist())
    ]
    return out.str.strip(" ,")

def _choose_canonical_name(name_series: pd.Series) -> str: #Chooses best name
    s = name_series.dropna().astype(str)
    s = s[s.str.strip().ne("").values]
    s = s[s.str.strip().ne("unknown").values]
    if s.empty:
        return "0"
    vc = s.value_counts()
    top_freq = vc.iloc[0]
    candidates = vc[vc.eq(top_freq)].index.tolist()
    return max(candidates, key=len)

def build_operation_legend_and_drop_nature( #Nature cleaning
    df: pd.DataFrame,
    code_col: str = "OPERATION_CODE",
    nature_col: str = "NATURE",
    drop_nature: bool = True,
    keep_title_case_copy: bool = False,
    unknown_tokens = ("0", "na", "n/a", "-", "null", "nan")
):
    if code_col not in df.columns or nature_col not in df.columns:
        raise KeyError(f"Expected columns '{code_col}' and '{nature_col}' in df.")
    work = pd.DataFrame({
        "operation_code": df[code_col].astype(str).str.strip().str.upper(),
        "operation_name_raw": df[nature_col]
    })
    name_norm = _normalize_text_series(work["operation_name_raw"])
    name_norm = name_norm.replace(list(unknown_tokens), "unknown")
    name_clean = _remove_trailing_code_in_parens(name_norm, work["operation_code"])
    tmp = pd.DataFrame({"operation_code": work["operation_code"], "operation_name": name_clean})
    tmp = tmp[tmp["operation_code"].str.len() > 0]
    legend = (
        tmp.groupby("operation_code", as_index=False)["operation_name"]
           .apply(_choose_canonical_name)
           .rename(columns={"operation_name": "operation_name"})
    )
    if keep_title_case_copy:
        legend["operation_name_title"] = legend["operation_name"].str.title()
    df_out = df.copy()
    if drop_nature:
        df_out.drop(columns=[nature_col], inplace=True, errors="ignore")
    return df_out, legend

  
def clean_equipment( #EQUIPMENT cleaning
    df,
    col="EQUIPMENT",
    sep=";",
    tags_to_strip=(r"#nuh",),            
    unknown_vals=("0","na","n/a","-","null","nan",""),
    synonym_map=None         
):
    if synonym_map is None:
        synonym_map = {}
    pattern = r"|".join(fr"{re.escape(tag)}[_-]?" for tag in tags_to_strip)
    df[col] = (
        df[col]
        .astype(str)
        .str.replace(pattern, "", regex=True, flags=re.IGNORECASE)
    )
    df = normalize_text(df, col, to_lower=True, replace_symbols=True, unknown_vals=unknown_vals)
    def _clean_token(tok: str) -> str:
        t = tok.strip()
        if not t: return ""
        t = re.sub(r"\s+", " ", t)
        t = synonym_map.get(t, t)
        if t in ("unknown",): return ""
        return t
    def _process_cell(cell: str) -> str:
        parts = re.split(rf"\s*{re.escape(sep)}\s*", cell) if cell else []
        cleaned = [_clean_token(p) for p in parts]
        cleaned = [c for c in cleaned if c]
        if not cleaned:
            return "unknown"
        cleaned = sorted(set(cleaned))
        return f"{sep} ".join(cleaned)
    df[col] = df[col].apply(_process_cell)
    return df

## General Cleaning
We are dropping index and case number as they are labels, and dropping patient name because it has been completely removed.

In [88]:
data = data.drop(data.columns[0], axis=1) # drop INDEX
data = data.drop(columns="PATIENT_NAME")
data = data.drop(columns="CASE_NUMBER")
data = data.drop(columns="BOOKING_DATE")
data = data.drop(columns="PATIENT_CODE_OLD")
data.head()

Unnamed: 0,OPERATION_ID,LOCATION,ROOM,CASE_STATUS,OPERATION_TYPE,EMERGENCY_PRIORITY,PLANNED_PATIENT_CALL_TIME,PLANNED_PATIENT_FETCH_TIME,PLANNED_RECEPTION_IN_TIME,PLANNED_ENTER_OR_TIME,...,ADMISSION_WARD,ADMISSION_BED,AOH,BLOOD,IMPLANT,DIAGNOSIS,CANCER_INDICATOR,TRAUMA_INDICATOR,Delay_Reason,Remarks
0,588456.0,Main Building OT,MBOR11,Final,Elective,,09:50:00,2019-04-11 09:50:00,09:50:00,09:50:00,...,NW6A,N06A036,False,NIL,required microscope,Right Breast CA,False,False,Surgeon (e.g. Surgeon not available & etc.),
1,590736.0,Main Building OT,MBOR05,Final,Elective,,10:40:00,2019-04-11 10:40:00,10:40:00,10:40:00,...,NWASW,NASWA11,False,NIL,,early pregnancy failure,False,False,,
2,591995.0,ICL,RoomC,Actualised,Elective,,10:55:00,2019-04-11 10:55:00,10:55:00,10:55:00,...,NW7B,N07B005,False,NIL,,,False,False,,
3,590451.0,Main Building OT,MBOR04,Final,Elective,,10:50:00,2019-04-11 10:50:00,10:50:00,10:50:00,...,NW41,N041004,False,NIL,Need Eustachian tube ballon,EUSTACHIAN TUBE DISORDER,False,False,,
4,573666.0,Medical Center OT,MCOR03,Final,Elective,,10:30:00,2019-04-11 10:30:00,10:30:00,10:30:00,...,NW2A,N02A025,False,NIL,,gall bladder stone,False,False,first case havent finished,first case havent finished


## Ensure correct data types
yet to do, still waiting on update from joey regarding the dates

In [89]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276863 entries, 0 to 276862
Data columns (total 52 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   OPERATION_ID                   276856 non-null  float64       
 1   LOCATION                       276856 non-null  object        
 2   ROOM                           276854 non-null  object        
 3   CASE_STATUS                    276856 non-null  object        
 4   OPERATION_TYPE                 276856 non-null  object        
 5   EMERGENCY_PRIORITY             42137 non-null   object        
 6   PLANNED_PATIENT_CALL_TIME      276856 non-null  object        
 7   PLANNED_PATIENT_FETCH_TIME     208445 non-null  datetime64[ns]
 8   PLANNED_RECEPTION_IN_TIME      276856 non-null  object        
 9   PLANNED_ENTER_OR_TIME          276856 non-null  object        
 10  PLANNED_ANAESTHESIA_INDUCTION  231120 non-null  datetime64[ns]
 11  

## Handle Missing Data
Handling missing data is important to ensure that we do not run into any issues with the EDA, as well as our AI/ML implementations. It is a vital step in data cleaning to ensure that the dataset can be used efficiently and properly.

### Handling optional columns
These columns potentially will be blank as there is nothing to write for some operations, hence we will fill those with 0.

In [90]:
data["Delay_Reason"].fillna(0, inplace=True)
data["Remarks"].fillna(0, inplace=True)
data["IMPLANT"].fillna(0, inplace=True)
data["EQUIPMENT"].fillna(0, inplace=True)
data["EMERGENCY_PRIORITY"].fillna(0, inplace=True)

### Date Handling and Imputation Rules

This section handles messy date/time data by enforcing **consistent start dates** and applying **domain-specific sync rules**.

---

#### 1. Detecting Date vs. Time-only Strings
- `_looks_like_date_string(s)` → checks if a string contains a date-like pattern (`YYYY-MM-DD`, `DD/MM/YYYY`, etc.).  
- `_is_time_only_string(s)` → checks if a string looks like a time-only entry (`08:15`, `8:15:00 AM`, etc.).

This distinction allows us to avoid misinterpreting time-only values as full datetimes.

---

#### 2. Determining Constant Start Dates
- `find_start_date_from_row(row, cols)` scans a list of columns in a row and finds the **first valid date**.  
  - A valid date is a `Timestamp` with `year > 1900` or a parseable date string.  
  - Returns the **normalized date** (time set to 00:00:00).  

- `attach_constant_dates(row, planned_cols, actual_cols)`:
  - Finds one **planned_start** and one **actual_start** per row.
  - If `actual_start` is missing but `planned_start` exists, use the planned date as fallback.
  - For any time-only strings, attach the corresponding start date to construct a full `Timestamp`.
  - Full datetime values are preserved as-is.

---

#### 2b. Date Sanitisation 
We discovered that some of the input data, while they had dates, had **corrupted or incorrectly manipulated dates**.  
To ensure all downstream imputations are built on reliable timelines, we enforce a **sanity check**:

1. **Valid date range**  
   - Earliest allowed date: **2016-12-31**  
   - Latest allowed date: **2022-02-25**

2. **Correction procedure**  
   - For each row, check if `planned_start` and `actual_start` fall within the valid range.  
   - If either date is outside this range, attempt to backfill the correct value from  
     `oots-cleaned-unlocked.xlsx` using `OPERATION_ID`.  
   - If no match is found in the validation file, replace the invalid date with `NaT`.

3. **Guarantees after cleaning**  
   - Every `planned_start` and `actual_start` is either:
     - Within the valid range, or  
     - Backfilled from the validation dataset, or  
     - Explicitly marked as `NaT` if no trusted source is available.  

This step ensures that **all subsequent imputations** (e.g., filling missing times)  
operate only on dates within the trusted window.

---

#### 3. Row-wise Imputation Rules
- `impute_with_rules(row, planned_cols, actual_cols)`:
  1. **Attach constant start dates** using `attach_constant_dates`.
  2. **Sync critical columns**:
     - `PLANNED_PATIENT_CALL_TIME` ↔ `PLANNED_PATIENT_FETCH_TIME`  
     - `PLANNED_OR_CLEANUP_TIME` ↔ `PLANNED_EXIT_OR_TIME`  
     Preference is given to whichever value exists.
  3. **Enforce ordering constraints**:
     - Knife-to-skin ≤ Skin closure ≤ Patient reversal ≤ Exit OR ≤ Exit recovery ≤ OR cleanup
     - If any step goes backwards, adjust forward.
  4. **Fill missing anchor values**:
     - If missing, `PLANNED_ANAESTHESIA_INDUCTION` and `PLANNED_SURGERY_PREP_TIME` are set to `PLANNED_KNIFE_TO_SKIN_TIME`.
     - If missing, `PLANNED_PATIENT_REVERSAL_TIME` is set to `PLANNED_SKIN_CLOSURE`.

In [91]:
planned_cols = [
        "PLANNED_PATIENT_CALL_TIME",
        "PLANNED_PATIENT_FETCH_TIME",
        "PLANNED_RECEPTION_IN_TIME",
        "PLANNED_ENTER_OR_TIME",
        "PLANNED_ANAESTHESIA_INDUCTION",
        "PLANNED_SURGERY_PREP_TIME",
        "PLANNED_KNIFE_TO_SKIN_TIME",
        "PLANNED_SKIN_CLOSURE",
        "PLANNED_PATIENT_REVERSAL_TIME",
        "PLANNED_EXIT_OR_TIME",
        "PLANNED_OR_CLEANUP_TIME",
        "PLANNED_EXIT_RECOVERY_TIME",        
    ]

actual_cols = [
        "PATIENT_CALL_TIME",
        "PATIENT_FETCH_TIME",
        "ACTUAL_RECEPTION_IN_TIME",
        "ACTUAL_ENTER_OR_TIME",
        "ACTUAL_ANAESTHESIA_INDUCTION",        
        "ACTUAL_SURGERY_PREP_TIME",
        "ACTUAL_KNIFE_TO_SKIN_TIME",
        "ACTUAL_SKIN_CLOSURE",
        "ACTUAL_PATIENT_REVERSAL_TIME",
        "ACTUAL_EXIT_OR_TIME",
        "ACTUAL_OR_CLEANUP_TIME",
        "ACTUAL_EXIT_RECOVERY_TIME",        
    ]

MIN_DATE = pd.Timestamp("2016-12-31")
MAX_DATE = pd.Timestamp("2022-02-25")

warnings = []

_time_only_re = re.compile(r'^\s*\d{1,2}:\d{2}(:\d{2})?\s*(?:[AaPp][Mm])?\s*$')
_date_like_re = re.compile(r'\d{4}[-/]\d{1,2}[-/]\d{1,2}|\d{1,2}[-/]\d{1,2}[-/]\d{2,4}')

def _looks_like_date_string(s: str) -> bool:
    """Rudimentary check whether a string contains an explicit date."""
    if not isinstance(s, str):
        return False
    s = s.strip()
    return bool(_date_like_re.search(s))

def _is_time_only_string(s: str) -> bool:
    """True if string appears to contain time only (e.g. '08:15' or '8:15:00 AM')."""
    if not isinstance(s, str):
        return False
    return bool(_time_only_re.match(s.strip()))

def find_start_date_from_row(row, cols):
    """
    Scan cols in order and return the first discovered 'date' (normalized).
    We consider a value to contain a date if:
      - it's a pandas Timestamp / datetime with a year > 1900
      - or the original string contains a date-like pattern and parses to a Timestamp with sensible year
    """
    for col in cols:
        if col not in row.index:
            continue
        val = row[col]
        if pd.isna(val):
            continue

        # Already a Timestamp / datetime
        if isinstance(val, pd.Timestamp):
            if val.year > 1900:  # treat as containing a real date
                return val.normalize()
            else:
                # likely a parsed time-only; skip
                continue
        if isinstance(val, datetime.datetime):
            if val.year > 1900:
                return pd.Timestamp(val).normalize()

        # If it's a string, check if it looks like a date first
        try:
            s = str(val).strip()
        except Exception:
            continue

        if _looks_like_date_string(s):
            parsed = pd.to_datetime(s, errors="coerce", dayfirst=False)
            if not pd.isna(parsed) and parsed.year > 1900:
                return parsed.normalize()

        # If it wasn't date-like, skip (likely time-only)
    return None


def attach_constant_dates(row, planned_cols, actual_cols):
    """
    For a given row:
     - find planned_start = first planned col that contains a date
     - find actual_start  = first actual col that contains a date
     - verify both are within MIN_DATE .. MAX_DATE; if not, try to retrieve from validation_date by OPERATION_ID
     - for each planned col: if its value is time-only, attach planned_start
     - for each actual col: if its value is time-only, attach actual_start
    Does NOT override full datetimes.
    """
    # Find constants (scan entire list until first real-date found)
    planned_start = find_start_date_from_row(row, planned_cols)
    actual_start = find_start_date_from_row(row, actual_cols)

    # --- SANITY: if either start is missing or out of acceptable range, try to pull from validation sheet ---
    def _in_range(ts):
        return isinstance(ts, pd.Timestamp) and (MIN_DATE <= ts <= MAX_DATE)

    # helper to attempt retrieval from validation_data using OPERATION_ID
    def _try_validation_lookup(opid, cols):
        if opid is None:
            return None
        # attempt direct match first; fall back to string match if needed
        matches = pd.DataFrame()
        try:
            matches = validation_data.loc[validation_data["OPERATION_ID"] == opid]
        except Exception:
            try:
                matches = validation_data.loc[validation_data["OPERATION_ID"].astype(str) == str(opid)]
            except Exception:
                matches = pd.DataFrame()

        if matches is not None and not matches.empty:
            val_row = matches.iloc[0]
            return find_start_date_from_row(val_row, cols)
        return None

    opid = row.get("OPERATION_ID", None)

    # planned_start: if missing or out-of-range, attempt fallback from validation sheet
    if not _in_range(planned_start):
        alt_planned = _try_validation_lookup(opid, planned_cols)
        if _in_range(alt_planned):
            planned_start = alt_planned
            warnings.append(f"OPERATION_ID={opid}: planned_start replaced from validation file ({planned_start.date()}).")
        else:
            warnings.append(f"OPERATION_ID={opid}: planned_start {planned_start} out of range or missing; no valid replacement found in validation file.")
            planned_start = None

    # actual_start: if missing or out-of-range, attempt fallback from validation sheet
    if not _in_range(actual_start):
        alt_actual = _try_validation_lookup(opid, actual_cols)
        if _in_range(alt_actual):
            actual_start = alt_actual
            warnings.append(f"OPERATION_ID={opid}: actual_start replaced from validation file ({actual_start.date()}).")
        else:
            # if no actual start in validation, we will keep None for now and allow later fallback to planned_start
            warnings.append(f"OPERATION_ID={opid}: actual_start {actual_start} out of range or missing; no valid replacement found in validation file.")
            actual_start = None

    # --- NEW: fallback (if actual still missing, use planned_start) ---
    if actual_start is None and planned_start is not None:
        actual_start = planned_start

    # Helper to combine time-only string with a start date
    def _combine_time_with_date(s, base_date):
        # parse the time string into a datetime (may get today's date, we only use .time())
        parsed = pd.to_datetime(s, errors="coerce")
        if pd.isna(parsed) or base_date is None:
            return None
        return pd.Timestamp.combine(base_date, parsed.time())

    # Fill planned cols
    for col in planned_cols:
        if col not in row.index:
            continue
        val = row[col]
        if pd.isna(val):
            continue

        # if it's already a Timestamp or datetime
        if isinstance(val, (pd.Timestamp, datetime.datetime)):
            ts = pd.Timestamp(val)
            if MIN_DATE <= ts <= MAX_DATE:
                row[col] = ts
                continue
            else:
                # invalid → try to replace with same time on planned_start
                if planned_start is not None:
                    row[col] = pd.Timestamp.combine(planned_start, ts.time())
                else:
                    row[col] = pd.NaT
            continue

        # otherwise, treat it as string
        s = str(val).strip()
        if _is_time_only_string(s):
            if planned_start is not None:
                combined = _combine_time_with_date(s, planned_start)
                if combined is not None:
                    row[col] = combined
        else:
            parsed = pd.to_datetime(s, errors="coerce", dayfirst=False)
            if not pd.isna(parsed):
                if MIN_DATE <= parsed <= MAX_DATE:
                    row[col] = parsed
                elif planned_start is not None:
                    row[col] = pd.Timestamp.combine(planned_start, parsed.time())
                else:
                    row[col] = pd.NaT

    # Fill actual cols (same logic)
    for col in actual_cols:
        if col not in row.index:
            continue
        val = row[col]
        if pd.isna(val):
            continue

        if isinstance(val, (pd.Timestamp, datetime.datetime)):
            ts = pd.Timestamp(val)
            if MIN_DATE <= ts <= MAX_DATE:
                row[col] = ts
                continue
            else:
                if actual_start is not None:
                    row[col] = pd.Timestamp.combine(actual_start, ts.time())
                else:
                    row[col] = pd.NaT
            continue

        s = str(val).strip()
        if _is_time_only_string(s):
            if actual_start is not None:
                combined = _combine_time_with_date(s, actual_start)
                if combined is not None:
                    row[col] = combined
        else:
            parsed = pd.to_datetime(s, errors="coerce", dayfirst=False)
            if not pd.isna(parsed):
                if MIN_DATE <= parsed <= MAX_DATE:
                    row[col] = parsed
                elif actual_start is not None:
                    row[col] = pd.Timestamp.combine(actual_start, parsed.time())
                else:
                    row[col] = pd.NaT


    return row


def impute_with_rules(row, planned_cols, actual_cols):
    """
    Step 1: Attach constant planned/actual start dates
    Step 2: Apply sync rules (with bias toward existing non-null values)
    Step 3: Fill missing anaesthesia/prep times from knife-to-skin
    """
    # --- Step 1: attach constant dates ---
    row = attach_constant_dates(row, planned_cols, actual_cols)

    # --- Step 2: enforce logical sync rules ---
    def sync_cols(col_a, col_b, prefer="a"):
        """Sync two columns with preference if one is missing."""
        a, b = row.get(col_a, pd.NaT), row.get(col_b, pd.NaT)
        if pd.isna(a) and pd.notna(b):
            row[col_a] = b
        elif pd.isna(b) and pd.notna(a):
            row[col_b] = a
        elif pd.notna(a) and pd.notna(b):
            if prefer == "a":
                row[col_b] = a
            else:
                row[col_a] = b

    # Rule 1: PLANNED_PATIENT_CALL_TIME == PLANNED_PATIENT_FETCH_TIME
    sync_cols("PLANNED_PATIENT_CALL_TIME", "PLANNED_PATIENT_FETCH_TIME", prefer="fetch")

    # Rule 2: PLANNED_OR_CLEANUP_TIME == PLANNED_EXIT_OR_TIME
    sync_cols("PLANNED_OR_CLEANUP_TIME", "PLANNED_EXIT_OR_TIME", prefer="exit")

    # Rule 3: Ensure ordering constraints (only if both present)
    def enforce_order(before, after):
        if before in row.index and after in row.index:
            if pd.notna(row[before]) and pd.notna(row[after]):
                try:
                    if row[after] < row[before]:
                        row[after] = row[before]
                except Exception:
                    pass

    enforce_order("PLANNED_KNIFE_TO_SKIN_TIME", "PLANNED_SKIN_CLOSURE")
    enforce_order("PLANNED_SKIN_CLOSURE", "PLANNED_PATIENT_REVERSAL_TIME")
    enforce_order("PLANNED_PATIENT_REVERSAL_TIME", "PLANNED_EXIT_OR_TIME")
    enforce_order("PLANNED_EXIT_OR_TIME", "PLANNED_EXIT_RECOVERY_TIME")
    enforce_order("PLANNED_EXIT_RECOVERY_TIME", "PLANNED_OR_CLEANUP_TIME")

    # --- Step 3: fill missing times from anchors ---
    knife = row.get("PLANNED_KNIFE_TO_SKIN_TIME", pd.NaT)
    closure = row.get("PLANNED_SKIN_CLOSURE", pd.NaT)

    if pd.notna(knife):
        if "PLANNED_ANAESTHESIA_INDUCTION" in row.index and pd.isna(row["PLANNED_ANAESTHESIA_INDUCTION"]):
            row["PLANNED_ANAESTHESIA_INDUCTION"] = knife
        if "PLANNED_SURGERY_PREP_TIME" in row.index and pd.isna(row["PLANNED_SURGERY_PREP_TIME"]):
            row["PLANNED_SURGERY_PREP_TIME"] = knife

    if pd.notna(closure):
        if "PLANNED_PATIENT_REVERSAL_TIME" in row.index and pd.isna(row["PLANNED_PATIENT_REVERSAL_TIME"]):
            row["PLANNED_PATIENT_REVERSAL_TIME"] = closure

    return row

# Apply row-wise
data = data.apply(lambda r: impute_with_rules(r, planned_cols, actual_cols), axis=1)
data.iloc[:, 7:31]

Unnamed: 0,PLANNED_PATIENT_FETCH_TIME,PLANNED_RECEPTION_IN_TIME,PLANNED_ENTER_OR_TIME,PLANNED_ANAESTHESIA_INDUCTION,PLANNED_SURGERY_PREP_TIME,PLANNED_KNIFE_TO_SKIN_TIME,PLANNED_SKIN_CLOSURE,PLANNED_PATIENT_REVERSAL_TIME,PLANNED_EXIT_OR_TIME,PLANNED_EXIT_RECOVERY_TIME,...,ACTUAL_ENTER_OR_TIME,ACTUAL_ANAESTHESIA_INDUCTION,ACTUAL_SURGERY_PREP_TIME,ACTUAL_KNIFE_TO_SKIN_TIME,ACTUAL_SKIN_CLOSURE,ACTUAL_PATIENT_REVERSAL_TIME,ACTUAL_EXIT_OR_TIME,ACTUAL_EXIT_RECOVERY_TIME,ACTUAL_OR_CLEANUP_TIME,PATIENT_CODE
0,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 10:20:00,2019-04-11 10:20:00,2019-04-11 10:20:00,2019-04-11 17:10:00,2019-04-11 17:10:00,2019-04-11 17:25:00,2019-04-11 17:40:00,...,2019-04-11 11:04:00,2019-04-11 11:06:00,2019-04-11 11:17:00,2019-04-11 11:44:00,2019-04-11 17:31:00,2019-04-11 17:43:00,2019-04-11 17:46:00,2019-04-11 18:48:00,NaT,o5#}N[orwz9n82K>
1,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 11:20:00,2019-04-11 11:20:00,2019-04-11 11:25:00,2019-04-11 11:40:00,...,2019-04-11 11:01:00,2019-04-11 11:03:00,2019-04-11 11:08:00,2019-04-11 11:10:00,2019-04-11 11:20:00,2019-04-11 11:32:00,2019-04-11 11:32:00,2019-04-11 12:48:00,2019-04-11 11:33:00,$aN75Z3hpKoGqbq7
2,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 11:10:00,2019-04-11 11:10:00,2019-04-11 11:10:00,2019-04-11 11:55:00,2019-04-11 11:55:00,2019-04-11 11:55:00,2019-04-11 11:55:00,...,2019-04-11 10:58:00,NaT,NaT,2019-04-11 11:02:00,2019-04-11 11:21:00,NaT,2019-04-11 11:24:00,2019-04-11 11:44:00,2019-04-11 13:35:00,h7t3RxYAiQLTP6#D
3,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 11:10:00,2019-04-11 11:10:00,2019-04-11 11:10:00,2019-04-11 13:40:00,2019-04-11 13:40:00,2019-04-11 13:50:00,2019-04-11 14:05:00,...,2019-04-11 10:58:00,2019-04-11 10:59:00,2019-04-11 11:09:00,2019-04-11 11:17:00,2019-04-11 13:09:00,2019-04-11 13:09:00,2019-04-11 13:27:00,2019-04-11 15:10:00,2019-04-11 13:27:00,}8P}5y9?lox&E&as
4,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 12:20:00,2019-04-11 12:20:00,2019-04-11 12:30:00,2019-04-11 12:45:00,...,2019-04-11 10:58:00,2019-04-11 11:01:00,2019-04-11 11:23:00,2019-04-11 11:24:00,2019-04-11 13:17:00,2019-04-11 13:28:00,2019-04-11 13:34:00,2019-04-11 18:41:00,2019-04-11 13:50:00,$A#p3lTAo6;?t&m8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276858,2020-06-19 09:25:00,2020-06-19 09:25:00,2020-06-19 09:25:00,2020-06-19 09:35:00,2020-06-19 09:35:00,2020-06-19 09:35:00,2020-06-19 09:48:00,2020-06-19 09:48:00,2020-06-19 09:50:00,2020-06-19 10:05:00,...,2020-06-19 09:44:00,2020-06-19 09:49:00,NaT,2020-06-19 09:50:00,2020-06-19 10:05:00,2020-06-19 10:05:00,2020-06-19 10:07:00,2020-06-19 11:18:00,2020-06-19 10:37:00,9chN0s9dgz#6SNy0
276859,2020-06-19 12:45:00,2020-06-19 12:45:00,2020-06-19 12:45:00,2020-06-19 13:15:00,2020-06-19 13:15:00,2020-06-19 13:15:00,2020-06-19 17:20:00,2020-06-19 17:20:00,2020-06-19 17:30:00,2020-06-19 17:45:00,...,2020-06-19 09:36:00,2020-06-19 09:38:00,2020-06-19 09:50:00,2020-06-19 10:19:00,2020-06-19 13:23:00,2020-06-19 13:23:00,2020-06-19 13:51:00,2020-06-19 16:14:00,2020-06-19 14:01:00,3rejZP3&2DYQjJM6
276860,2020-06-19 08:30:00,2020-06-19 08:30:00,2020-06-19 08:30:00,2020-06-19 08:40:00,2020-06-19 08:40:00,2020-06-19 08:40:00,2020-06-19 08:48:00,2020-06-19 08:48:00,2020-06-19 08:50:00,2020-06-19 09:05:00,...,2020-06-19 09:20:00,2020-06-19 09:27:00,NaT,2020-06-19 09:30:00,2020-06-19 09:50:00,2020-06-19 09:50:00,2020-06-19 09:52:00,2020-06-19 10:16:00,2020-06-19 09:53:00,r8rUfquSvQ4yoQ6z
276861,2020-06-19 09:20:00,2020-06-19 09:20:00,2020-06-19 09:20:00,2020-06-19 09:35:00,2020-06-19 09:35:00,2020-06-19 09:35:00,2020-06-19 09:55:00,2020-06-19 09:55:00,2020-06-19 10:00:00,2020-06-19 10:15:00,...,2020-06-19 09:20:00,2020-06-19 09:34:00,2020-06-19 09:40:00,2020-06-19 09:46:00,2020-06-19 10:03:00,2020-06-19 10:04:00,2020-06-19 10:08:00,2020-06-19 10:40:00,2020-06-19 10:10:00,ZQjqjpPoqwP32Zm4


Date processing for ACTUAL columns

In [92]:
def impute_patient_times(row):
    fetch_col = "PATIENT_FETCH_TIME"
    call_col = "PATIENT_CALL_TIME"
    reception_col = "ACTUAL_RECEPTION_IN_TIME"

    # Helper: coerce any value to Timestamp
    def _to_ts(val):
        if pd.isna(val):
            return None
        if isinstance(val, pd.Timestamp):
            return val
        try:
            return pd.to_datetime(val, errors="coerce")
        except Exception:
            return None

    # --- Step 1: if CALL is empty, copy RECEPTION ---
    call_time = _to_ts(row.get(call_col, pd.NaT))
    reception_time = _to_ts(row.get(reception_col, pd.NaT))

    if call_time is None and reception_time is not None:
        row[call_col] = reception_time
        call_time = reception_time

    # --- Step 2: if FETCH is empty, fill it ---
    fetch_time = _to_ts(row.get(fetch_col, pd.NaT))
    if fetch_time is None:
        if call_time is not None and reception_time is not None:
            midpoint = call_time + (reception_time - call_time) / 2
            row[fetch_col] = midpoint.floor("min")  # round down
        elif call_time is not None:
            row[fetch_col] = call_time.floor("min")
        elif reception_time is not None:
            row[fetch_col] = reception_time.floor("min")

    return row

data = data.apply(impute_patient_times, axis=1)

data.iloc[:, 19:31]

Unnamed: 0,PATIENT_FETCH_TIME,ACTUAL_RECEPTION_IN_TIME,ACTUAL_ENTER_OR_TIME,ACTUAL_ANAESTHESIA_INDUCTION,ACTUAL_SURGERY_PREP_TIME,ACTUAL_KNIFE_TO_SKIN_TIME,ACTUAL_SKIN_CLOSURE,ACTUAL_PATIENT_REVERSAL_TIME,ACTUAL_EXIT_OR_TIME,ACTUAL_EXIT_RECOVERY_TIME,ACTUAL_OR_CLEANUP_TIME,PATIENT_CODE
0,2019-04-11 10:26:00,2019-04-11 10:41:00,2019-04-11 11:04:00,2019-04-11 11:06:00,2019-04-11 11:17:00,2019-04-11 11:44:00,2019-04-11 17:31:00,2019-04-11 17:43:00,2019-04-11 17:46:00,2019-04-11 18:48:00,NaT,o5#}N[orwz9n82K>
1,2019-04-11 09:55:00,2019-04-11 10:17:00,2019-04-11 11:01:00,2019-04-11 11:03:00,2019-04-11 11:08:00,2019-04-11 11:10:00,2019-04-11 11:20:00,2019-04-11 11:32:00,2019-04-11 11:32:00,2019-04-11 12:48:00,2019-04-11 11:33:00,$aN75Z3hpKoGqbq7
2,2019-04-11 10:15:00,2019-04-11 10:39:00,2019-04-11 10:58:00,NaT,NaT,2019-04-11 11:02:00,2019-04-11 11:21:00,NaT,2019-04-11 11:24:00,2019-04-11 11:44:00,2019-04-11 13:35:00,h7t3RxYAiQLTP6#D
3,2019-04-11 10:18:00,2019-04-11 10:36:00,2019-04-11 10:58:00,2019-04-11 10:59:00,2019-04-11 11:09:00,2019-04-11 11:17:00,2019-04-11 13:09:00,2019-04-11 13:09:00,2019-04-11 13:27:00,2019-04-11 15:10:00,2019-04-11 13:27:00,}8P}5y9?lox&E&as
4,2019-04-11 10:44:00,2019-04-11 10:58:00,2019-04-11 10:58:00,2019-04-11 11:01:00,2019-04-11 11:23:00,2019-04-11 11:24:00,2019-04-11 13:17:00,2019-04-11 13:28:00,2019-04-11 13:34:00,2019-04-11 18:41:00,2019-04-11 13:50:00,$A#p3lTAo6;?t&m8
...,...,...,...,...,...,...,...,...,...,...,...,...
276858,2020-06-19 08:26:00,2020-06-19 08:29:00,2020-06-19 09:44:00,2020-06-19 09:49:00,NaT,2020-06-19 09:50:00,2020-06-19 10:05:00,2020-06-19 10:05:00,2020-06-19 10:07:00,2020-06-19 11:18:00,2020-06-19 10:37:00,9chN0s9dgz#6SNy0
276859,2020-06-19 09:04:00,2020-06-19 09:13:00,2020-06-19 09:36:00,2020-06-19 09:38:00,2020-06-19 09:50:00,2020-06-19 10:19:00,2020-06-19 13:23:00,2020-06-19 13:23:00,2020-06-19 13:51:00,2020-06-19 16:14:00,2020-06-19 14:01:00,3rejZP3&2DYQjJM6
276860,2020-06-19 08:21:00,2020-06-19 08:32:00,2020-06-19 09:20:00,2020-06-19 09:27:00,NaT,2020-06-19 09:30:00,2020-06-19 09:50:00,2020-06-19 09:50:00,2020-06-19 09:52:00,2020-06-19 10:16:00,2020-06-19 09:53:00,r8rUfquSvQ4yoQ6z
276861,2020-06-19 09:20:00,2020-06-19 09:20:00,2020-06-19 09:20:00,2020-06-19 09:34:00,2020-06-19 09:40:00,2020-06-19 09:46:00,2020-06-19 10:03:00,2020-06-19 10:04:00,2020-06-19 10:08:00,2020-06-19 10:40:00,2020-06-19 10:10:00,ZQjqjpPoqwP32Zm4


### Imputing Missing Induction, Prep, and Reversal Times

For some rows, `ACTUAL_ANAESTHESIA_INDUCTION`, `ACTUAL_SURGERY_PREP_TIME`,   
`ACTUAL_PATIENT_REVERSAL_TIME`, and `ACTUAL_OR_CLEANUP_TIME` are missing.  
To fill these values in a consistent and data-driven way, we treat the **OR workflow as a timeline**.

---

#### Case A: Induction & Prep together (baseline method)
- `ACTUAL_ENTER_OR_TIME` → **0% mark**  
- `ACTUAL_KNIFE_TO_SKIN_TIME` → **100% mark**  

For rows where both induction and prep times are available, we compute their relative positions:
- **Induction mark** = (Induction − Enter OR) ÷ (Knife-to-skin − Enter OR)  
- **Prep mark** = (Prep − Enter OR) ÷ (Knife-to-skin − Enter OR)  

We then take the **average mark** across all valid rows.  
For rows with missing values:
- `ACTUAL_ANAESTHESIA_INDUCTION` is backfilled as  
  `Enter OR + (Knife-to-skin − Enter OR) × <avg induction mark>`, rounded to the nearest minute  
- `ACTUAL_SURGERY_PREP_TIME` is backfilled as  
  `Enter OR + (Knife-to-skin − Enter OR) × <avg prep mark>`, rounded to the nearest minute  

---

#### Case B: Prep missing, but induction & knife available
For rows with induction and knife-to-skin times but missing prep:  
- Compute average **prep-from-induction mark** = (Prep − Induction) ÷ (Knife − Induction)  
- Backfill missing prep as  
  `Induction + (Knife − Induction) × <avg prep-from-induction mark>`

---

#### Case C: Induction missing, but enter & prep available
For rows with enter and prep but missing induction:  
- Compute average **induction-from-enter mark** = (Induction − Enter OR) ÷ (Prep − Enter OR)  
- Backfill missing induction as  
  `Enter OR + (Prep − Enter OR) × <avg induction-from-enter mark>`

---

#### Case D: Reversal missing, but closure & exit available
For rows with closure and exit but missing reversal:  
- `ACTUAL_SKIN_CLOSURE` → **0% mark**  
- `ACTUAL_EXIT_OR_TIME` → **100% mark**  
- Compute average **reversal mark** = (Reversal − Closure) ÷ (Exit − Closure)  
- Backfill missing reversal as  
  `Closure + (Exit − Closure) × <avg reversal mark>`, rounded to the nearest minute  

---

#### Case E: Cleanup missing, but exit available
Unlike induction, prep, and reversal, cleanup is best modeled as a **fixed offset** after exit.  
- Compute average **cleanup offset** = (Cleanup − Exit) across rows with both values.  
- Backfill missing cleanup as  
  `Exit + <avg cleanup offset>`  
Rounded to the nearest minute.

---

This imputation strategy ensures that filled values preserve the natural ordering of OR events, are grounded in real observed distributions, and remain realistic within the surgical timeline.

In [93]:
def compute_marks(data):
    marks = {}

    # Case A: induction & prep relative to enter/knife
    mask = (
        data["ACTUAL_ENTER_OR_TIME"].notna()
        & data["ACTUAL_ANAESTHESIA_INDUCTION"].notna()
        & data["ACTUAL_SURGERY_PREP_TIME"].notna()
        & data["ACTUAL_KNIFE_TO_SKIN_TIME"].notna()
    )
    clean = data.loc[mask].copy()
    clean = clean[
        (clean["ACTUAL_ENTER_OR_TIME"] <= clean["ACTUAL_ANAESTHESIA_INDUCTION"])
        & (clean["ACTUAL_ANAESTHESIA_INDUCTION"] <= clean["ACTUAL_SURGERY_PREP_TIME"])
        & (clean["ACTUAL_SURGERY_PREP_TIME"] <= clean["ACTUAL_KNIFE_TO_SKIN_TIME"])
    ]
    if not clean.empty:
        total = (clean["ACTUAL_KNIFE_TO_SKIN_TIME"] - clean["ACTUAL_ENTER_OR_TIME"]).dt.total_seconds()
        marks["induction"] = ((clean["ACTUAL_ANAESTHESIA_INDUCTION"] - clean["ACTUAL_ENTER_OR_TIME"]).dt.total_seconds() / total).mean(skipna=True)
        marks["prep"] = ((clean["ACTUAL_SURGERY_PREP_TIME"] - clean["ACTUAL_ENTER_OR_TIME"]).dt.total_seconds() / total).mean(skipna=True)

    # Case B: prep relative to induction/knife
    mask = (
        data["ACTUAL_ANAESTHESIA_INDUCTION"].notna()
        & data["ACTUAL_SURGERY_PREP_TIME"].notna()
        & data["ACTUAL_KNIFE_TO_SKIN_TIME"].notna()
    )
    clean = data.loc[mask].copy()
    clean = clean[
        (clean["ACTUAL_ANAESTHESIA_INDUCTION"] <= clean["ACTUAL_SURGERY_PREP_TIME"])
        & (clean["ACTUAL_SURGERY_PREP_TIME"] <= clean["ACTUAL_KNIFE_TO_SKIN_TIME"])
    ]
    if not clean.empty:
        total = (clean["ACTUAL_KNIFE_TO_SKIN_TIME"] - clean["ACTUAL_ANAESTHESIA_INDUCTION"]).dt.total_seconds()
        marks["prep_from_induction"] = ((clean["ACTUAL_SURGERY_PREP_TIME"] - clean["ACTUAL_ANAESTHESIA_INDUCTION"]).dt.total_seconds() / total).mean(skipna=True)

    # Case C: induction relative to enter/prep
    mask = (
        data["ACTUAL_ENTER_OR_TIME"].notna()
        & data["ACTUAL_ANAESTHESIA_INDUCTION"].notna()
        & data["ACTUAL_SURGERY_PREP_TIME"].notna()
    )
    clean = data.loc[mask].copy()
    clean = clean[
        (clean["ACTUAL_ENTER_OR_TIME"] <= clean["ACTUAL_ANAESTHESIA_INDUCTION"])
        & (clean["ACTUAL_ANAESTHESIA_INDUCTION"] <= clean["ACTUAL_SURGERY_PREP_TIME"])
    ]
    if not clean.empty:
        total = (clean["ACTUAL_SURGERY_PREP_TIME"] - clean["ACTUAL_ENTER_OR_TIME"]).dt.total_seconds()
        marks["induction_from_enter"] = ((clean["ACTUAL_ANAESTHESIA_INDUCTION"] - clean["ACTUAL_ENTER_OR_TIME"]).dt.total_seconds() / total).mean(skipna=True)

    # Case D: reversal relative to closure/exit
    mask = (
        data["ACTUAL_SKIN_CLOSURE"].notna()
        & data["ACTUAL_PATIENT_REVERSAL_TIME"].notna()
        & data["ACTUAL_EXIT_OR_TIME"].notna()
    )
    clean = data.loc[mask].copy()
    clean = clean[
        (clean["ACTUAL_SKIN_CLOSURE"] <= clean["ACTUAL_PATIENT_REVERSAL_TIME"])
        & (clean["ACTUAL_PATIENT_REVERSAL_TIME"] <= clean["ACTUAL_EXIT_OR_TIME"])
    ]
    if not clean.empty:
        total = (clean["ACTUAL_EXIT_OR_TIME"] - clean["ACTUAL_SKIN_CLOSURE"]).dt.total_seconds()
        marks["reversal"] = ((clean["ACTUAL_PATIENT_REVERSAL_TIME"] - clean["ACTUAL_SKIN_CLOSURE"]).dt.total_seconds() / total).mean(skipna=True)

    # Case E: cleanup offset from exit
    mask = (
        data["ACTUAL_EXIT_OR_TIME"].notna()
        & data["ACTUAL_OR_CLEANUP_TIME"].notna()
    )
    clean = data.loc[mask].copy()

    # Keep only realistic differences (0 to 12 hours after exit)
    valid = (clean["ACTUAL_OR_CLEANUP_TIME"] >= clean["ACTUAL_EXIT_OR_TIME"]) & (
        (clean["ACTUAL_OR_CLEANUP_TIME"] - clean["ACTUAL_EXIT_OR_TIME"]) <= pd.Timedelta(hours=12)
    )
    clean = clean[valid]

    if not clean.empty:
        diffs = (clean["ACTUAL_OR_CLEANUP_TIME"] - clean["ACTUAL_EXIT_OR_TIME"]).dt.total_seconds()
        marks["cleanup_offset"] = round(diffs.mean(skipna=True) / 60.0)  # minutes


    return marks


def impute_induction_prep_reversal_cleanup(row, marks):
    enter, induction, prep, knife = row[["ACTUAL_ENTER_OR_TIME", "ACTUAL_ANAESTHESIA_INDUCTION", "ACTUAL_SURGERY_PREP_TIME", "ACTUAL_KNIFE_TO_SKIN_TIME"]]
    closure, reversal, exit_, cleanup = row[["ACTUAL_SKIN_CLOSURE", "ACTUAL_PATIENT_REVERSAL_TIME", "ACTUAL_EXIT_OR_TIME", "ACTUAL_OR_CLEANUP_TIME"]]

    # --- Case A: both missing induction & prep
    if pd.notna(enter) and pd.isna(induction) and pd.isna(prep) and pd.notna(knife):
        if "induction" in marks and "prep" in marks:
            total = knife - enter
            row["ACTUAL_ANAESTHESIA_INDUCTION"] = (enter + total * marks["induction"]).round("min")
            row["ACTUAL_SURGERY_PREP_TIME"] = (enter + total * marks["prep"]).round("min")

    # --- Case B: missing prep only
    if pd.notna(induction) and pd.isna(prep) and pd.notna(knife):
        if "prep_from_induction" in marks:
            total = knife - induction
            row["ACTUAL_SURGERY_PREP_TIME"] = (induction + total * marks["prep_from_induction"]).round("min")

    # --- Case C: missing induction only
    if pd.notna(enter) and pd.isna(induction) and pd.notna(prep):
        if "induction_from_enter" in marks:
            total = prep - enter
            row["ACTUAL_ANAESTHESIA_INDUCTION"] = (enter + total * marks["induction_from_enter"]).round("min")

    # --- Case D: missing reversal
    if pd.notna(closure) and pd.isna(reversal) and pd.notna(exit_):
        if "reversal" in marks:
            total = exit_ - closure
            row["ACTUAL_PATIENT_REVERSAL_TIME"] = (closure + total * marks["reversal"]).round("min")

    # --- Case E: missing cleanup
    if pd.notna(exit_) and pd.isna(cleanup):
        if "cleanup_offset" in marks:
            row["ACTUAL_OR_CLEANUP_TIME"] = (exit_ + pd.Timedelta(minutes=marks["cleanup_offset"])).round("min")

    return row


# Step 1: get average marks
marks = compute_marks(data)

# Step 2: apply backfill
data = data.apply(lambda r: impute_induction_prep_reversal_cleanup(r, marks), axis=1)

data.iloc[:, 19:31]

Unnamed: 0,PATIENT_FETCH_TIME,ACTUAL_RECEPTION_IN_TIME,ACTUAL_ENTER_OR_TIME,ACTUAL_ANAESTHESIA_INDUCTION,ACTUAL_SURGERY_PREP_TIME,ACTUAL_KNIFE_TO_SKIN_TIME,ACTUAL_SKIN_CLOSURE,ACTUAL_PATIENT_REVERSAL_TIME,ACTUAL_EXIT_OR_TIME,ACTUAL_EXIT_RECOVERY_TIME,ACTUAL_OR_CLEANUP_TIME,PATIENT_CODE
0,2019-04-11 10:26:00,2019-04-11 10:41:00,2019-04-11 11:04:00,2019-04-11 11:06:00,2019-04-11 11:17:00,2019-04-11 11:44:00,2019-04-11 17:31:00,2019-04-11 17:43:00,2019-04-11 17:46:00,2019-04-11 18:48:00,2019-04-11 18:04:00,o5#}N[orwz9n82K>
1,2019-04-11 09:55:00,2019-04-11 10:17:00,2019-04-11 11:01:00,2019-04-11 11:03:00,2019-04-11 11:08:00,2019-04-11 11:10:00,2019-04-11 11:20:00,2019-04-11 11:32:00,2019-04-11 11:32:00,2019-04-11 12:48:00,2019-04-11 11:33:00,$aN75Z3hpKoGqbq7
2,2019-04-11 10:15:00,2019-04-11 10:39:00,2019-04-11 10:58:00,2019-04-11 10:59:00,2019-04-11 11:01:00,2019-04-11 11:02:00,2019-04-11 11:21:00,2019-04-11 11:22:00,2019-04-11 11:24:00,2019-04-11 11:44:00,2019-04-11 13:35:00,h7t3RxYAiQLTP6#D
3,2019-04-11 10:18:00,2019-04-11 10:36:00,2019-04-11 10:58:00,2019-04-11 10:59:00,2019-04-11 11:09:00,2019-04-11 11:17:00,2019-04-11 13:09:00,2019-04-11 13:09:00,2019-04-11 13:27:00,2019-04-11 15:10:00,2019-04-11 13:27:00,}8P}5y9?lox&E&as
4,2019-04-11 10:44:00,2019-04-11 10:58:00,2019-04-11 10:58:00,2019-04-11 11:01:00,2019-04-11 11:23:00,2019-04-11 11:24:00,2019-04-11 13:17:00,2019-04-11 13:28:00,2019-04-11 13:34:00,2019-04-11 18:41:00,2019-04-11 13:50:00,$A#p3lTAo6;?t&m8
...,...,...,...,...,...,...,...,...,...,...,...,...
276858,2020-06-19 08:26:00,2020-06-19 08:29:00,2020-06-19 09:44:00,2020-06-19 09:49:00,2020-06-19 09:50:00,2020-06-19 09:50:00,2020-06-19 10:05:00,2020-06-19 10:05:00,2020-06-19 10:07:00,2020-06-19 11:18:00,2020-06-19 10:37:00,9chN0s9dgz#6SNy0
276859,2020-06-19 09:04:00,2020-06-19 09:13:00,2020-06-19 09:36:00,2020-06-19 09:38:00,2020-06-19 09:50:00,2020-06-19 10:19:00,2020-06-19 13:23:00,2020-06-19 13:23:00,2020-06-19 13:51:00,2020-06-19 16:14:00,2020-06-19 14:01:00,3rejZP3&2DYQjJM6
276860,2020-06-19 08:21:00,2020-06-19 08:32:00,2020-06-19 09:20:00,2020-06-19 09:27:00,2020-06-19 09:29:00,2020-06-19 09:30:00,2020-06-19 09:50:00,2020-06-19 09:50:00,2020-06-19 09:52:00,2020-06-19 10:16:00,2020-06-19 09:53:00,r8rUfquSvQ4yoQ6z
276861,2020-06-19 09:20:00,2020-06-19 09:20:00,2020-06-19 09:20:00,2020-06-19 09:34:00,2020-06-19 09:40:00,2020-06-19 09:46:00,2020-06-19 10:03:00,2020-06-19 10:04:00,2020-06-19 10:08:00,2020-06-19 10:40:00,2020-06-19 10:10:00,ZQjqjpPoqwP32Zm4


### Handle Rows with no Actual Data

For the purpose of our project, actual date/time data is needed to track any delays with the planned time.
By observation, we note that these rows with no actual data track to procedures marked with LOCATION == "OUT OF OT ROOMS".
These represent cases outside of operating theatres and should not be included in downstream time sequence analysis. We therefore remove them.


In [94]:
before = len(data)
data = data[data["LOCATION"] != "OUT OF OT ROOMS"].copy()
after = len(data)

print(f"Removed {before - after} rows with LOCATION == 'OUT OF OT ROOMS' (kept {after}).")

Removed 3726 rows with LOCATION == 'OUT OF OT ROOMS' (kept 273137).


### Convert Planned Columns to Datetime

To make sure all planned time columns are in a consistent `datetime64[ns]` format, we explicitly convert them using `pd.to_datetime`.

In [95]:
# Convert all planned columns to datetime64[ns]
for col in planned_cols:
    if col in data.columns:
        data[col] = pd.to_datetime(data[col], errors="coerce")

data

Unnamed: 0,OPERATION_ID,LOCATION,ROOM,CASE_STATUS,OPERATION_TYPE,EMERGENCY_PRIORITY,PLANNED_PATIENT_CALL_TIME,PLANNED_PATIENT_FETCH_TIME,PLANNED_RECEPTION_IN_TIME,PLANNED_ENTER_OR_TIME,...,ADMISSION_WARD,ADMISSION_BED,AOH,BLOOD,IMPLANT,DIAGNOSIS,CANCER_INDICATOR,TRAUMA_INDICATOR,Delay_Reason,Remarks
0,588456.0,Main Building OT,MBOR11,Final,Elective,0,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 09:50:00,...,NW6A,N06A036,False,NIL,required microscope,Right Breast CA,False,False,Surgeon (e.g. Surgeon not available & etc.),0
1,590736.0,Main Building OT,MBOR05,Final,Elective,0,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:40:00,...,NWASW,NASWA11,False,NIL,0,early pregnancy failure,False,False,0,0
2,591995.0,ICL,RoomC,Actualised,Elective,0,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 10:55:00,...,NW7B,N07B005,False,NIL,0,,False,False,0,0
3,590451.0,Main Building OT,MBOR04,Final,Elective,0,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,...,NW41,N041004,False,NIL,Need Eustachian tube ballon,EUSTACHIAN TUBE DISORDER,False,False,0,0
4,573666.0,Medical Center OT,MCOR03,Final,Elective,0,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:30:00,...,NW2A,N02A025,False,NIL,0,gall bladder stone,False,False,first case havent finished,first case havent finished
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276858,652260.0,Endoscopy Center,ENDO4,Actualised,Elective,0,2020-06-19 09:25:00,2020-06-19 09:25:00,2020-06-19 09:25:00,2020-06-19 09:25:00,...,NCKED,,False,NIL,0,,False,False,0,0
276859,667863.0,Main Building OT,MBOR09,Final,Elective,0,2020-06-19 12:45:00,2020-06-19 12:45:00,2020-06-19 12:45:00,2020-06-19 12:45:00,...,NW52,N052023,False,NIL,ENT Endoscope SetENT DrillENT Neuronavigation,Pituitary Tumour,False,False,Resequencing of cases,0
276860,667473.0,Endoscopy Center,ENDO2,Actualised,Elective,0,2020-06-19 08:30:00,2020-06-19 08:30:00,2020-06-19 08:30:00,2020-06-19 08:30:00,...,NW42,N042041,False,NIL,0,Follow up on varices.,False,False,0,0
276861,665881.0,Medical Center OT,MCOR05,Final,Elective,0,2020-06-19 09:20:00,2020-06-19 09:20:00,2020-06-19 09:20:00,2020-06-19 09:20:00,...,NWASW,NASWC82,False,NIL,0,left eye cataract,False,False,0,0


### Date Sanitisation

The longest surgery recorded in Singapore, and a contender for the world's longest, was the 2001 separation of head-conjoined twins, Ganga and Jamuna Shrestha, at Singapore General Hospital.  
The procedure lasted 103 hours (more than four days) and involved a team of 20 doctors using advanced computer imaging to plan and execute the complex separation of their shared cranial cavity and partially fused brains.

Fortunately, most patients won't require a surgery that long.  
Thus, we perform a one-pass validation to ensure the datetime entries related to planned and actual surgery times are consistent and realistic:  
- The timestamps should be non-decreasing (each subsequent time is the same or later than the previous).  
- The overall duration between the first and last timestamps should not exceed 72 hours.

Rows failing these checks are considered erroneous and are removed from the dataset to maintain data quality for downstream analysis.

In [96]:
def validate_row_datetime_order(row, cols):
    times = row[cols]
    
    # Systematic check: from 0 to len - 4
    for i in range(len(cols) - 4):
        if times[i] > times[i + 1]:
            return False

    # Specific last checks
    anchor_idx = -3  # index of anchor point
    if times[anchor_idx] > times[-2]:
        return False
    if times[anchor_idx] > times[-1]:
        return False

    return True

# Apply for planned_cols and actual_cols
data['planned_valid'] = data.apply(lambda row: validate_row_datetime_order(row, planned_cols), axis=1)
data['actual_valid'] = data.apply(lambda row: validate_row_datetime_order(row, actual_cols), axis=1)

# Combine both validations: keep rows that are valid for both
valid_mask = data['planned_valid'] & data['actual_valid']
num_invalid = (~valid_mask).sum()
data = data[valid_mask].copy()

print(f"Number of rows dropped due to validation failure: {num_invalid}")

Number of rows dropped due to validation failure: 8143


### Emergency Priority Standardization

Per client's instructions, at this step, we standardise the values in the `EMERGENCY_PRIORITY` column by mapping various priority labels to consistent formats:
- Convert `"P3a"` to `"P3A"`
- Convert `"P2"` to `"P2B"`
- Convert `"P3"` and `"P3b"` to `"P3B"`

This ensures uniformity in priority labels for accurate analysis and reporting.

In [97]:
data['EMERGENCY_PRIORITY'] = data['EMERGENCY_PRIORITY'].replace({
    'P3a': 'P3A',
    'P2': 'P2B',
    'P3': 'P3B',
    'P3b': 'P3B'
})

### Handle admission related columns
Some of these surgeries may be day surgeries of from the A&E, hence might not have admission data. Hence, we will replace blanks with "Not Admitted".

In [98]:
admission_cols = ["ADMISSION_STATUS", "ADMISSION_CLASS_TYPE", 
                  "ADMISSION_TYPE", "ADMISSION_WARD", "ADMISSION_BED"]
data[admission_cols] = data[admission_cols].fillna("Not Admitted")

### Fill in missing staff data
Some surgeries are missing surgeon, anaesthetist, or diagnosis data, hence we will fill it with "Unknown" and "Not Recorded". This is because it is likely not possible for a surgery to proceed without them.

In [99]:
clinician_cols = ["SURGEON", "ANAESTHETIST_TEAM", "ANAESTHETIST_MCR_NO"]
data[clinician_cols] = data[clinician_cols].fillna("Unknown")
data["DIAGNOSIS"] = data["DIAGNOSIS"].fillna("Not Recorded")
data

Unnamed: 0,OPERATION_ID,LOCATION,ROOM,CASE_STATUS,OPERATION_TYPE,EMERGENCY_PRIORITY,PLANNED_PATIENT_CALL_TIME,PLANNED_PATIENT_FETCH_TIME,PLANNED_RECEPTION_IN_TIME,PLANNED_ENTER_OR_TIME,...,AOH,BLOOD,IMPLANT,DIAGNOSIS,CANCER_INDICATOR,TRAUMA_INDICATOR,Delay_Reason,Remarks,planned_valid,actual_valid
0,588456.0,Main Building OT,MBOR11,Final,Elective,0,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 09:50:00,...,False,NIL,required microscope,Right Breast CA,False,False,Surgeon (e.g. Surgeon not available & etc.),0,True,True
1,590736.0,Main Building OT,MBOR05,Final,Elective,0,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:40:00,...,False,NIL,0,early pregnancy failure,False,False,0,0,True,True
2,591995.0,ICL,RoomC,Actualised,Elective,0,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 10:55:00,...,False,NIL,0,Not Recorded,False,False,0,0,True,True
3,590451.0,Main Building OT,MBOR04,Final,Elective,0,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,...,False,NIL,Need Eustachian tube ballon,EUSTACHIAN TUBE DISORDER,False,False,0,0,True,True
4,573666.0,Medical Center OT,MCOR03,Final,Elective,0,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:30:00,...,False,NIL,0,gall bladder stone,False,False,first case havent finished,first case havent finished,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276858,652260.0,Endoscopy Center,ENDO4,Actualised,Elective,0,2020-06-19 09:25:00,2020-06-19 09:25:00,2020-06-19 09:25:00,2020-06-19 09:25:00,...,False,NIL,0,Not Recorded,False,False,0,0,True,True
276859,667863.0,Main Building OT,MBOR09,Final,Elective,0,2020-06-19 12:45:00,2020-06-19 12:45:00,2020-06-19 12:45:00,2020-06-19 12:45:00,...,False,NIL,ENT Endoscope SetENT DrillENT Neuronavigation,Pituitary Tumour,False,False,Resequencing of cases,0,True,True
276860,667473.0,Endoscopy Center,ENDO2,Actualised,Elective,0,2020-06-19 08:30:00,2020-06-19 08:30:00,2020-06-19 08:30:00,2020-06-19 08:30:00,...,False,NIL,0,Follow up on varices.,False,False,0,0,True,True
276861,665881.0,Medical Center OT,MCOR05,Final,Elective,0,2020-06-19 09:20:00,2020-06-19 09:20:00,2020-06-19 09:20:00,2020-06-19 09:20:00,...,False,NIL,0,left eye cataract,False,False,0,0,True,True


### Drop remaining missing rows
After filling in the missing values that we are able to fill, there are some columns that are still missing data. We will thus drop them as they make up a very small portion of our overall data.

In [100]:
before2 = len(data)
data = data.dropna()
after2 = len(data)

print(f"Removed {before2 - after2} rows with NA values (kept {after2}).")

Removed 16191 rows with NA values (kept 248803).


### View current state of dataframe
Currently, the dataset no longer contains any missing data, and thus we are able to proceed with the next steps.

In [101]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248803 entries, 0 to 276861
Data columns (total 54 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   OPERATION_ID                   248803 non-null  float64       
 1   LOCATION                       248803 non-null  object        
 2   ROOM                           248803 non-null  object        
 3   CASE_STATUS                    248803 non-null  object        
 4   OPERATION_TYPE                 248803 non-null  object        
 5   EMERGENCY_PRIORITY             248803 non-null  object        
 6   PLANNED_PATIENT_CALL_TIME      248803 non-null  datetime64[ns]
 7   PLANNED_PATIENT_FETCH_TIME     248803 non-null  datetime64[ns]
 8   PLANNED_RECEPTION_IN_TIME      248803 non-null  datetime64[ns]
 9   PLANNED_ENTER_OR_TIME          248803 non-null  datetime64[ns]
 10  PLANNED_ANAESTHESIA_INDUCTION  248803 non-null  datetime64[ns]
 11  PLANN

## Handle Duplicate Data
Important to remove to prevent bias in our AI/ML solution

### Check for duplicate rows
This is to see if our dataset contains any rows that are completely identical. This means that the same surgery has been accidentally logged twice. We want to avoid having this in our dataset as it would cause our analysis in the future to skew.

In [102]:
data.duplicated().sum()

3

### Drop duplicate rows
We identified 3 duplicate rows, and hence we will want to drop them. 

In [103]:
data = data.drop_duplicates()

In [104]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248800 entries, 0 to 276861
Data columns (total 54 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   OPERATION_ID                   248800 non-null  float64       
 1   LOCATION                       248800 non-null  object        
 2   ROOM                           248800 non-null  object        
 3   CASE_STATUS                    248800 non-null  object        
 4   OPERATION_TYPE                 248800 non-null  object        
 5   EMERGENCY_PRIORITY             248800 non-null  object        
 6   PLANNED_PATIENT_CALL_TIME      248800 non-null  datetime64[ns]
 7   PLANNED_PATIENT_FETCH_TIME     248800 non-null  datetime64[ns]
 8   PLANNED_RECEPTION_IN_TIME      248800 non-null  datetime64[ns]
 9   PLANNED_ENTER_OR_TIME          248800 non-null  datetime64[ns]
 10  PLANNED_ANAESTHESIA_INDUCTION  248800 non-null  datetime64[ns]
 11  PLANN

## Deep cleaning each column
Looking into each individual column to clean up most of the free text portions. Please add more cleaning as we go, as there is quite alot to sieve through and I dont think i caught it all.

In [105]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248800 entries, 0 to 276861
Data columns (total 54 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   OPERATION_ID                   248800 non-null  float64       
 1   LOCATION                       248800 non-null  object        
 2   ROOM                           248800 non-null  object        
 3   CASE_STATUS                    248800 non-null  object        
 4   OPERATION_TYPE                 248800 non-null  object        
 5   EMERGENCY_PRIORITY             248800 non-null  object        
 6   PLANNED_PATIENT_CALL_TIME      248800 non-null  datetime64[ns]
 7   PLANNED_PATIENT_FETCH_TIME     248800 non-null  datetime64[ns]
 8   PLANNED_RECEPTION_IN_TIME      248800 non-null  datetime64[ns]
 9   PLANNED_ENTER_OR_TIME          248800 non-null  datetime64[ns]
 10  PLANNED_ANAESTHESIA_INDUCTION  248800 non-null  datetime64[ns]
 11  PLANN

### Inspect Location
No problems.

#### Unique values analysis

In [106]:
inspect_column(data, "LOCATION", top_n=30)

Column: LOCATION
Unique values: 9

Top value counts:
LOCATION
Main Building OT          73425
Endoscopy Center          62449
Medical Center OT         55211
Kent Ridge Wing OT        22311
ICL                       19901
DDI                       15315
Urology                     129
AH Endoscopy Center          53
AH Day Surgery Theatre        6
Name: count, dtype: int64


### Inspect Room
No problems.

#### Unique values analysis

In [107]:
inspect_column(data, "ROOM", top_n=30)

Column: ROOM
Unique values: 56

Top value counts:
ROOM
ENDO4       13173
ENDO2       11276
ENDO3        9581
RoomC        9347
MCOR05       9328
MBOR08       9327
ENDO6        9273
ENDO1        9203
MBOR09       8681
MCOR04       7577
RoomB        6874
MCOR06       6655
KRWOR1       6590
ENDO5        6437
DDIXA001     6244
MBOR18       5896
MBOR14       5512
MCOR10       5084
DDIXA004     4991
MCOR03       4939
MCOR09       4666
MCOR07       4416
MCOR02       4405
MBOR05       4371
MBOR06       4255
MCOR08       4092
DDIXA007     4080
MCOR01       4049
MBOR07       3965
RoomA        3680
Name: count, dtype: int64


### Inspect case status
No problems.

#### Unique values analysis

In [108]:
inspect_column(data, "CASE_STATUS", top_n=30)

Column: CASE_STATUS
Unique values: 5

Top value counts:
CASE_STATUS
Final         133598
Actualised    114925
Booked           158
Missed           118
Rebooking          1
Name: count, dtype: int64


### Inspect OPERATION_TYPE
No problems.

#### UVA

In [109]:
inspect_column(data, "OPERATION_TYPE", top_n=30)

Column: OPERATION_TYPE
Unique values: 2

Top value counts:
OPERATION_TYPE
Elective     213886
Emergency     34914
Name: count, dtype: int64


### Inspect Emergency Priority
No problems.

#### UVA

In [110]:
inspect_column(data, "EMERGENCY_PRIORITY", top_n=30)

Column: EMERGENCY_PRIORITY
Unique values: 7

Top value counts:
EMERGENCY_PRIORITY
0      213886
P2B     21121
P1       8306
P0       2088
P2A      2086
P3B       974
P3A       339
Name: count, dtype: int64


### Inspect Patient Code
No problems.

#### UVA

In [111]:
inspect_column(data, "PATIENT_CODE", top_n=30)

Column: PATIENT_CODE
Unique values: 151480

Top value counts:
PATIENT_CODE
mrSK5m9nqRlG[@dG    62
d<P8DK2?5odu8As6    61
a9a5R8heBV8Gy02e    36
S57v9vPZ64irK5ko    35
EJ6QB8qKDtZR3&LP    35
T4+jrog6Rnj3kWRv    33
io5[ei}@8b7d5t$i    32
6#66j3XPCph&DeQU    32
>oPUp7[6P7kZx33D    32
AwXDGce4$L3N8dX6    31
435T3j@UE4Ghkzpf    31
ptEf3fen^L?3Rxm5    30
}wX^sJ4ou0+Wyi{&    30
58}cNxQeH8MlvNn6    29
[?d#5CHonA4MY@h5    28
pth3}9j9ZkEia9$y    28
Nv#Um78atsyUuWDg    28
cT7dr7v}KvtoAXu8    27
wtlQ6lAU{}Wop0n;    27
?53&jgEQiWbq]uG6    27
oJ6AlAnb]28jYDcH    26
sPHpnBW77GKDQA9f    26
qLaR0<GgTuhQRsw6    26
hHtMePk3alTEnB7j    26
XLyDnhn5teXzEZ8W    26
L8id0<f+flK5Q2Fq    26
GM8kvXd2iakqbbEo    26
9xnrcUn0Bmui4edX    26
Rz23eDw9l58WTm8o    25
dxlvF7Gr2lYFd6#v    25
Name: count, dtype: int64


### Inspect Nature
Removed this column entirely, and created a legend(can be found below) to map SURGICAL_CODE to NATURE, as they are the same thing.

#### UVA

In [112]:
inspect_column(data, "NATURE", top_n=30)

Column: NATURE
Unique values: 9234

Top value counts:
NATURE
Intestine/Stomach, Upper GI endoscopy with / without biopsy (SF701I)                                                                                      20528
Colon, Colonoscopy (diagnostic), fibreoptic with/without biopsy (SF702C)                                                                                  12625
Colon, Colonoscopy (diagnostic), fibreoptic with/without biopsy (SF702C); Intestine/Stomach, Upper GI endoscopy with / without biopsy (SF701I)            11488
Dilation Of Cervix (SIC004)                                                                                                                                6529
Skin and Subcutaneous Tissue, Deep>3cm/Extensive Contaminated Wound, Debridement (SA811S)                                                                  5719
Coronary Angio. [ICL] (ISD811H)                                                                                                            

In [113]:
data, nature_legend = build_operation_legend_and_drop_nature(
    data,
    code_col="SURGICAL_CODE",
    nature_col="NATURE",
    drop_nature=True,           
    keep_title_case_copy=True    
)

In [114]:
print(data.columns)              
nature_legend.drop(columns='operation_name_title', inplace=True)
nature_legend

Index(['OPERATION_ID', 'LOCATION', 'ROOM', 'CASE_STATUS', 'OPERATION_TYPE',
       'EMERGENCY_PRIORITY', 'PLANNED_PATIENT_CALL_TIME',
       'PLANNED_PATIENT_FETCH_TIME', 'PLANNED_RECEPTION_IN_TIME',
       'PLANNED_ENTER_OR_TIME', 'PLANNED_ANAESTHESIA_INDUCTION',
       'PLANNED_SURGERY_PREP_TIME', 'PLANNED_KNIFE_TO_SKIN_TIME',
       'PLANNED_SKIN_CLOSURE', 'PLANNED_PATIENT_REVERSAL_TIME',
       'PLANNED_EXIT_OR_TIME', 'PLANNED_EXIT_RECOVERY_TIME',
       'PLANNED_OR_CLEANUP_TIME', 'PATIENT_CALL_TIME', 'PATIENT_FETCH_TIME',
       'ACTUAL_RECEPTION_IN_TIME', 'ACTUAL_ENTER_OR_TIME',
       'ACTUAL_ANAESTHESIA_INDUCTION', 'ACTUAL_SURGERY_PREP_TIME',
       'ACTUAL_KNIFE_TO_SKIN_TIME', 'ACTUAL_SKIN_CLOSURE',
       'ACTUAL_PATIENT_REVERSAL_TIME', 'ACTUAL_EXIT_OR_TIME',
       'ACTUAL_EXIT_RECOVERY_TIME', 'ACTUAL_OR_CLEANUP_TIME', 'PATIENT_CODE',
       'SURGICAL_CODE', 'DISCIPLINE', 'SURGEON', 'ANAESTHETIST_TEAM',
       'ANAESTHETIST_MCR_NO', 'ANESTHESIA', 'EQUIPMENT', 'ADMISSION_STAT

Unnamed: 0,operation_code,operation_name
0,0050741; ISD705H,biopsy (under fluoroscopic guidance) (0050741)...
1,0050741; ISD706H,biopsy (under fluoroscopic guidance) (0050741)...
2,0050741; ISD734H,biopsy (under fluoroscopic guidance) (0050741)...
3,0050741; ISD734H; ZSD706H,biopsy (under fluoroscopic guidance) (0050741)...
4,0050741; ISD815H,biopsy (under fluoroscopic guidance) (0050741)...
...,...,...
8829,ZSD802H; ISD706H,eps with ablation [icl] (zsd802h); icd implant...
8830,ZSD802H; ISD716H,eps with ablation [icl] (zsd802h); perm. cardi...
8831,ZSD802H; ZSD706H,eps with ablation [icl] (zsd802h); biventricul...
8832,ZSD810H,cor. angio with angioplasty - simple [icl]


In [115]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248800 entries, 0 to 276861
Data columns (total 53 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   OPERATION_ID                   248800 non-null  float64       
 1   LOCATION                       248800 non-null  object        
 2   ROOM                           248800 non-null  object        
 3   CASE_STATUS                    248800 non-null  object        
 4   OPERATION_TYPE                 248800 non-null  object        
 5   EMERGENCY_PRIORITY             248800 non-null  object        
 6   PLANNED_PATIENT_CALL_TIME      248800 non-null  datetime64[ns]
 7   PLANNED_PATIENT_FETCH_TIME     248800 non-null  datetime64[ns]
 8   PLANNED_RECEPTION_IN_TIME      248800 non-null  datetime64[ns]
 9   PLANNED_ENTER_OR_TIME          248800 non-null  datetime64[ns]
 10  PLANNED_ANAESTHESIA_INDUCTION  248800 non-null  datetime64[ns]
 11  PLANN

### Inspect Surgical Code
Extension of NATURE.

#### UVA

In [116]:
inspect_column(data, "SURGICAL_CODE", top_n=30)

Column: SURGICAL_CODE
Unique values: 8834

Top value counts:
SURGICAL_CODE
SF701I            20528
SF702C            16697
SF702C; SF701I    15451
SIC004             6529
SP834U             6421
SA811S             5719
ISD811H            5562
SL809L             4445
SL808L             4345
ZSD810H            3425
XSD722V            3188
LL808L             3147
LL809L             3011
ISD714H            2379
SD707A             1798
SA701S             1793
SH808P             1762
SF710C             1724
SB810K             1562
ISD810H            1468
SF718B             1275
SA702S             1192
XSF706A            1178
SA841S             1167
SD812H             1137
XSD721V            1083
SA803S             1048
SF708B             1019
ISD715H            1013
SL801V             1008
Name: count, dtype: int64


### Inspect discipline
No problems.

#### UVA

In [117]:
inspect_column(data, "DISCIPLINE", top_n=30)

Column: DISCIPLINE
Unique values: 20

Top value counts:
DISCIPLINE
Surgery                          55007
Gastroenterology & Hepatology    35531
Obstetrics & Gynaecology         23876
Orthopaedic Surgery              22765
Ophthalmology                    21727
Cardiac                          19218
Radiology                        15320
CardiacThoracic&Vascular Surg    14102
Hand&Reconstructive Microsurg    11264
Otolaryngology&Head&Neck Surg     8899
Urology                           7106
Paediatric Surgery                4953
Medicine                          3247
Dental                            2962
Psychological Medicine            1265
Paediatrics                        809
Anaesthesia                        598
Transplant                         130
Haematology - Oncology              18
Neonatology                          3
Name: count, dtype: int64


### Inspect Surgeon
No problems.

#### UVA

In [118]:
inspect_column(data, "SURGEON", top_n=30)

Column: SURGEON
Unique values: 1226

Top value counts:
SURGEON
Unknown             15312
lHwi02ZeE3K2VunU     4203
ZEFBvq6RyyUi06&+     3911
mbQ;KNz{$02aCxsz     3639
haaeD5Q>4l&5cyMs     3065
nlJsN6bxb860fuvn     3045
96T?5Mj8VSelR}Cl     2873
aWZlndEB5y?M]03P     2722
C29nEfgqQBB7xVT?     2571
nei0JobdGxcE65t6     2441
RMDcp3MhHDP7tCgg     2321
T8Qv5G#&i6Qb$&q0     2276
qzEBDv<3hxja@xlk     2263
<r7whvGE0gU6QKKt     2240
#w0VHw03M#Wz2qtv     2110
}z#JLao8aJTQxqL<     2022
50c2z5Lc&m7Z2TaK     1931
ci2iezjzZ}L[{QN6     1855
exKU23ieN7zJsGLZ     1829
M[;D[QHC?Skd}JC8     1760
k@NSCW#4VzNuD5p8     1727
R8Xq6zPliFdBfK$8     1635
Xdph7$0RR^LtG27y     1631
06eDEgGBCh{gowXU     1579
W5{6rF02PmZX+ybv     1497
CmMlPoq8055XLXsw     1470
]FlWay&b[5b[v<oj     1362
LrZPovtT5MkTGz9[     1357
y[;kyh[m]CLsJM2A     1355
7rV86GcmAbuwptu0     1331
Name: count, dtype: int64


### Inspect ANAESTHETIST_TEAM
No problems.

#### UVA

In [119]:
inspect_column(data, "ANAESTHETIST_TEAM", top_n=30)

Column: ANAESTHETIST_TEAM
Unique values: 21037

Top value counts:
ANAESTHETIST_TEAM
Unknown                                                             124759
d053551ad4c28a2fb932ba43e6b1d113c638a50ef979df413d055fbd5f0f3192      2269
90697bd90f1e98133b3baf52d7d8bad94fae0f82182ccbce369b231d0e9aad81      1480
c3d2524de7f2c23598e5af6961683a7e44e61bb39fa6ccc5cfe5f236baf44649      1240
b8dd61fc03d6e3a642dfb099851f01cdc80de504245f49c58129dae4b4865a5d      1009
69e7e4a0e4d5bc7eb3852b47b0a1828bdab6115e8bd860cd44c4ebc354a17707       883
5f1d6a174c8cb3052fed44ccc95e00ae34c73c4e2eda8465422877dd6b08ae2a       741
670c0eb324e240959bf85b5d91f5abee71f7af88e1cc3cb3de887ba63bf32f91       728
d209a9569b8f43d22756e5ea8b8a75df973b9a1c47e60fe798614a959c5fe576       595
4baa32452676f4c55f7a42b895c59c41af1dc17a33ea3bba597526007b766310       592
3e2583cfd30a4c8732acd737262803aaf1b765d638fb94c1fecf3302680801df       591
b321002af2aa9a977e25c26492b6d2de51847c3dbbaf7792845e09b75bb222fa       569
e51218bf6137b06a

### INSPECT ANAESTHETIST_MCR_NO
No problems.

#### UVA

In [120]:
inspect_column(data, "ANAESTHETIST_MCR_NO", top_n=30)

Column: ANAESTHETIST_MCR_NO
Unique values: 397

Top value counts:
ANAESTHETIST_MCR_NO
Unknown                                                             125063
aabd0c05e0c4e9b6bfa368698260838d288a941f1b16bd4ada4b4ef8abdd9bbc      2397
b00c197ef120c6d366079d1d280ad75fa4956a4856609b4c7b22964e76b2fe80      2051
502af80887f9dad24b15c17c60e6e4a1c4aff7055acab561e970eb9b88a26520      2019
336fdaba1f1c9345f491a00dd3271735dad3e04e2346d423e865f41bcb1c4e50      1978
392d9547cf7c381e067ba6b48329ee24d8989d89a19355f3a0da1c727532632e      1880
7bb2e45c6fd6182f126613c5312ae2cb698fce0b7be26639d2be9629b09b1c92      1856
fa8be41538144d4b7c35e1daee25a14619bed1c76feef34df97625dd971690af      1809
5a40b6fd919d12b44462644d68c104c541fbecd0086225864bcbfedbfc285a75      1808
4eeecfeb436a3c9cf98b312a0fc1e9b3d44ce3b146b86634b4cc5526e2f1a53a      1702
2111d59051e5844a4ef523b9aa4db854d2783662d557ffb169152f950a6bf3a9      1686
c45f5ec87112196e3a48f5a3d48e3e1437efa903e74ac87711aae250f0542c22      1635
4dc6309a08fde9

### INSPECT ANESTHESIA
No problems.

#### UVA

In [121]:
inspect_column(data, "ANESTHESIA", top_n=30)

Column: ANESTHESIA
Unique values: 7

Top value counts:
ANESTHESIA
GA                            103662
Sedation                       62758
LA                             52261
NIL                            19805
RA                              8375
Monitored Anaesthesia Care      1888
LA (Standby)                      51
Name: count, dtype: int64


### Inspect EQUIPMENT
Removed #NUH_ or #NUH from all entries, as well as alphabetically ordered the equipment such that even if they were in different orders, they would appear under the same unique value.

#### UVA

In [122]:
inspect_column(data, "EQUIPMENT", top_n=30)

Column: EQUIPMENT
Unique values: 520

Top value counts:
EQUIPMENT
0                                                              228479
#NUH_Image Intensifier                                           7997
Image Intensifier                                                5084
#NUH_Mini II                                                     1494
Mini II                                                          1158
#NUH_Da Vinci Robotics                                            451
Da Vinci Robotics                                                 289
#NUH_Image Intensifier; #NUH_Jackson Table                        218
Image Intensifier; Jackson Table                                  169
#NUH_Microscope                                                   153
#NUH_Hand Microscope                                              147
#NUH_Image Intensifier; #NUH_Amsco Table                          138
Microscope                                                        133
Hand Microscope         

In [123]:
synonyms = {}
data = clean_equipment(
    data,
    col="EQUIPMENT",
    tags_to_strip=(r"#nuh",),        
    unknown_vals=("0","na","n/a","-","null","nan",""),
    synonym_map=synonyms
)

# Inspect results
inspect_column(data, "EQUIPMENT", top_n=30)

Column: EQUIPMENT
Unique values: 218

Top value counts:
EQUIPMENT
0                                                                 228479
image intensifier                                                  13082
mini ii                                                             2652
da vinci robotics                                                    740
image intensifier; jackson table                                     487
amsco table; image intensifier                                       311
microscope                                                           286
hand microscope                                                      268
amsco table; image intensifier; microscope                           200
image intensifier; microscope                                        165
image intensifier; traction table                                    157
head light; image intensifier; jackson table                         136
image intensifier; jackson table; microscope              

### Inspect ADMISSION_STATUS
No problems.

#### UVA

In [124]:
inspect_column(data, "ADMISSION_STATUS", top_n=30)

Column: ADMISSION_STATUS
Unique values: 7

Top value counts:
ADMISSION_STATUS
Discharged      205708
Actual           35113
Not Admitted      7786
Planned            132
Cancelled           59
1518656227           1
1518637975           1
Name: count, dtype: int64


In [125]:
data = data[data["ADMISSION_STATUS"] != "1518656227"]
data = data[data["ADMISSION_STATUS"] != "1518637975"]

### Inspect ADMISSION_CLASS_TYPE
No problems.

#### UVA

In [126]:
inspect_column(data, "ADMISSION_CLASS_TYPE", top_n=30)

Column: ADMISSION_CLASS_TYPE
Unique values: 21

Top value counts:
ADMISSION_CLASS_TYPE
Subsidised         75849
Class C            48981
Class B2           38154
Private            17959
Class A            11691
Non Resident       10708
Not Admitted        7785
Subsidised PR       4981
Class B1            4618
Private for RF      4498
Class C for RF      3866
Private for PR      3484
Class A for RF      3321
Non Resident B1     3286
Class C for PR      3151
Class B2 for PR     2498
Class A for PR      2483
Class B1 for PR      665
Class B1 for RF      457
Class B2 for RF      363
Discharged             2
Name: count, dtype: int64


### Inspect ADMISSION_TYPE
No problems.

#### UVA

In [127]:
inspect_column(data, "ADMISSION_TYPE", top_n=30)

Column: ADMISSION_TYPE
Unique values: 13

Top value counts:
ADMISSION_TYPE
Emergency         65466
Day Surgery       64065
Endoscopy         46694
Same Day Adm.     45117
Elective inpa     13570
Not Admitted       7789
DS turn Inpat      5151
Day Surgery OP      660
Newborn             211
Transfer             60
Technical Adm        15
Class A for PR        1
Class C               1
Name: count, dtype: int64


### Inspect ADMISSION_WARD
No problems.

#### UVA

In [128]:
inspect_column(data, "ADMISSION_WARD", top_n=30)

Column: ADMISSION_WARD
Unique values: 99

Top value counts:
ADMISSION_WARD
NWASW           53806
NCKED           45002
NW2A            30165
NW63             9978
Not Admitted     9136
NW43             7516
NW51             7111
NW41             6420
NW28             5727
NW64             4999
NW52             4944
NW48             4250
NW44             4092
NW47             3546
NWICL            3436
NW9B             3245
NW7A             3050
NW6A             3048
NW54             3033
NWEDS            2698
NW6B             2518
NW9A             2453
NW55             2405
NW42             2288
NW7B             2276
NW27             1975
NW56             1721
NCZCOL           1517
NW33             1428
NW22             1198
Name: count, dtype: int64


### Inspect ADMISSION_BED
No problems.

#### UVA

In [129]:
inspect_column(data, "ADMISSION_BED", top_n=30)

Column: ADMISSION_BED
Unique values: 1612

Top value counts:
ADMISSION_BED
Not Admitted    56371
NASWC82          9195
NASWC31          7067
NEDS001          2696
NASWB01          1620
NASWB25          1424
NASWB21          1375
NASWB19          1355
NASWB22          1348
NASWB20          1339
NASWB26          1318
NASWB30          1288
NASWB15          1267
NASWB10          1258
NASWB09          1244
NASWB29          1242
NASWB16          1242
N063DL01         1240
NASWB23          1223
NASWB08          1211
NASWB14          1204
NASWB28          1203
NASWB27          1173
NASWB17          1153
NASWB11          1151
NASWB07          1096
NASWB24          1087
NASWB18          1078
NASWB13          1064
NASWB12           990
Name: count, dtype: int64


### Inspect AOH
Fixed True False.

#### UVA

In [130]:
inspect_column(data, "AOH", top_n=30)

Column: AOH
Unique values: 4

Top value counts:
AOH
False      248711
True           87
N05A006         1
N052031         1
Name: count, dtype: int64


In [131]:
data = normalize_text(data, "AOH", unknown_vals=["0", "na", "n/a", "-", "null", "nan"])
inspect_column(data, "AOH", top_n=30)

Column: AOH
Unique values: 4

Top value counts:
AOH
false      248711
true           87
n05a006         1
n052031         1
Name: count, dtype: int64


### Inspect BLOOD
No problems.

#### UVA

In [132]:
inspect_column(data, "BLOOD", top_n=30)

Column: BLOOD
Unique values: 8

Top value counts:
BLOOD
NIL               230841
Blood - 2 pint     13404
GXM                 4528
TXS                   14
Blood - 1 pint         5
Blood - 4 pint         5
False                  2
Blood - 3 pint         1
Name: count, dtype: int64


### Inspect IMPLANT
remove 'yes' remove 'x1' remove multiple white spaces, leading and trailing whitespaces and symbols.

#### UVA

In [133]:
inspect_column(data, "IMPLANT", top_n=30)

Column: IMPLANT
Unique values: 16711

Top value counts:
IMPLANT
0                                     224995
na                                       550
-                                        512
NIL                                      417
Stent                                    384
YES X1 PRESSURE WIRE AND X1 STENT.       310
mesh                                     227
YES                                      196
Mesh                                     194
STENT                                    143
NIL.                                     136
MESH                                     122
YES X1 IMPLANT.                          121
nil                                      120
Nil                                       88
YES X1 STENT.                             71
PRESSURE WIRE                             59
Smith and Nephew                          52
Camera system 1488                        51
YES X1 IMPLANT                            48
Smith & Nephew                      

In [134]:
data = normalize_text(data, "IMPLANT", unknown_vals=["0", "na", "n/a", "-", "null", "nan", "", "nil", "nil."])
data["IMPLANT"] = (
    data["IMPLANT"]
    .astype(str)
    .str.strip(" ;,.-")
    .str.replace(r"\bx\d+\b", "", regex=True)
    .str.replace(r"\byes\b", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)    
    .str.strip()
    .str.replace("&", "and", regex=False)
)
inspect_column(data, "IMPLANT", top_n=30)

Column: IMPLANT
Unique values: 15754

Top value counts:
IMPLANT
0                          226317
                              763
stent                         674
mesh                          545
pressure wire and stent       338
pressure wire                 216
implant                       191
smith and nephew              165
camera system 1488            101
valve                          97
device                         79
graft                          78
operating microscope           76
vac dressing                   72
k wires                        68
flat jackson table             66
zimmer nexgen                  56
vac                            55
phenol                         54
, valve                        53
stryker scorpio nrg            51
stryker                        44
navigation                     43
, device                       42
k wire                         41
medtronic solera               33
smith nephew                   32
arthrex           

### Inspect DIAGNOSIS
just normal standardisation

#### UVA

In [135]:
inspect_column(data, "DIAGNOSIS", top_n=30)

Column: DIAGNOSIS
Unique values: 57338

Top value counts:
DIAGNOSIS
Not Recorded                      87809
cataract                           4654
right cataract                     3605
left cataract                      3568
subfert                            3011
Subfert                            2212
End stage renal failure            1865
CORONARY ARTERY DISEASE            1457
Peripheral vascular disease        1241
Cataract                           1204
END STAGE RENAL FAILURE             909
subfertility                        875
ISCHEMIC HEART DISEASE              820
Left cataract                       771
Right cataract                      769
dental caries                       670
Left breast cancer                  627
Right breast cancer                 609
Gall bladder stones                 559
Previous LSCS                       491
Phimosis                            462
Acute appendicitis                  405
Impacted wisdom teeth               401
acute append

In [136]:
data = normalize_text(data, "DIAGNOSIS", unknown_vals=["0", "na", "n/a", "-", "null", "nan", "", "nil"])
inspect_column(data, "DIAGNOSIS", top_n=100)

Column: DIAGNOSIS
Unique values: 48477

Top value counts:
DIAGNOSIS
not recorded                    87809
cataract                         5876
subfert                          5417
right cataract                   4483
left cataract                    4439
                                ...  
prev cs                           160
supraventricular tachycardia      160
cyst                              158
gastric cancer                    158
right distal radius fracture      157
Name: count, Length: 100, dtype: int64


### Inspect CANCER_INDICATOR
just normal standardisation

#### UVA

In [137]:
inspect_column(data, "CANCER_INDICATOR", top_n=30)

Column: CANCER_INDICATOR
Unique values: 13

Top value counts:
CANCER_INDICATOR
False                                                                           243202
True                                                                              5582
CI Pinless x1 ?                                                                      4
CI Pinless x1 in house?                                                              3
HP x7  1 sets ?                                                                      1
CI Pinless x (x1 in house) ?                                                         1
Gen Spine x1-                                                                        1
CI Pinless ?                                                                         1
CI Pinless (x1 in house) ?                                                           1
METRx I Set ? 1 tray ?                                                               1
Diagnostic laparoscopy, laparoscopic cholecystectom

In [138]:
data = normalize_text(data, "CANCER_INDICATOR", unknown_vals=["0", "na", "n/a", "-", "null", "nan"])
data = data[data["CANCER_INDICATOR"].isin(["false", "true"])]
inspect_column(data, "CANCER_INDICATOR", top_n=30)

Column: CANCER_INDICATOR
Unique values: 2

Top value counts:
CANCER_INDICATOR
false    243202
true       5582
Name: count, dtype: int64


### Inspect TUMOR_INDICATOR
just normal standardisation

#### UVA

In [139]:
inspect_column(data, "TRAUMA_INDICATOR", top_n=30)

Column: TRAUMA_INDICATOR
Unique values: 2

Top value counts:
TRAUMA_INDICATOR
False    248686
True         98
Name: count, dtype: int64


In [140]:
data = normalize_text(data, "TRAUMA_INDICATOR", unknown_vals=["0", "na", "n/a", "-", "null", "nan"])
inspect_column(data, "TRAUMA_INDICATOR", top_n=30)


Column: TRAUMA_INDICATOR
Unique values: 2

Top value counts:
TRAUMA_INDICATOR
false    248686
true         98
Name: count, dtype: int64


### Inspect Delay_Reason
No problems.

#### UVA

In [141]:
inspect_column(data, "Delay_Reason", top_n=30)

Column: Delay_Reason
Unique values: 5199

Top value counts:
Delay_Reason
0                                                                              200474
Admin (e.g. registration and etc.)                                               9038
Surgeon (e.g. Surgeon not available & etc.)                                      5165
Resequencing of cases                                                            4273
MCOT End of AM Elective List: 830am-1pm (Last scheduled AM case of the day)      3418
e case                                                                           3208
OR not ready (Cleaning, setting up OT table / equipment)                         1970
Patient (e.g. new medical condition, URTI, cancellation)                         1371
Pre-Op (e.g. no consent, investigation, blood , Xray)                            1293
emergency case                                                                   1100
Anaesthetist (e.g. Anaesthetist not available & etc.)              

### Inspect Remarks
No problems.

#### UVA

In [142]:
inspect_column(data, "Remarks", top_n=30)

Column: Remarks
Unique values: 8762

Top value counts:
Remarks
0                          225936
e case                       3684
emergency case               1234
no delay                      873
emergency                     581
ecase                         441
E case                        341
e-case                        329
EOT                           298
eot                           282
different surgeon             282
.                             265
e case                        255
no delays                     184
na                            170
E CASE                        143
eot case                      135
aoh                           134
AOH                            95
Emergency case                 80
em case                        79
p1                             75
additional case                73
others                         66
pm list                        65
nil                            60
different surgical team        59
e list             

Save file from cleaning steps above into a seperate file (change file name if required)

In [143]:
output_path = "oots-data-cleaning-1.xlsx"
data.to_excel(output_path, index=False)

print(f"Saved to {output_path}")

Saved to oots-data-cleaning-1.xlsx


## EMMAS PART

Find the highest frequency of words, bigrams, and trigrams to be used in taxonomy for categorisation

In [144]:
if "Delay_Reason" not in data.columns:
    raise KeyError(f"'Delay Reason' not found. Available columns: {list(data.columns)}")

data.head()

Unnamed: 0,OPERATION_ID,LOCATION,ROOM,CASE_STATUS,OPERATION_TYPE,EMERGENCY_PRIORITY,PLANNED_PATIENT_CALL_TIME,PLANNED_PATIENT_FETCH_TIME,PLANNED_RECEPTION_IN_TIME,PLANNED_ENTER_OR_TIME,...,AOH,BLOOD,IMPLANT,DIAGNOSIS,CANCER_INDICATOR,TRAUMA_INDICATOR,Delay_Reason,Remarks,planned_valid,actual_valid
0,588456.0,Main Building OT,MBOR11,Final,Elective,0,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 09:50:00,...,False,NIL,required microscope,right breast ca,False,False,Surgeon (e.g. Surgeon not available & etc.),0,True,True
1,590736.0,Main Building OT,MBOR05,Final,Elective,0,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:40:00,...,False,NIL,0,early pregnancy failure,False,False,0,0,True,True
2,591995.0,ICL,RoomC,Actualised,Elective,0,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 10:55:00,...,False,NIL,0,not recorded,False,False,0,0,True,True
3,590451.0,Main Building OT,MBOR04,Final,Elective,0,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,...,False,NIL,need eustachian tube ballon,eustachian tube disorder,False,False,0,0,True,True
4,573666.0,Medical Center OT,MCOR03,Final,Elective,0,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:30:00,...,False,NIL,0,gall bladder stone,False,False,first case havent finished,first case havent finished,True,True


Normalize text in Delay_Reason (remove punctuation, standardise case, remove trialing spaces)

In [145]:
_punct_tbl = str.maketrans("", "", string.punctuation)

def normalize_text(s: str) -> str:

    s = str(s).lower()
    s = s.translate(_punct_tbl)
    s = re.sub(r"\s+", " ", s).strip()

    s = re.sub(r"\bo\.t\b", "operating theater", s)
    s = re.sub(r"\bot\b", "operating theater", s)
    s = re.sub(r"\bo\.r\b", "operating room", s)
    s = re.sub(r"\banaesth\b", "anaesthesia", s)
    s = re.sub(r"\banesth\b", "anaesthesia", s)
    s = re.sub(r"\bpt\b", "patient", s)
    s = re.sub(r"\bprev\b", "previous", s)
    s = re.sub(r"\bdr\b", "doctor", s)
    s = re.sub(r"\bpre-med\b", "premedication", s)

    return s

# Apply normalization
data["_Delay_norm"] = data["Delay_Reason"].astype(str).fillna("").map(normalize_text)
data[["_Delay_norm"]].head(10)

Unnamed: 0,_Delay_norm
0,surgeon eg surgeon not available etc
1,0
2,0
3,0
4,first case havent finished
5,surgeon eg surgeon not available etc
6,e case
7,0
8,0
9,0


In [146]:
STOPWORDS = {
    "the","a","an","and","or","to","of","for","by","with","from",
    "is","are","was","were","be","been","being","due","because",
    "this","that","it","as","into","per","via", "eg", "etc"
}

# Initialize containers
words, bigrams, trigrams = [], [], []

# Tokenize each delay reason
for text in data["_Delay_norm"]:
    tokens = [t for t in text.split() if t and t not in STOPWORDS]
    if not tokens:
        continue

    words.extend(tokens)
    if len(tokens) >= 2:
        bigrams.extend([" ".join(tokens[i:i+2]) for i in range(len(tokens)-1)])
    if len(tokens) >= 3:
        trigrams.extend([" ".join(tokens[i:i+3]) for i in range(len(tokens)-2)])

In [147]:
OUTPUT_FILE = "oots-data-cleaning-3-flagged.xlsx"
OUTPUT_CSV  = "oots-data-cleaning-3-flagged.csv"

COL = "Delay_Reason"
s = data[COL].astype(str)

clean = (
    s.str.lower()
     .str.replace(r"[^\w\s]", "", regex=True)    
     .str.replace(r"\s+", " ", regex=True)       
     .str.strip()
)

raw = s.str.strip()
only_punct_or_numbers = raw.str.match(r'^(?=.*\S)(?!.*[A-Za-z]).*$', na=False)

data.loc[only_punct_or_numbers, COL] = "0"

not_late_phrases = [
    "no delay", "not delay", "not delayed", "not late",
    "na", "0", "null", "nan"
]

def phrase_to_token_pattern(p: str) -> str:
    p = p.strip().lower()
    esc = re.escape(p).replace(r"\ ", r"\s+")
    return rf"(?<!\w){esc}(?!\w)"

pattern = r"(?:{})".format("|".join(phrase_to_token_pattern(p) for p in not_late_phrases))
regex = re.compile(pattern, flags=re.IGNORECASE)

phrase_hit = clean.str.contains(regex, na=False)

data["Reason_Is_Late"] = np.where(only_punct_or_numbers | phrase_hit, 0, 1)

data[COL] = clean
data.loc[only_punct_or_numbers, COL] = "0"
data.drop(columns=COL, inplace=True)


## Adding target variables

In [148]:
# --- Surgery duration (knife → closure) ---
data["ACTUAL_SURGERY_DURATION"]  = data["ACTUAL_SKIN_CLOSURE"]  - data["ACTUAL_KNIFE_TO_SKIN_TIME"]
data["PLANNED_SURGERY_DURATION"] = data["PLANNED_SKIN_CLOSURE"] - data["PLANNED_KNIFE_TO_SKIN_TIME"]
data["DIFF_SURGERY_DURATION"]    = data["ACTUAL_SURGERY_DURATION"] - data["PLANNED_SURGERY_DURATION"]

# --- OR usage duration (enter OR → exit OR) ---
data["ACTUAL_USAGE_DURATION"]  = data["ACTUAL_EXIT_OR_TIME"]  - data["ACTUAL_ENTER_OR_TIME"]
data["PLANNED_USAGE_DURATION"] = data["PLANNED_EXIT_OR_TIME"] - data["PLANNED_ENTER_OR_TIME"]
data["DIFF_USAGE_DURATION"]    = data["ACTUAL_USAGE_DURATION"] - data["PLANNED_USAGE_DURATION"]

# --- Start delays for enter OR and incision timings and leave OR (actual minus planned) ---
data["ENTER_START_DELAY"] = data["ACTUAL_ENTER_OR_TIME"] - data["PLANNED_ENTER_OR_TIME"]
data["KNIFE_START_DELAY"] = data["ACTUAL_KNIFE_TO_SKIN_TIME"] - data["PLANNED_KNIFE_TO_SKIN_TIME"]
data["EXIT_OR_DELAY"] = data["ACTUAL_EXIT_OR_TIME"] - data["PLANNED_EXIT_OR_TIME"]


converting new target variables to minutes

In [149]:
to_min = lambda s: s.dt.total_seconds() / 60
for col in [
    "ACTUAL_SURGERY_DURATION","PLANNED_SURGERY_DURATION","DIFF_SURGERY_DURATION",
    "ACTUAL_USAGE_DURATION","PLANNED_USAGE_DURATION","DIFF_USAGE_DURATION",
    "ENTER_START_DELAY","KNIFE_START_DELAY","EXIT_OR_DELAY"
]:
    data[col] = to_min(data[col])

data.head()

Unnamed: 0,OPERATION_ID,LOCATION,ROOM,CASE_STATUS,OPERATION_TYPE,EMERGENCY_PRIORITY,PLANNED_PATIENT_CALL_TIME,PLANNED_PATIENT_FETCH_TIME,PLANNED_RECEPTION_IN_TIME,PLANNED_ENTER_OR_TIME,...,Reason_Is_Late,ACTUAL_SURGERY_DURATION,PLANNED_SURGERY_DURATION,DIFF_SURGERY_DURATION,ACTUAL_USAGE_DURATION,PLANNED_USAGE_DURATION,DIFF_USAGE_DURATION,ENTER_START_DELAY,KNIFE_START_DELAY,EXIT_OR_DELAY
0,588456.0,Main Building OT,MBOR11,Final,Elective,0,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 09:50:00,...,1,347.0,410.0,-63.0,402.0,455.0,-53.0,74.0,84.0,21.0
1,590736.0,Main Building OT,MBOR05,Final,Elective,0,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:40:00,...,0,10.0,30.0,-20.0,31.0,45.0,-14.0,21.0,20.0,7.0
2,591995.0,ICL,RoomC,Actualised,Elective,0,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 10:55:00,...,0,19.0,45.0,-26.0,26.0,60.0,-34.0,3.0,-8.0,-31.0
3,590451.0,Main Building OT,MBOR04,Final,Elective,0,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,...,0,112.0,150.0,-38.0,149.0,180.0,-31.0,8.0,7.0,-23.0
4,573666.0,Medical Center OT,MCOR03,Final,Elective,0,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:30:00,...,1,113.0,90.0,23.0,156.0,120.0,36.0,28.0,34.0,64.0


## EMMA'S PART #2

In [150]:
PLANNED_COL = "PLANNED_KNIFE_TO_SKIN_TIME"
# Accept common typo variants for ACTUAL
ACTUAL_COL = "ACTUAL_KNIFE_TO_SKIN_TIME"

if PLANNED_COL not in data.columns:
    raise KeyError(f"'{PLANNED_COL}' not found. Available columns: {list(data.columns)}")
if ACTUAL_COL not in data.columns:
    raise KeyError(f"'{ACTUAL_COL}' not found. Available columns: {list(data.columns)}")

NORM_COL = "_Delay_norm"
CAT_COL     = "Delay_Category"

In [151]:
# ---------- DATETIME PARSER ----------
def _parse_dt(series: pd.Series) -> pd.Series:
    """Robust parser: handles datetime dtype, Excel serials, and DD/MM/YYYY HH:MM[:SS] strings."""
    s = series
    if np.issubdtype(s.dtype, np.datetime64):
        return s
    if np.issubdtype(s.dtype, np.number):  # Excel serials
        return pd.to_datetime(s, unit="D", origin="1899-12-30", errors="coerce")
    # Strings like DD/MM/YYYY HH:MM or HH:MM:SS
    return pd.to_datetime(s, errors="coerce", dayfirst=True)

In [152]:
# ---------- TEXT NORMALIZATION (with spaCy assist) ----------
try:
    import spacy
    try:
        nlp = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
    except Exception:
        nlp = spacy.blank("en")
except Exception:
    nlp = None

_punct_tbl = str.maketrans("", "", string.punctuation)

def normalize_text_basic(s: str) -> str:
    s = str(s).lower()
    s = s.translate(_punct_tbl)
    s = re.sub(r"\s+", " ", s).strip()
    s = re.sub(r"\bo\.t\b", "operating theater", s)
    s = re.sub(r"\bot\b", "operating theater", s)
    s = re.sub(r"\bo\.r\b", "operating room", s)
    s = re.sub(r"\banaesth\b", "anaesthesia", s)
    s = re.sub(r"\banesth\b", "anaesthesia", s)
    s = re.sub(r"\bpt\b", "patient", s)
    s = re.sub(r"\bprev\b", "previous", s)
    s = re.sub(r"\bdr\b", "doctor", s)
    s = re.sub(r"\bpre ?med\b", "premedication", s)
    s = re.sub(r"\baoh\b", "after office hours", s)
    s = re.sub(r"\bem(er|urg)\w*\b", "emergency", s)
    return s

CANONICAL_TOKENS = {
    "emergency": {"emer", "emerg", "emrgncy", "emeegency", "emeergency", "emegency", "emrgency", "emerhency"},
    "anaesthetist": {"anesthetist", "anaethetist", "anesthethist", "anaesthsist", "anaesteist", "anaesthist"},
    "anaesthesia": {"aneasthesia", "anaestheisa", "anesthesia", "aneastheisa"},
    "transfer": {"tranfer", "transfered", "transferd", "transfred", "transfre", "trasfer", "trasferred", "transferred"},
    "scheduled": {"sched", "scheduel", "schelude", "sheduled", "shedulled", "secheduled", "schedulled"},
    "radiographer": {"radiografer", "radiogapher"},
    "interpreter": {"interperter", "interprter", "intrerpreter", "translater"},
    "porter": {"portel", "proter", "nurseporter"},
    "theatre": {"theater", "theartre", "thaetre"},
    "operating": {"op", "operation", "operating"},
    "room": {"or", "ot"},
    "checklist": {"check list", "checlist", "checklits"},
    "assessment": {"assesment", "assestment", "assesmnet"},
}
FUZZY_THRESHOLD = 0.85

def fuzzy_canonicalize_token(tok: str) -> str:
    if not tok or tok.isdigit():
        return tok
    best = tok
    best_ratio = 0.0
    for canonical, hints in CANONICAL_TOKENS.items():
        r = difflib.SequenceMatcher(None, tok, canonical).ratio()
        if r > best_ratio:
            best_ratio, best = r, (canonical if r >= FUZZY_THRESHOLD else best)
        for h in hints:
            r2 = difflib.SequenceMatcher(None, tok, h).ratio()
            if r2 > best_ratio:
                best_ratio, best = r2, (canonical if r2 >= FUZZY_THRESHOLD else best)
    return best

def enhance_norm(text: str) -> str:
    s = normalize_text_basic(text)
    if nlp is None:
        return s
    doc = nlp(s)
    toks = []
    for t in doc:
        if t.is_space:
            continue
        lemma = (t.lemma_ or t.text).lower().strip()
        lemma = re.sub(r"\s+", "", lemma)
        toks.append(fuzzy_canonicalize_token(lemma))
    out = " ".join(toks)
    out = re.sub(r"\s+", " ", out).strip()
    return out

# Build normalized text column from _Delay_norm
data[NORM_COL] = data["_Delay_norm"].astype(str).fillna("").map(enhance_norm)


In [153]:
# ---------- IS_LATE FROM PLANNED VS ACTUAL ----------
kp = _parse_dt(data[PLANNED_COL])
ka = _parse_dt(data[ACTUAL_COL])
delta_min = (ka - kp).dt.total_seconds() / 60.0
data["Is_Late"] = (delta_min > 0).astype(int)  # actual later than planned

In [154]:
# ---------- TAXONOMY & CLASSIFICATION ----------
taxonomy_data = [
  {"category": "Priority Case/Emergency", "patterns": [
      r"\bem(er|urg)\w*\b",
      r"\be\s?case\b|\be\s?op\b|\be\s?ot\b|\be\s?list\b",
      r"\b(crash\s*)?lscs\b|\bel?scs\b|\becabg\b|\bec case\b|\be cs\b|\bec?\s?case\b",
      r"\bicu( case)?\b|\bpicu( case)?\b|\bmicu( case)?\b",
      r"\bp\s?0\b|\bp\s?1\b", r"p1", r"p2", r"\se\s", r"emergency"
  ]},
  {"category": "Scheduling/Venue Changes", "patterns": [
      r"^\s*from\b", r"\bfrom (or|ot|m?cor|krwot|krwor|mcot)\b.*", r"\bcase (?:from|is from)\b",
      r"\btransf\w*\b", r"\b(transfer|shift) to (another|other) or\b|\boperation from another or\b",
      r"\b(add\s*on|addon|add in) case(s)?\b|\badded case(s)?\b|\bad ?case\b", r"\badditional( case)?\b",
      r"\brescheduled cases?\b|\bre scheduling\b|\brearranging of case\b|\bre ?order\b", r"\bsequence\b|\bre[- ]?sequenc\w*\b|\bswap( case| theater| theatre)?\b",
      r"\b(change|re ?arranged|reshuffle|merged)\b.*\blist\b",
      r"\bchange (of )?or\b|\bchange slot\b|\bchange case\b",
      r"\bcase list(ing)? (after|is pm)\b|\bover list\b|\boverl(apping|ap) cases\b", r"tranfer"
  ]},
  {"category": "Anaesthetic", "patterns": [
      r"\banae?sthe?t(i|e)st\w*\b|\banes\b|\banaesthetic\b|\banae?sthe?si\w*\b",
      r"\b(waiting|awaiting).{0,25}anae?sthe?t\w*\b|\banae?sthe?t\w* (review(ing)?|to review)\b",
      r"\bspinal\b.*\bpacu\b|\bblock\b.*\b(pacu|recovery)\b",
      r"\bpacu is full\b|\banaesthetics take over\b|\banaest do(ing)? (her )?pre ?assess?ment\b",
      r"anesthetist", r"anaesthetic"
  ]},
  {"category": "Administration", "patterns": [
      r"\bpre[- ]?op\b.*\b(assessment|checklist)\b|\bpre[- ]?assessment( not done)?\b|\basses?ment( needed| to be done| not done)?\b",
      r"\bcheck ?list( locked| was not done| not done)?\b|\bchecklist to be done\b",
      r"\bregistration\b|\breception\b|\bscan at reception\b|\bbooking\b",
      r"\bawaiting marking\b|\bsite marking not done\b|\bop site (not mark(ed)?|marking)\b",
      r"\bfinancial counse?ll?ing( not done| issue)?\b", r"\broll call\b", r"consent"
  ]},
  {"category": "Imaging/Labs", "patterns": [
      r"\bblood\b|\bhypocount\b", r"\blab result(s)?\b|\becg( result)?\b|\bscan\b|\bultrasound\b",
      r"\bradiology|x ?ray|xray|ct|mri|imaging (pending|delay|not ready)\b",
      r"\bcross ?match|gxm\b",
      r"\bplatelet|ffp|prbc|cryoprecipitate\b",
      r"\bradiographer (not available|awaiting)\b|\bawaiting radiocolloid\b"
  ]},
  {"category": "Pharmacy/Medication", "patterns": [
      r"\bpharmacy\b|\bmedication\b|\bpre ?med(s|ication)?\b",
      r"\bwaiting .* (pharmacy|medication|mitomycin)\b|\bmitomycin( c)? (not ready|to be ready|waiting)\b"
  ]},
  {"category": "Equipment", "patterns": [
      r"\bmachine\b|\bequipment(s)?\b|\bequipment (fault|failure|breakdown|not (ready|available))\b",
      r"\b(cautery|microscope|drill|scope|laparoscope) (issue|fault|broken|failure|problem)\b",
      r"\b(endoscop(y|ic)|endo) (not ready|case)\b",
      r"\b(sets?|set up) not ready\b|\bcheck diathermy esu\b",
      r"\b(change|changing) (ot )?table\b|\btable\b",
      r"\brfid\b|\brobot\b|\bimage intensifier not available\b",
      r"\bpace ?maker( testing| reset)?\b|\bcardiac tech to check pacemaker\b"
  ]},
  {"category": "Room/Facilities", "patterns": [
      r"^\s*(or|ot)\b|\boperating room\b|\btheatre\b|\btheater\b",
      r"\b(or|ot|theatre) not ready\b|\bclean(ing)? (the )?(or|ot|theatre|theater)\b|\bcleaning in progress\b|\bcleaning process\b|\bcleaning very messy\b",
      r"\b(power|electric|light|temperature|aircon|hvac) (issue|fault|failure|problem)\b",
      r"\bsetting\b|\bopsite\b"
  ]},
  {"category": "Preparation", "patterns": [
      r"\bturn\s*over\b|\bturning over\b|\bturing over\b|\bturn ?over\b",
      r"\bwash(ing)?\s*(or|ot|theatre|theater)\b|\bneed to wash (or|ot)\b",
      r"\bprepar(e|ing) (or|ot|theatre|theater)\b|\bpreparing the (ot|theatre|theater)\b",
      r"\bpreparing (bronchoscope|for nivats|for triplets)\b",
      r"\broom\b|\bwarming\b|\bsets not ready\b|\bset up line\b",
      r"bed", r"cleaning", r"washing"
  ]},
  {"category": "Transport/Portering", "patterns": [
      r"\bporter( not available| unavailable| delay| late)?\b|\bno porter( available)?\b",
      r"\btransport(ation)?( delay| unavailable| late)?\b|\bsend wrong transportation\b",
      r"\b(fetch|fetching)\b|\bsend and fetch\b"
  ]},
  {"category": "Patient Factors", "patterns": [
      r"\bpatient(s)?\b|(?<!previous )\bpt(s)?\b|\bpate?i?nt\b",
      r"\bfasting\b|\bnot fasted enough\b",
      r"\bparent(s)?\b|\bfather\b|\bmother\b|\bfamily\b|\bbaby\b|\bneonate(s)?\b",
      r"\b(hd|shd) bed (not available|confirmation|arrangement)\b|\bawaiting (hd|shd) bed\b|\bawaiting h[di] bed\b",
      r"\bpaed(s|iatric)?\b|\bpead(s)?\b",
      r"\binterpreter\b|\btranslation|translator\b|\bwaiting interpreter\b",
      r"\b(pupil|eye) (not )?dilated\b|\bawaited eye dilation\b",
      r"\binpatient\b|\bisolation case\b|\bspecial child\b|\blife threatening\b",
      r"\bparents needed some clarifications\b",
      r"waiting for the last case"
  ]},
  {"category": "Other Case Issues", "patterns": [
      r"\b(1|1st|first|2|2nd|second|2and|2en|3|3rd|third|5th)\s*case\b",
      r"\b(1st|2nd|3rd) case (started|finish(ed)?|ended) late\b|\b(1st|2nd) one (cancelled|delayed|finished|completed)\b",
      r"\b(am|morning) (list|session) (done|ended|ends late|over ?run|finished|fanished|finshed|og over run)\b",
      r"\bpm (list|case|session|cases)\b",
      r"\bcase (over ?run|overrun|ended late|finish(ed)? late|delay(ed)?)\b",
      r"\bend of am (case|list|session|shift)\b|\bnext case\b|\bnxt case\b",
      r"\bcase just (book|listed|finish)\b|\bjust finished (another|the other e) case\b",
      r"\bcase sent late\b|\blist (delayed|started later than schedule)\b",
      r"previous", r"previous case", r"previous op", r"afternoon list", r"afternoon case"
  ]},
  {"category": "Timing", "patterns": [
      r"\b(list(ed)?|schedule(d)?|start(s|ed)?) at\b\s*\d{3,4}(am|pm|hrs)?\b",
      r"\bcase (at|is) \d{3,4}(am|pm|hrs)?\b|\boperation (time|at) (is )?\d{3,4}(am|pm|hrs)?\b",
      r"\bop at \d{3,4}(am|pm|hrs)?\b",
      r"\b(op|case) (sched(?:uled|ulled|uled|ule|uled)|schelude|sheduled|shedulled|secheduled|sched) (at|after)\b\s*\d{3,4}(am|pm|hrs)?\b",
      r"\bcase list(ing)? (after|is pm)\b",
      r"case booked", r"case listed", r"case scheduled",
      r"case start", r"case starts", r"op start", r"op starts",
      r"operation list", r"operation scheduled", r"operation time", r"scheduled"
  ]},
  {"category": "Surgeon/Staff", "patterns": [
      r"\bsurgeon(s)?\b|\bdifferent (surgical )?team\b|\bdifferent surgeons?\b|\bchange of surgical team\b",
      r"\bteam\b|\bnurses?\b|\bstaff (shortage|insufficient|not available|unavailable)\b",
      r"\bawait(?:ing)? (?:for )?surgeon\b|\bawait(?:ing)? for surgeons? assessment\b",
      r"\b(nurse|scrub|circulating|tech|porter) (late|unavailable|shortage|missing)\b"
  ]},
]

COMPILED_TAXONOMY = [
    (entry["category"], [re.compile(p, re.IGNORECASE) for p in entry.get("patterns", [])])
    for entry in taxonomy_data
]

def classify_late_reason(text_norm: str) -> str:
    for cat, pats in COMPILED_TAXONOMY:
        for pat in pats:
            if pat.search(text_norm or ""):
                return cat
    return "Unspecified (late)"

In [155]:
# --- Ensure Delay_Category column exists ---
if CAT_COL not in data.columns:
    data[CAT_COL] = None  # start with empty values

# --- Initial assignment ---
# Non-late cases (Is_Late = 0) → "No Delay"
data.loc[data["Is_Late"] == 0, CAT_COL] = "No Delay"

# Late cases (Is_Late = 1) → classify by taxonomy from _Delay_norm
late_mask = data["Is_Late"] == 1
data.loc[late_mask, CAT_COL] = [
    classify_late_reason(t) for t in data.loc[late_mask, NORM_COL]
]

# --- Safety correction (if any late rows still wrongly "No Delay") ---
cat_lower = data[CAT_COL].astype(str).str.strip().str.lower()
mask_incorrect = (data["Is_Late"] == 1) & (cat_lower == "no delay")
if mask_incorrect.any():
    print(f"Fixing {mask_incorrect.sum()} late rows incorrectly marked 'No Delay'.")
    data.loc[mask_incorrect, CAT_COL] = [
        classify_late_reason(t) for t in data.loc[mask_incorrect, NORM_COL]
    ]

# Quick peek
print(data[[PLANNED_COL, ACTUAL_COL, "Is_Late", NORM_COL, CAT_COL]].head(10))

  PLANNED_KNIFE_TO_SKIN_TIME ACTUAL_KNIFE_TO_SKIN_TIME  Is_Late  \
0        2019-04-11 10:20:00       2019-04-11 11:44:00        1   
1        2019-04-11 10:50:00       2019-04-11 11:10:00        1   
2        2019-04-11 11:10:00       2019-04-11 11:02:00        0   
3        2019-04-11 11:10:00       2019-04-11 11:17:00        1   
4        2019-04-11 10:50:00       2019-04-11 11:24:00        1   
5        2019-04-11 10:16:00       2019-04-11 11:28:00        1   
6        2019-04-11 10:24:00       2019-04-11 11:12:00        1   
7        2019-04-11 10:40:00       2019-04-11 10:48:00        1   
8        2019-04-11 10:30:00       2019-04-11 10:44:00        1   
9        2019-04-11 11:30:00       2019-04-11 10:41:00        0   

                            _Delay_norm           Delay_Category  
0  surgeon eg surgeon not available etc            Surgeon/Staff  
1                                     0       Unspecified (late)  
2                                     0                 No De

In [156]:
# Assign Delay_Category based on Is_Late
late_mask = data["Is_Late"] == 1
out_cat = pd.Series("No Delay", index=data.index)
out_cat.loc[late_mask] = [classify_late_reason(t) for t in data.loc[late_mask, NORM_COL]]
data[CAT_COL] = out_cat

In [157]:
def inspect_column(df, col, top_n=20): #Get Unique Value information on column in df
    print(f"Column: {col}")
    print(f"Unique values: {df[col].nunique(dropna=False)}")
    print("\nTop value counts:")
    print(df[col].value_counts(dropna=False).head(top_n))

In [158]:
inspect_column(data, "Delay_Category", top_n=30)

Column: Delay_Category
Unique values: 16

Top value counts:
Delay_Category
Unspecified (late)          122708
No Delay                     93650
Administration                8228
Priority Case/Emergency       6974
Surgeon/Staff                 4353
Imaging/Labs                  3878
Scheduling/Venue Changes      2816
Equipment                     2334
Patient Factors               1633
Anaesthetic                   1047
Other Case Issues              457
Room/Facilities                306
Timing                         179
Transport/Portering            145
Preparation                     60
Pharmacy/Medication             16
Name: count, dtype: int64


## KATELYN'S PART

In [159]:
# Correcting emergency priority codes

data["EMERGENCY_PRIORITY"] = data["EMERGENCY_PRIORITY"].replace({
    "P2": "P2B",
    "P3": "P3B",
    "P3a": "P3A",
    "P3b": "P3B"
})

In [160]:
inspect_column(data, "EMERGENCY_PRIORITY", top_n=30)

Column: EMERGENCY_PRIORITY
Unique values: 7

Top value counts:
EMERGENCY_PRIORITY
0      213871
P2B     21121
P1       8306
P0       2088
P2A      2085
P3B       974
P3A       339
Name: count, dtype: int64


In [164]:
cols = [
    "OPERATION_ID","ROOM","DISCIPLINE",
    "PLANNED_ENTER_OR_TIME","ACTUAL_ENTER_OR_TIME","ENTER_START_DELAY","ENTER_START_DELAY_MIN",
    "PLANNED_KNIFE_TO_SKIN_TIME","ACTUAL_KNIFE_TO_SKIN_TIME","KNIFE_START_DELAY","KNIFE_START_DELAY_MIN",
    "PLANNED_EXIT_OR_TIME","ACTUAL_EXIT_OR_TIME","EXIT_OR_DELAY","EXIT_OR_DELAY_MIN",
]
display_cols = [c for c in cols if c in data.columns]
data.loc[:, display_cols].head(10)


Unnamed: 0,OPERATION_ID,ROOM,DISCIPLINE,PLANNED_ENTER_OR_TIME,ACTUAL_ENTER_OR_TIME,ENTER_START_DELAY,PLANNED_KNIFE_TO_SKIN_TIME,ACTUAL_KNIFE_TO_SKIN_TIME,KNIFE_START_DELAY,PLANNED_EXIT_OR_TIME,ACTUAL_EXIT_OR_TIME,EXIT_OR_DELAY
0,588456.0,MBOR11,Surgery,2019-04-11 09:50:00,2019-04-11 11:04:00,74.0,2019-04-11 10:20:00,2019-04-11 11:44:00,84.0,2019-04-11 17:25:00,2019-04-11 17:46:00,21.0
1,590736.0,MBOR05,Obstetrics & Gynaecology,2019-04-11 10:40:00,2019-04-11 11:01:00,21.0,2019-04-11 10:50:00,2019-04-11 11:10:00,20.0,2019-04-11 11:25:00,2019-04-11 11:32:00,7.0
2,591995.0,RoomC,Cardiac,2019-04-11 10:55:00,2019-04-11 10:58:00,3.0,2019-04-11 11:10:00,2019-04-11 11:02:00,-8.0,2019-04-11 11:55:00,2019-04-11 11:24:00,-31.0
3,590451.0,MBOR04,Otolaryngology&Head&Neck Surg,2019-04-11 10:50:00,2019-04-11 10:58:00,8.0,2019-04-11 11:10:00,2019-04-11 11:17:00,7.0,2019-04-11 13:50:00,2019-04-11 13:27:00,-23.0
4,573666.0,MCOR03,Surgery,2019-04-11 10:30:00,2019-04-11 10:58:00,28.0,2019-04-11 10:50:00,2019-04-11 11:24:00,34.0,2019-04-11 12:30:00,2019-04-11 13:34:00,64.0
5,591396.0,KRWOR7,Orthopaedic Surgery,2019-04-11 10:06:00,2019-04-11 10:48:00,42.0,2019-04-11 10:16:00,2019-04-11 11:28:00,72.0,2019-04-11 11:26:00,2019-04-11 13:01:00,95.0
6,592028.0,MBOR08,Surgery,2019-04-11 09:54:00,2019-04-11 10:40:00,46.0,2019-04-11 10:24:00,2019-04-11 11:12:00,48.0,2019-04-11 10:54:00,2019-04-11 12:13:00,79.0
7,580798.0,ENDO1,Gastroenterology & Hepatology,2019-04-11 10:30:00,2019-04-11 10:36:00,6.0,2019-04-11 10:40:00,2019-04-11 10:48:00,8.0,2019-04-11 11:20:00,2019-04-11 11:33:00,13.0
8,591033.0,KRWOR1,Obstetrics & Gynaecology,2019-04-11 10:30:00,2019-04-11 10:34:00,4.0,2019-04-11 10:30:00,2019-04-11 10:44:00,14.0,2019-04-11 10:40:00,2019-04-11 10:52:00,12.0
9,584452.0,MCOR05,Ophthalmology,2019-04-11 11:15:00,2019-04-11 10:32:00,-43.0,2019-04-11 11:30:00,2019-04-11 10:41:00,-49.0,2019-04-11 11:45:00,2019-04-11 11:07:00,-38.0


In [161]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 248784 entries, 0 to 276861
Data columns (total 65 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   OPERATION_ID                   248784 non-null  float64       
 1   LOCATION                       248784 non-null  object        
 2   ROOM                           248784 non-null  object        
 3   CASE_STATUS                    248784 non-null  object        
 4   OPERATION_TYPE                 248784 non-null  object        
 5   EMERGENCY_PRIORITY             248784 non-null  object        
 6   PLANNED_PATIENT_CALL_TIME      248784 non-null  datetime64[ns]
 7   PLANNED_PATIENT_FETCH_TIME     248784 non-null  datetime64[ns]
 8   PLANNED_RECEPTION_IN_TIME      248784 non-null  datetime64[ns]
 9   PLANNED_ENTER_OR_TIME          248784 non-null  datetime64[ns]
 10  PLANNED_ANAESTHESIA_INDUCTION  248784 non-null  datetime64[ns]
 11  PLANN

In [162]:
data.to_csv("Final_Cleaned_Dataset_OPTIC_7.csv")

In [163]:
data.head()

Unnamed: 0,OPERATION_ID,LOCATION,ROOM,CASE_STATUS,OPERATION_TYPE,EMERGENCY_PRIORITY,PLANNED_PATIENT_CALL_TIME,PLANNED_PATIENT_FETCH_TIME,PLANNED_RECEPTION_IN_TIME,PLANNED_ENTER_OR_TIME,...,PLANNED_SURGERY_DURATION,DIFF_SURGERY_DURATION,ACTUAL_USAGE_DURATION,PLANNED_USAGE_DURATION,DIFF_USAGE_DURATION,ENTER_START_DELAY,KNIFE_START_DELAY,EXIT_OR_DELAY,Is_Late,Delay_Category
0,588456.0,Main Building OT,MBOR11,Final,Elective,0,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 09:50:00,2019-04-11 09:50:00,...,410.0,-63.0,402.0,455.0,-53.0,74.0,84.0,21.0,1,Surgeon/Staff
1,590736.0,Main Building OT,MBOR05,Final,Elective,0,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:40:00,2019-04-11 10:40:00,...,30.0,-20.0,31.0,45.0,-14.0,21.0,20.0,7.0,1,Unspecified (late)
2,591995.0,ICL,RoomC,Actualised,Elective,0,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 10:55:00,2019-04-11 10:55:00,...,45.0,-26.0,26.0,60.0,-34.0,3.0,-8.0,-31.0,0,No Delay
3,590451.0,Main Building OT,MBOR04,Final,Elective,0,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,2019-04-11 10:50:00,...,150.0,-38.0,149.0,180.0,-31.0,8.0,7.0,-23.0,1,Unspecified (late)
4,573666.0,Medical Center OT,MCOR03,Final,Elective,0,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:30:00,2019-04-11 10:30:00,...,90.0,23.0,156.0,120.0,36.0,28.0,34.0,64.0,1,Other Case Issues
