<a href="https://colab.research.google.com/github/kadefue/MoEST/blob/main/MoEST_Data_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os

def extract_sheets_no_spaces():
    # 1. Define the mapping of Year -> Sheet Names (No Spaces)
    sheet_mapping = {
        2016: ["3.26Lab", "3.27LabGov"],
        2017: ["3.36LabRegCoun", "3.37LabGovtRegCoun"], # Appears identically in both columns
        2018: ["3.37Lab", "3.38LabGov"],
        2019: ["3.37Lab", "3.38LabGov"],
        2020: ["3.37Lab", "3.38LabGov"],
        2021: ["T3.37Lab", "T3.38LabGov"],
        2022: ["T3.38Lab", "T3.39LabGov"],
        2023: ["T3.39LabG&NG", "T3.40LabG"],
        2024: ["T3.39LabG&NG", "T3.40LabG"],
        2025: ["T3.40LabG&NG", "T3.41LabG"]
    }

    # 2. Directory settings (Change '.' to your folder path if needed)
    base_dir = "."

    # 3. Iterate through years and process files
    for year, sheets_to_extract in sheet_mapping.items():
        filename = f"/content/drive/MyDrive/BEST/BEST {year}.xlsx"
        file_path = os.path.join(base_dir, filename)

        if not os.path.exists(file_path):
            print(f"Skipping {year}: File '{filename}' not found.")
            continue

        print(f"Processing {filename}...")

        try:
            # Load the Excel file
            xls = pd.ExcelFile(file_path)

            # Get list of actual sheets in the file
            available_sheets = xls.sheet_names

            for sheet_name in sheets_to_extract:
                if sheet_name in available_sheets:
                    # Read the specific sheet
                    df = pd.read_excel(xls, sheet_name=sheet_name)

                    # 4. Save the extracted sheet
                    output_filename = f"{year}_{sheet_name}.csv"
                    df.to_csv(output_filename, index=False)
                    print(f"  -> Extracted '{sheet_name}' to {output_filename}")
                else:
                    print(f"  [Warning] Sheet '{sheet_name}' not found in {filename}.")
                    # Optional: Print available sheets to help debug typos
                    # print(f"  Available sheets: {available_sheets}")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

if __name__ == "__main__":
    extract_sheets_no_spaces()

Processing /content/drive/MyDrive/BEST/BEST 2016.xlsx...
  -> Extracted '3.26Lab' to 2016_3.26Lab.csv
  -> Extracted '3.27LabGov' to 2016_3.27LabGov.csv
Processing /content/drive/MyDrive/BEST/BEST 2017.xlsx...
  -> Extracted '3.36LabRegCoun' to 2017_3.36LabRegCoun.csv
  -> Extracted '3.37LabGovtRegCoun' to 2017_3.37LabGovtRegCoun.csv
Processing /content/drive/MyDrive/BEST/BEST 2018.xlsx...
  -> Extracted '3.37Lab' to 2018_3.37Lab.csv
  -> Extracted '3.38LabGov' to 2018_3.38LabGov.csv
Processing /content/drive/MyDrive/BEST/BEST 2019.xlsx...
  -> Extracted '3.37Lab' to 2019_3.37Lab.csv
  -> Extracted '3.38LabGov' to 2019_3.38LabGov.csv
Processing /content/drive/MyDrive/BEST/BEST 2020.xlsx...
  -> Extracted '3.37Lab' to 2020_3.37Lab.csv
  -> Extracted '3.38LabGov' to 2020_3.38LabGov.csv
Processing /content/drive/MyDrive/BEST/BEST 2021.xlsx...
  -> Extracted 'T3.37Lab' to 2021_T3.37Lab.csv
  -> Extracted 'T3.38LabGov' to 2021_T3.38LabGov.csv
Processing /content/drive/MyDrive/BEST/BEST 2022

In [None]:
import pandas as pd
import os
import re

# ==========================================
# Helper Functions from council_data_extractor.py
# ==========================================

def is_valid_text(value):
    """Checks if the value is Alphabetic or Alphanumeric."""
    s_val = str(value).strip()
    if not s_val:
        return False
    clean_val = s_val.replace(" ", "")
    if clean_val.isalnum():
        return True
    if re.search(r'[a-zA-Z]', s_val):
        return True
    return False

def normalize_merged_cells(df, header_rows=15):
    """
    Handles merged columns/rows in the header/label area.
    Duplicates text horizontally and vertically for merged cells (NaNs).
    """
    if df.empty:
        return df

    limit = min(header_rows, len(df))
    subset = df.iloc[:limit].copy()

    # Forward fill horizontally and vertically
    subset = subset.ffill(axis=1)
    subset = subset.ffill(axis=0)

    df.iloc[:limit] = subset
    return df

def process_council_sheet(df):
    """
    Applies specific cleaning steps:
    1. Populate structural columns downwards.
    2. Remove 'Grand' and 'Total' rows.
    3. Drop sparse columns.
    """
    if df.empty:
        return df

    # 1. Identify Region Column
    region_col = None
    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if "region" in val or "mkoa" in val:
                region_col = col
                break
        if region_col is not None:
            break

    if region_col is None and not df.empty:
        region_col = df.columns[0]

    # 2. Populate Columns Downwards (Unmerge Vertical for structural cols)
    cols_to_fill = list(df.columns[:3])
    if region_col is not None and region_col not in cols_to_fill:
        cols_to_fill.append(region_col)

    for col in cols_to_fill:
        if col in df.columns:
            df[col] = df[col].replace({0: None, '0': None})
            df[col] = df[col].ffill()

    # 3. "Grand" Logic: Delete row and all below if "Grand" is found
    if region_col is not None and region_col in df.columns:
        grand_mask = df[region_col].astype(str).str.contains("Grand", case=False, na=False)
        if grand_mask.any():
            cutoff_idx = grand_mask.idxmax()
            df = df.loc[:cutoff_idx-1]

    # 4. "Total" Logic (Region): Delete rows containing "Total"
    if region_col is not None and region_col in df.columns:
        total_mask = df[region_col].astype(str).str.contains("Total", case=False, na=False)
        df = df[~total_mask]

    # 5. "Total" Logic (Council)
    council_col = None
    council_keywords = ['council', 'halmashauri', 'district', 'lga', 'wilaya', 'municipal', 'town council']

    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if any(kw in val for kw in council_keywords):
                council_col = col
                break
        if council_col is not None:
            break

    if council_col is not None and council_col in df.columns:
        pat = "Total|Sub-Total|Sub Total"
        council_total_mask = df[council_col].astype(str).str.contains(pat, case=False, na=False)
        df = df[~council_total_mask]

    # 6. Sparsity Logic: Drop column if >60% empty
    cols_to_keep = []
    threshold = 0.60
    for col in df.columns:
        is_missing = df[col].isna() | df[col].isin([0, '0', ''])
        missing_pct = is_missing.mean()
        if missing_pct <= threshold:
            cols_to_keep.append(col)
    df = df[cols_to_keep]

    # 7. Final Cleanup: Remove rows with "Total" in first few columns
    target_indices = [0, 1, 2]
    for idx in target_indices:
        if idx < len(df.columns):
            col_name = df.columns[idx]
            mask = df[col_name].astype(str).str.contains("Total", case=False, na=False)
            df = df[~mask]

    # 8. Duplicate Region Column Check
    if region_col is not None and region_col in df.columns:
        cols_to_drop = []
        for col in df.columns:
            if col == region_col: continue
            if df[col].equals(df[region_col]):
                cols_to_drop.append(col)
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)

    return df

# ==========================================
# Main Extraction Logic
# ==========================================

def extract_sheets_cleaned():
    # 1. Define the mapping of Year -> Sheet Names (No Spaces)
    sheet_mapping = {
        2016: ["3.26Lab", "3.27LabGov"],
        2017: ["3.36LabRegCoun", "3.37LabGovtRegCoun"],
        2018: ["3.37Lab", "3.38LabGov"],
        2019: ["3.37Lab", "3.38LabGov"],
        2020: ["3.37Lab", "3.38LabGov"],
        2021: ["T3.37Lab", "T3.38LabGov"],
        2022: ["T3.38Lab", "T3.39LabGov"],
        2023: ["T3.39LabG&NG", "T3.40LabG"],
        2024: ["T3.39LabG&NG", "T3.40LabG"],
        2025: ["T3.40LabG&NG", "T3.41LabG"]
    }

    # 2. Directory settings
    # Note: Using standard path joining to be safe
    base_dir = "/content/drive/MyDrive/BEST"

    # 3. Iterate through years and process files
    for year, sheets_to_extract in sheet_mapping.items():
        filename = f"BEST {year}.xlsx"
        file_path = os.path.join(base_dir, filename)

        if not os.path.exists(file_path):
            print(f"Skipping {year}: File '{file_path}' not found.")
            continue

        print(f"Processing {filename}...")

        try:
            # Load the Excel file wrapper
            xls = pd.ExcelFile(file_path)
            available_sheets = xls.sheet_names

            for sheet_name in sheets_to_extract:
                if sheet_name in available_sheets:
                    # LOAD with header=None to support the cleaning logic
                    df = pd.read_excel(xls, sheet_name=sheet_name, header=None)

                    # APPLY CLEANING LOGIC
                    df = normalize_merged_cells(df)
                    df = process_council_sheet(df)

                    # SAVE
                    output_filename = f"{year}_{sheet_name}.csv"
                    # Use header=False because headers are part of the data rows now
                    df.to_csv(output_filename, index=False, header=False)
                    print(f"  -> Extracted & Cleaned '{sheet_name}' to {output_filename}")
                else:
                    print(f"  [Warning] Sheet '{sheet_name}' not found in {filename}.")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

if __name__ == "__main__":
    extract_sheets_cleaned()

Processing BEST 2016.xlsx...
  -> Extracted & Cleaned '3.26Lab' to 2016_3.26Lab.csv
  -> Extracted & Cleaned '3.27LabGov' to 2016_3.27LabGov.csv
Processing BEST 2017.xlsx...


 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 82.6923076923077 90.19607843137256 40.625 30
 73.68421052631578 61.904761904761905 36.36363636363637 70.08547008547008
 89.81481481481481 80 87.5]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 82.6923076923077 90.19607843137256 40.625 30
 73.68421052631578 61.904761904761905 36.36363636363637 70.08547008547008
 89.81481481481481 80 87.5]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 126.0 133.0 41.0 9.0 133.0 38.0 14.0 494.0
 295.0 48.0 214.0]' has dtype incompatible with float64, please explicitly cast to a co

  -> Extracted & Cleaned '3.36LabRegCoun' to 2017_3.36LabRegCoun.csv
  -> Extracted & Cleaned '3.37LabGovtRegCoun' to 2017_3.37LabGovtRegCoun.csv
Processing BEST 2018.xlsx...


 'Table 3.37: Number of Laboratories in Government and Non-Government Schools, 2018'
 'PHYSICS LABORATORIES' 'council' 124.0 143.0 49.0 12.0 164.0 45.0 23.0
 560.0 301.0 61.0 216.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.38: Number of Laboratories in Government Schools, 2018'
 'PHYSICS LABORATORIES' 'SHORTAGE' 5 1 18 5 9 4 5 47 -1 0 3]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.38: Number of Laboratories in Government Schools, 2018'
 'PHYSICS LABORATORIES' 'council' 5 1 18 5 9 4 5 351 -1 0 3]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  -> Extracted & Cleaned '3.37Lab' to 2018_3.37Lab.csv
  -> Extracted & Cleaned '3.38LabGov' to 2018_3.38LabGov.csv
Processing BEST 2019.xlsx...
  -> Extracted & Cleaned '3.37Lab' to 2019_3.37Lab.csv
  -> Extracted & Cleaned '3.38LabGov' to 2019_3.38LabGov.csv
Processing BEST 2020.xlsx...
  -> Extracted & Cleaned '3.37Lab' to 2020_3.37Lab.csv
  -> Extracted & Cleaned '3.38LabGov' to 2020_3.38LabGov.csv
Processing BEST 2021.xlsx...
  -> Extracted & Cleaned 'T3.37Lab' to 2021_T3.37Lab.csv
  -> Extracted & Cleaned 'T3.38LabGov' to 2021_T3.38LabGov.csv
Processing BEST 2022.xlsx...
  -> Extracted & Cleaned 'T3.38Lab' to 2022_T3.38Lab.csv
  -> Extracted & Cleaned 'T3.39LabGov' to 2022_T3.39LabGov.csv
Processing BEST 2023.xlsx...
  -> Extracted & Cleaned 'T3.39LabG&NG' to 2023_T3.39LabG&NG.csv
  -> Extracted & Cleaned 'T3.40LabG' to 2023_T3.40LabG.csv
Processing BEST 2024.xlsx...


 'Table 3.40: Number of Science Laboratories in Government Schools, 2023'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 58 25 3 13]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.40: Number of Science Laboratories in Government Schools, 2023'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 58 25 3 13]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  -> Extracted & Cleaned 'T3.39LabG&NG' to 2024_T3.39LabG&NG.csv
  -> Extracted & Cleaned 'T3.40LabG' to 2024_T3.40LabG.csv
Processing BEST 2025.xlsx...


 'Table 3.40: Number of Science Laboratories in Government Schools, 2024'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 56 16 3 8]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.40: Number of Science Laboratories in Government Schools, 2024'
 'PHYSICS LABORATORIES' 'SHORTAGE' 87.0 5.0 6.0 7.0 8.0 9.0 1 56 16 3 8]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  -> Extracted & Cleaned 'T3.40LabG&NG' to 2025_T3.40LabG&NG.csv
  -> Extracted & Cleaned 'T3.41LabG' to 2025_T3.41LabG.csv


 'Table 3.41: Number of Science Laboratories in Government Schools, 2025'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 3 62 22 7 9]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.41: Number of Science Laboratories in Government Schools, 2025'
 'PHYSICS LABORATORIES' 'SHORTAGE' 90.0 5.0 6.0 7.0 8.0 9.0 3 62 22 7 9]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


In [None]:
import pandas as pd
import os
import re

# ==========================================
# 1. Cleaning & Helper Functions
# ==========================================

def is_valid_text(value):
    """Checks if the value is Alphabetic or Alphanumeric."""
    s_val = str(value).strip()
    if not s_val:
        return False
    clean_val = s_val.replace(" ", "")
    if clean_val.isalnum():
        return True
    if re.search(r'[a-zA-Z]', s_val):
        return True
    return False

def normalize_merged_cells(df, header_rows=15):
    """
    Handles merged columns/rows in the header/label area.
    Duplicates text horizontally and vertically for merged cells.
    """
    if df.empty:
        return df

    limit = min(header_rows, len(df))
    subset = df.iloc[:limit].copy()

    # Forward fill horizontally and vertically
    subset = subset.ffill(axis=1)
    subset = subset.ffill(axis=0)

    df.iloc[:limit] = subset
    return df

def process_council_sheet(df):
    """
    Applies specific cleaning steps:
    1. Populate structural columns downwards.
    2. Remove 'Grand' and 'Total' rows.
    3. Drop sparse columns.
    """
    if df.empty:
        return df

    # Identify Region Column
    region_col = None
    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if "region" in val or "mkoa" in val:
                region_col = col
                break
        if region_col is not None:
            break

    if region_col is None and not df.empty:
        region_col = df.columns[0]

    # Populate Columns Downwards (Unmerge Vertical for structural cols)
    cols_to_fill = list(df.columns[:3])
    if region_col is not None and region_col not in cols_to_fill:
        cols_to_fill.append(region_col)

    for col in cols_to_fill:
        if col in df.columns:
            df[col] = df[col].replace({0: None, '0': None})
            df[col] = df[col].ffill()

    # "Grand" Logic
    if region_col is not None and region_col in df.columns:
        grand_mask = df[region_col].astype(str).str.contains("Grand", case=False, na=False)
        if grand_mask.any():
            cutoff_idx = grand_mask.idxmax()
            df = df.loc[:cutoff_idx-1]

    # "Total" Logic (Region)
    if region_col is not None and region_col in df.columns:
        total_mask = df[region_col].astype(str).str.contains("Total", case=False, na=False)
        df = df[~total_mask]

    # "Total" Logic (Council)
    council_col = None
    council_keywords = ['council', 'halmashauri', 'district', 'lga', 'wilaya', 'municipal', 'town council']

    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if any(kw in val for kw in council_keywords):
                council_col = col
                break
        if council_col is not None:
            break

    if council_col is not None and council_col in df.columns:
        pat = "Total|Sub-Total|Sub Total"
        council_total_mask = df[council_col].astype(str).str.contains(pat, case=False, na=False)
        df = df[~council_total_mask]

    # Sparsity Logic
    cols_to_keep = []
    threshold = 0.60
    for col in df.columns:
        is_missing = df[col].isna() | df[col].isin([0, '0', ''])
        missing_pct = is_missing.mean()
        if missing_pct <= threshold:
            cols_to_keep.append(col)
    df = df[cols_to_keep]

    # Final Cleanup: Remove rows with "Total" in first few columns
    target_indices = [0, 1, 2]
    for idx in target_indices:
        if idx < len(df.columns):
            col_name = df.columns[idx]
            mask = df[col_name].astype(str).str.contains("Total", case=False, na=False)
            df = df[~mask]

    # Duplicate Region Column Check
    if region_col is not None and region_col in df.columns:
        cols_to_drop = []
        for col in df.columns:
            if col == region_col: continue
            if df[col].equals(df[region_col]):
                cols_to_drop.append(col)
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)

    return df

# ==========================================
# 2. Main Extraction and Combination Logic
# ==========================================

def extract_and_combine():
    # Mapping based on your latest snippet
    sheet_mapping = {
        2016: ["3.26Lab", "3.27LabGov"],
        2017: ["3.36LabRegCoun", "3.37LabGovtRegCoun"],
        2018: ["3.37Lab", "3.38LabGov"],
        2019: ["3.37Lab", "3.38LabGov"],
        2020: ["3.37Lab", "3.38LabGov"],
        2021: ["T3.37Lab", "T3.38LabGov"],
        2022: ["T3.38Lab", "T3.39LabGov"],
        2023: ["T3.39LabG&NG", "T3.40LabG"],
        2024: ["T3.39LabG&NG", "T3.40LabG"],
        2025: ["T3.40LabG&NG", "T3.41LabG"]
    }

    base_dir = "/content/drive/MyDrive/BEST"

    # Lists to hold the dataframes for final merging
    left_dfs = []
    right_dfs = []

    for year, sheets in sheet_mapping.items():
        filename = f"BEST {year}.xlsx"
        file_path = os.path.join(base_dir, filename)

        if not os.path.exists(file_path):
            print(f"Skipping {year}: File not found at {file_path}")
            continue

        print(f"Processing {filename}...")

        try:
            xls = pd.ExcelFile(file_path)
            available_sheets = xls.sheet_names

            # --- PROCESS LEFT SHEET (Index 0) ---
            target_left = sheets[0]
            if target_left in available_sheets:
                df_left = pd.read_excel(xls, sheet_name=target_left, header=None)
                df_left = normalize_merged_cells(df_left)
                df_left = process_council_sheet(df_left)

                # Add Year column for tracking
                df_left.insert(0, 'Source_Year', year)
                left_dfs.append(df_left)
                print(f"  -> Added Left Sheet: {target_left}")
            else:
                print(f"  [Warning] Left Sheet '{target_left}' missing.")

            # --- PROCESS RIGHT SHEET (Index 1) ---
            if len(sheets) > 1:
                target_right = sheets[1]
                if target_right in available_sheets:
                    df_right = pd.read_excel(xls, sheet_name=target_right, header=None)
                    df_right = normalize_merged_cells(df_right)
                    df_right = process_council_sheet(df_right)

                    # Add Year column for tracking
                    df_right.insert(0, 'Source_Year', year)
                    right_dfs.append(df_right)
                    print(f"  -> Added Right Sheet: {target_right}")
                else:
                    print(f"  [Warning] Right Sheet '{target_right}' missing.")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

    # ==========================================
    # 3. Save Combined Files
    # ==========================================

    print("\n--- Saving Combined Files ---")

    if left_dfs:
        combined_left = pd.concat(left_dfs, ignore_index=True)
        combined_left.to_csv("Combined_Left_Sheets.csv", index=False, header=False)
        print(f"Saved 'Combined_Left_Sheets.csv' with {len(combined_left)} rows.")
    else:
        print("No Left sheets extracted.")

    if right_dfs:
        combined_right = pd.concat(right_dfs, ignore_index=True)
        combined_right.to_csv("Combined_Right_Sheets.csv", index=False, header=False)
        print(f"Saved 'Combined_Right_Sheets.csv' with {len(combined_right)} rows.")

if __name__ == "__main__":
    extract_and_combine()

Processing BEST 2016.xlsx...
  -> Added Left Sheet: 3.26Lab
  -> Added Right Sheet: 3.27LabGov
Processing BEST 2017.xlsx...


 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 82.6923076923077 90.19607843137256 40.625 30
 73.68421052631578 61.904761904761905 36.36363636363637 70.08547008547008
 89.81481481481481 80 87.5]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 82.6923076923077 90.19607843137256 40.625 30
 73.68421052631578 61.904761904761905 36.36363636363637 70.08547008547008
 89.81481481481481 80 87.5]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 126.0 133.0 41.0 9.0 133.0 38.0 14.0 494.0
 295.0 48.0 214.0]' has dtype incompatible with float64, please explicitly cast to a co

  -> Added Left Sheet: 3.36LabRegCoun
  -> Added Right Sheet: 3.37LabGovtRegCoun
Processing BEST 2018.xlsx...


 'Table 3.37: Number of Laboratories in Government and Non-Government Schools, 2018'
 'PHYSICS LABORATORIES' 'council' 124.0 143.0 49.0 12.0 164.0 45.0 23.0
 560.0 301.0 61.0 216.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.38: Number of Laboratories in Government Schools, 2018'
 'PHYSICS LABORATORIES' 'SHORTAGE' 5 1 18 5 9 4 5 47 -1 0 3]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.38: Number of Laboratories in Government Schools, 2018'
 'PHYSICS LABORATORIES' 'council' 5 1 18 5 9 4 5 351 -1 0 3]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  -> Added Left Sheet: 3.37Lab
  -> Added Right Sheet: 3.38LabGov
Processing BEST 2019.xlsx...
  -> Added Left Sheet: 3.37Lab
  -> Added Right Sheet: 3.38LabGov
Processing BEST 2020.xlsx...
  -> Added Left Sheet: 3.37Lab
  -> Added Right Sheet: 3.38LabGov
Processing BEST 2021.xlsx...
  -> Added Left Sheet: T3.37Lab
  -> Added Right Sheet: T3.38LabGov
Processing BEST 2022.xlsx...
  -> Added Left Sheet: T3.38Lab
  -> Added Right Sheet: T3.39LabGov
Processing BEST 2023.xlsx...
  -> Added Left Sheet: T3.39LabG&NG
  -> Added Right Sheet: T3.40LabG
Processing BEST 2024.xlsx...


 'Table 3.40: Number of Science Laboratories in Government Schools, 2023'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 58 25 3 13]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.40: Number of Science Laboratories in Government Schools, 2023'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 58 25 3 13]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  -> Added Left Sheet: T3.39LabG&NG
  -> Added Right Sheet: T3.40LabG
Processing BEST 2025.xlsx...


 'Table 3.40: Number of Science Laboratories in Government Schools, 2024'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 56 16 3 8]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.40: Number of Science Laboratories in Government Schools, 2024'
 'PHYSICS LABORATORIES' 'SHORTAGE' 87.0 5.0 6.0 7.0 8.0 9.0 1 56 16 3 8]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  -> Added Left Sheet: T3.40LabG&NG
  -> Added Right Sheet: T3.41LabG

--- Saving Combined Files ---
Saved 'Combined_Left_Sheets.csv' with 1876 rows.
Saved 'Combined_Right_Sheets.csv' with 1876 rows.


 'Table 3.41: Number of Science Laboratories in Government Schools, 2025'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 3 62 22 7 9]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.41: Number of Science Laboratories in Government Schools, 2025'
 'PHYSICS LABORATORIES' 'SHORTAGE' 90.0 5.0 6.0 7.0 8.0 9.0 3 62 22 7 9]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


In [None]:
import pandas as pd
import os
import re

# ==========================================
# 1. Cleaning & Helper Functions
# ==========================================

def is_valid_text(value):
    """Checks if the value is Alphabetic or Alphanumeric."""
    s_val = str(value).strip()
    if not s_val:
        return False
    clean_val = s_val.replace(" ", "")
    if clean_val.isalnum():
        return True
    if re.search(r'[a-zA-Z]', s_val):
        return True
    return False

def normalize_merged_cells(df, header_rows=15):
    """
    Handles merged columns/rows in the header/label area.
    Duplicates text horizontally and vertically for merged cells.
    """
    if df.empty:
        return df

    limit = min(header_rows, len(df))
    subset = df.iloc[:limit].copy()

    # Forward fill horizontally and vertically
    subset = subset.ffill(axis=1)
    subset = subset.ffill(axis=0)

    df.iloc[:limit] = subset
    return df

def process_council_sheet(df):
    """
    Applies specific cleaning steps:
    1. Populate structural columns downwards.
    2. Remove 'Grand' and 'Total' rows.
    3. Drop sparse columns.
    """
    if df.empty:
        return df

    # Identify Region Column
    region_col = None
    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if "region" in val or "mkoa" in val:
                region_col = col
                break
        if region_col is not None:
            break

    if region_col is None and not df.empty:
        region_col = df.columns[0]

    # Populate Columns Downwards (Unmerge Vertical for structural cols)
    cols_to_fill = list(df.columns[:3])
    if region_col is not None and region_col not in cols_to_fill:
        cols_to_fill.append(region_col)

    for col in cols_to_fill:
        if col in df.columns:
            df[col] = df[col].replace({0: None, '0': None})
            df[col] = df[col].ffill()

    # "Grand" Logic
    if region_col is not None and region_col in df.columns:
        grand_mask = df[region_col].astype(str).str.contains("Grand", case=False, na=False)
        if grand_mask.any():
            cutoff_idx = grand_mask.idxmax()
            df = df.loc[:cutoff_idx-1]

    # "Total" Logic (Region)
    if region_col is not None and region_col in df.columns:
        total_mask = df[region_col].astype(str).str.contains("Total", case=False, na=False)
        df = df[~total_mask]

    # "Total" Logic (Council)
    council_col = None
    council_keywords = ['council', 'halmashauri', 'district', 'lga', 'wilaya', 'municipal', 'town council']

    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if any(kw in val for kw in council_keywords):
                council_col = col
                break
        if council_col is not None:
            break

    if council_col is not None and council_col in df.columns:
        pat = "Total|Sub-Total|Sub Total"
        council_total_mask = df[council_col].astype(str).str.contains(pat, case=False, na=False)
        df = df[~council_total_mask]

    # Sparsity Logic
    cols_to_keep = []
    threshold = 0.60
    for col in df.columns:
        is_missing = df[col].isna() | df[col].isin([0, '0', ''])
        missing_pct = is_missing.mean()
        if missing_pct <= threshold:
            cols_to_keep.append(col)
    df = df[cols_to_keep]

    # Final Cleanup: Remove rows with "Total" in first few columns
    target_indices = [0, 1, 2]
    for idx in target_indices:
        if idx < len(df.columns):
            col_name = df.columns[idx]
            mask = df[col_name].astype(str).str.contains("Total", case=False, na=False)
            df = df[~mask]

    # Duplicate Region Column Check
    if region_col is not None and region_col in df.columns:
        cols_to_drop = []
        for col in df.columns:
            if col == region_col: continue
            if df[col].equals(df[region_col]):
                cols_to_drop.append(col)
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)

    return df

# ==========================================
# 2. Main Extraction and Combination Logic
# ==========================================

def extract_and_combine():
    # Mapping based on your latest snippet
    sheet_mapping = {
        2016: ["3.26Lab", "3.27LabGov"],
        2017: ["3.36LabRegCoun", "3.37LabGovtRegCoun"],
        2018: ["3.37Lab", "3.38LabGov"],
        2019: ["3.37Lab", "3.38LabGov"],
        2020: ["3.37Lab", "3.38LabGov"],
        2021: ["T3.37Lab", "T3.38LabGov"],
        2022: ["T3.38Lab", "T3.39LabGov"],
        2023: ["T3.39LabG&NG", "T3.40LabG"],
        2024: ["T3.39LabG&NG", "T3.40LabG"],
        2025: ["T3.40LabG&NG", "T3.41LabG"]
    }

    base_dir = "/content/drive/MyDrive/BEST"

    # Lists to hold the dataframes for final merging
    left_dfs = []
    right_dfs = []

    # Sorted ensures we start with 2016, then 2017, etc.
    for year, sheets in sorted(sheet_mapping.items()):
        filename = f"BEST {year}.xlsx"
        file_path = os.path.join(base_dir, filename)

        if not os.path.exists(file_path):
            print(f"Skipping {year}: File not found at {file_path}")
            continue

        print(f"Processing {filename}...")

        try:
            xls = pd.ExcelFile(file_path)
            available_sheets = xls.sheet_names

            # --- PROCESS LEFT SHEET (Index 0) ---
            target_left = sheets[0]
            if target_left in available_sheets:
                # 1. Load & Clean
                df_left = pd.read_excel(xls, sheet_name=target_left, header=None)
                df_left = normalize_merged_cells(df_left)
                df_left = process_council_sheet(df_left)

                # 2. Add Year Column
                df_left.insert(0, 'Source_Year', year)

                # 3. Handle Header Rows (Keep for first file, Drop for rest)
                if len(left_dfs) > 0:
                    # This is NOT the first file -> Drop rows 1 to 4 (Indices 0-3)
                    # We assume cleaning didn't drastically alter row count at the top
                    df_left = df_left.iloc[4:]
                    print(f"  -> Added Left Sheet (Rows 1-4 dropped): {target_left}")
                else:
                    # This IS the first file -> Keep all rows
                    print(f"  -> Added Left Sheet (Headers kept): {target_left}")

                left_dfs.append(df_left)
            else:
                print(f"  [Warning] Left Sheet '{target_left}' missing.")

            # --- PROCESS RIGHT SHEET (Index 1) ---
            if len(sheets) > 1:
                target_right = sheets[1]
                if target_right in available_sheets:
                    # 1. Load & Clean
                    df_right = pd.read_excel(xls, sheet_name=target_right, header=None)
                    df_right = normalize_merged_cells(df_right)
                    df_right = process_council_sheet(df_right)

                    # 2. Add Year Column
                    df_right.insert(0, 'Source_Year', year)

                    # 3. Handle Header Rows (Keep for first file, Drop for rest)
                    if len(right_dfs) > 0:
                        # This is NOT the first file -> Drop rows 1 to 4
                        df_right = df_right.iloc[4:]
                        print(f"  -> Added Right Sheet (Rows 1-4 dropped): {target_right}")
                    else:
                        # This IS the first file -> Keep all rows
                        print(f"  -> Added Right Sheet (Headers kept): {target_right}")

                    right_dfs.append(df_right)
                else:
                    print(f"  [Warning] Right Sheet '{target_right}' missing.")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

    # ==========================================
    # 3. Save Combined Files
    # ==========================================

    print("\n--- Saving Combined Files ---")

    if left_dfs:
        combined_left = pd.concat(left_dfs, ignore_index=True)
        combined_left.to_csv("Combined_Left_Sheets.csv", index=False, header=False)
        print(f"Saved 'Combined_Left_Sheets.csv' with {len(combined_left)} rows.")
    else:
        print("No Left sheets extracted.")

    if right_dfs:
        combined_right = pd.concat(right_dfs, ignore_index=True)
        combined_right.to_csv("Combined_Right_Sheets.csv", index=False, header=False)
        print(f"Saved 'Combined_Right_Sheets.csv' with {len(combined_right)} rows.")

if __name__ == "__main__":
    extract_and_combine()

Processing BEST 2016.xlsx...
  -> Added Left Sheet (Headers kept): 3.26Lab
  -> Added Right Sheet (Headers kept): 3.27LabGov
Processing BEST 2017.xlsx...


 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 82.6923076923077 90.19607843137256 40.625 30
 73.68421052631578 61.904761904761905 36.36363636363637 70.08547008547008
 89.81481481481481 80 87.5]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 82.6923076923077 90.19607843137256 40.625 30
 73.68421052631578 61.904761904761905 36.36363636363637 70.08547008547008
 89.81481481481481 80 87.5]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 126.0 133.0 41.0 9.0 133.0 38.0 14.0 494.0
 295.0 48.0 214.0]' has dtype incompatible with float64, please explicitly cast to a co

  -> Added Left Sheet (Rows 1-4 dropped): 3.36LabRegCoun
  -> Added Right Sheet (Rows 1-4 dropped): 3.37LabGovtRegCoun
Processing BEST 2018.xlsx...


 'Table 3.37: Number of Laboratories in Government and Non-Government Schools, 2018'
 'PHYSICS LABORATORIES' 'council' 124.0 143.0 49.0 12.0 164.0 45.0 23.0
 560.0 301.0 61.0 216.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.38: Number of Laboratories in Government Schools, 2018'
 'PHYSICS LABORATORIES' 'SHORTAGE' 5 1 18 5 9 4 5 47 -1 0 3]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.38: Number of Laboratories in Government Schools, 2018'
 'PHYSICS LABORATORIES' 'council' 5 1 18 5 9 4 5 351 -1 0 3]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  -> Added Left Sheet (Rows 1-4 dropped): 3.37Lab
  -> Added Right Sheet (Rows 1-4 dropped): 3.38LabGov
Processing BEST 2019.xlsx...
  -> Added Left Sheet (Rows 1-4 dropped): 3.37Lab
  -> Added Right Sheet (Rows 1-4 dropped): 3.38LabGov
Processing BEST 2020.xlsx...
  -> Added Left Sheet (Rows 1-4 dropped): 3.37Lab
  -> Added Right Sheet (Rows 1-4 dropped): 3.38LabGov
Processing BEST 2021.xlsx...
  -> Added Left Sheet (Rows 1-4 dropped): T3.37Lab
  -> Added Right Sheet (Rows 1-4 dropped): T3.38LabGov
Processing BEST 2022.xlsx...
  -> Added Left Sheet (Rows 1-4 dropped): T3.38Lab
  -> Added Right Sheet (Rows 1-4 dropped): T3.39LabGov
Processing BEST 2023.xlsx...
  -> Added Left Sheet (Rows 1-4 dropped): T3.39LabG&NG
  -> Added Right Sheet (Rows 1-4 dropped): T3.40LabG
Processing BEST 2024.xlsx...


 'Table 3.40: Number of Science Laboratories in Government Schools, 2023'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 58 25 3 13]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.40: Number of Science Laboratories in Government Schools, 2023'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 58 25 3 13]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  -> Added Left Sheet (Rows 1-4 dropped): T3.39LabG&NG
  -> Added Right Sheet (Rows 1-4 dropped): T3.40LabG
Processing BEST 2025.xlsx...


 'Table 3.40: Number of Science Laboratories in Government Schools, 2024'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 56 16 3 8]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.40: Number of Science Laboratories in Government Schools, 2024'
 'PHYSICS LABORATORIES' 'SHORTAGE' 87.0 5.0 6.0 7.0 8.0 9.0 1 56 16 3 8]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  -> Added Left Sheet (Rows 1-4 dropped): T3.40LabG&NG
  -> Added Right Sheet (Rows 1-4 dropped): T3.41LabG

--- Saving Combined Files ---
Saved 'F_Combined_Left_Sheets.csv' with 1840 rows.
Saved 'F_Combined_Right_Sheets.csv' with 1840 rows.


 'Table 3.41: Number of Science Laboratories in Government Schools, 2025'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 3 62 22 7 9]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.41: Number of Science Laboratories in Government Schools, 2025'
 'PHYSICS LABORATORIES' 'SHORTAGE' 90.0 5.0 6.0 7.0 8.0 9.0 3 62 22 7 9]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


In [None]:
import pandas as pd
import os
import re

# ==========================================
# 1. Cleaning & Helper Functions
# ==========================================

def is_valid_text(value):
    """Checks if the value is Alphabetic or Alphanumeric."""
    s_val = str(value).strip()
    if not s_val:
        return False
    clean_val = s_val.replace(" ", "")
    if clean_val.isalnum():
        return True
    if re.search(r'[a-zA-Z]', s_val):
        return True
    return False

def normalize_merged_cells(df, header_rows=15):
    """
    Handles merged columns/rows in the header/label area.
    Duplicates text horizontally and vertically for merged cells.
    """
    if df.empty:
        return df

    limit = min(header_rows, len(df))
    subset = df.iloc[:limit].copy()

    # Forward fill horizontally and vertically
    subset = subset.ffill(axis=1)
    subset = subset.ffill(axis=0)

    df.iloc[:limit] = subset
    return df

def process_council_sheet(df):
    """
    Applies specific cleaning steps:
    1. Populate structural columns downwards.
    2. Remove 'Grand' and 'Total' rows.
    3. Drop sparse columns.
    """
    if df.empty:
        return df

    # Identify Region Column
    region_col = None
    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if "region" in val or "mkoa" in val:
                region_col = col
                break
        if region_col is not None:
            break

    if region_col is None and not df.empty:
        region_col = df.columns[0]

    # Populate Columns Downwards (Unmerge Vertical for structural cols)
    cols_to_fill = list(df.columns[:3])
    if region_col is not None and region_col not in cols_to_fill:
        cols_to_fill.append(region_col)

    for col in cols_to_fill:
        if col in df.columns:
            df[col] = df[col].replace({0: None, '0': None})
            df[col] = df[col].ffill()

    # "Grand" Logic
    if region_col is not None and region_col in df.columns:
        grand_mask = df[region_col].astype(str).str.contains("Grand", case=False, na=False)
        if grand_mask.any():
            cutoff_idx = grand_mask.idxmax()
            df = df.loc[:cutoff_idx-1]

    # "Total" Logic (Region)
    if region_col is not None and region_col in df.columns:
        total_mask = df[region_col].astype(str).str.contains("Total", case=False, na=False)
        df = df[~total_mask]

    # "Total" Logic (Council)
    council_col = None
    council_keywords = ['council', 'halmashauri', 'district', 'lga', 'wilaya', 'municipal', 'town council']

    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if any(kw in val for kw in council_keywords):
                council_col = col
                break
        if council_col is not None:
            break

    if council_col is not None and council_col in df.columns:
        pat = "Total|Sub-Total|Sub Total"
        council_total_mask = df[council_col].astype(str).str.contains(pat, case=False, na=False)
        df = df[~council_total_mask]

    # Sparsity Logic
    cols_to_keep = []
    threshold = 0.60
    for col in df.columns:
        is_missing = df[col].isna() | df[col].isin([0, '0', ''])
        missing_pct = is_missing.mean()
        if missing_pct <= threshold:
            cols_to_keep.append(col)
    df = df[cols_to_keep]

    # Final Cleanup: Remove rows with "Total" in first few columns
    target_indices = [0, 1, 2]
    for idx in target_indices:
        if idx < len(df.columns):
            col_name = df.columns[idx]
            mask = df[col_name].astype(str).str.contains("Total", case=False, na=False)
            df = df[~mask]

    # Duplicate Region Column Check
    if region_col is not None and region_col in df.columns:
        cols_to_drop = []
        for col in df.columns:
            if col == region_col: continue
            if df[col].equals(df[region_col]):
                cols_to_drop.append(col)
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)

    return df

# ==========================================
# 2. Main Extraction and Combination Logic
# ==========================================

def extract_and_combine_all():
    base_dir = "/content/drive/MyDrive/BEST"

    # --- DEFINITION: Data Categories and Mappings ---
    # NOTE: All keys in the lists below have had spaces REMOVED as per instruction.

    # 1. Laboratories
    lab_mapping = {
        2016: ["3.26Lab", "3.27LabGov"],
        2017: ["3.36LabRegCoun", "3.37LabGovtRegCoun"],
        2018: ["3.37Lab", "3.38LabGov"],
        2019: ["3.37Lab", "3.38LabGov"],
        2020: ["3.37Lab", "3.38LabGov"],
        2021: ["T3.37Lab", "T3.38LabGov"],
        2022: ["T3.38Lab", "T3.39LabGov"],
        2023: ["T3.39LabG&NG", "T3.40LabG"],
        2024: ["T3.39LabG&NG", "T3.40LabG"],
        2025: ["T3.40LabG&NG", "T3.41LabG"]
    }

    # 2. ICT Equipment
    # Notes: Spaces removed from "Table 170" -> "Table170", "T 3.46 ICT" -> "T3.46ICT" etc.
    ict_mapping = {
        2017: ["T3.42ICTAllRegCoun", "T3.43ICTGovRegCoun"],
        2018: ["T2.42_ICT_G&N", "T2.43_ICT_G"],
        2019: ["Table170", "Table169"],
        2020: ["Table147", "Table148"],
        2021: ["Table155", "Table156"],
        2022: ["T2.43_ICT_G&N", "T2.44_ICT_G"],
        2023: ["T2.44_ICT_G&N", "T2.45_ICT_G"],
        2024: ["T2.44_ICT_G&N", "T2.45_ICT_G"],
        2025: ["T3.46ICT", "T3.47ICT_Gov"]
    }

    # Grouping them for processing
    tasks = [
        ("Laboratories", lab_mapping, "Combined_Laboratories_All_G_NG.csv", "Combined_Laboratories_Govt.csv"),
        ("ICT_Equipment", ict_mapping, "Combined_ICT_All_G_NG.csv", "Combined_ICT_Govt.csv")
    ]

    # --- EXECUTION LOOP ---

    for category_name, mapping, left_out, right_out in tasks:
        print(f"\n=== Starting Extraction for: {category_name} ===")

        left_dfs = []
        right_dfs = []

        # Track if we have processed the "first file" yet for this specific category
        processed_first_file = False

        # Sort years to ensure chronological order
        for year, sheets in sorted(mapping.items()):
            filename = f"BEST {year}.xlsx"
            file_path = os.path.join(base_dir, filename)

            if not os.path.exists(file_path):
                print(f"  Skipping {year}: File not found at {file_path}")
                continue

            print(f"  Processing {filename}...")

            try:
                xls = pd.ExcelFile(file_path)
                available_sheets = xls.sheet_names

                # Helper to process a single sheet
                def process_sheet_data(sheet_name, is_first_time):
                    # Double check if sheet exists, or try matching without case if needed
                    if sheet_name in available_sheets:
                        # Load & Clean
                        df = pd.read_excel(xls, sheet_name=sheet_name, header=None)
                        df = normalize_merged_cells(df)
                        df = process_council_sheet(df)

                        # Add Year
                        df.insert(0, 'Source_Year', year)

                        # Handle Headers: Drop rows 0-3 (first 4 rows) if NOT the first file
                        if not is_first_time:
                            df = df.iloc[4:]

                        return df
                    else:
                        print(f"    [Warning] Sheet '{sheet_name}' missing in {filename}.")
                        return None

                # --- EXTRACT LEFT ---
                df_left = process_sheet_data(sheets[0], not processed_first_file)
                if df_left is not None:
                    left_dfs.append(df_left)

                # --- EXTRACT RIGHT ---
                if len(sheets) > 1:
                    df_right = process_sheet_data(sheets[1], not processed_first_file)
                    if df_right is not None:
                        right_dfs.append(df_right)

                # Mark that we have successfully processed at least one file
                processed_first_file = True

            except Exception as e:
                print(f"  Error processing {filename}: {e}")

        # --- SAVE FILES FOR THIS CATEGORY ---
        print(f"  > Saving {category_name} files...")

        if left_dfs:
            combined_left = pd.concat(left_dfs, ignore_index=True)
            combined_left.to_csv(left_out, index=False, header=False)
            print(f"    -> Created '{left_out}' ({len(combined_left)} rows).")
        else:
            print(f"    -> No Left sheets found for {category_name}.")

        if right_dfs:
            combined_right = pd.concat(right_dfs, ignore_index=True)
            combined_right.to_csv(right_out, index=False, header=False)
            print(f"    -> Created '{right_out}' ({len(combined_right)} rows).")

    print("\n=== All Tasks Complete ===")

if __name__ == "__main__":
    extract_and_combine_all()


=== Starting Extraction for: Laboratories ===
  Processing BEST 2016.xlsx...
  Processing BEST 2017.xlsx...


 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 82.6923076923077 90.19607843137256 40.625 30
 73.68421052631578 61.904761904761905 36.36363636363637 70.08547008547008
 89.81481481481481 80 87.5]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 82.6923076923077 90.19607843137256 40.625 30
 73.68421052631578 61.904761904761905 36.36363636363637 70.08547008547008
 89.81481481481481 80 87.5]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 126.0 133.0 41.0 9.0 133.0 38.0 14.0 494.0
 295.0 48.0 214.0]' has dtype incompatible with float64, please explicitly cast to a co

  Processing BEST 2018.xlsx...


 'Table 3.37: Number of Laboratories in Government and Non-Government Schools, 2018'
 'PHYSICS LABORATORIES' 'council' 124.0 143.0 49.0 12.0 164.0 45.0 23.0
 560.0 301.0 61.0 216.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.38: Number of Laboratories in Government Schools, 2018'
 'PHYSICS LABORATORIES' 'SHORTAGE' 5 1 18 5 9 4 5 47 -1 0 3]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.38: Number of Laboratories in Government Schools, 2018'
 'PHYSICS LABORATORIES' 'council' 5 1 18 5 9 4 5 351 -1 0 3]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  Processing BEST 2019.xlsx...
  Processing BEST 2020.xlsx...
  Processing BEST 2021.xlsx...
  Processing BEST 2022.xlsx...
  Processing BEST 2023.xlsx...


 'Table 3.40: Number of Science Laboratories in Government Schools, 2023'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 58 25 3 13]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.40: Number of Science Laboratories in Government Schools, 2023'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 58 25 3 13]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  Processing BEST 2024.xlsx...


 'Table 3.40: Number of Science Laboratories in Government Schools, 2024'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 56 16 3 8]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.40: Number of Science Laboratories in Government Schools, 2024'
 'PHYSICS LABORATORIES' 'SHORTAGE' 87.0 5.0 6.0 7.0 8.0 9.0 1 56 16 3 8]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  Processing BEST 2025.xlsx...


 'Table 3.41: Number of Science Laboratories in Government Schools, 2025'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 3 62 22 7 9]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.41: Number of Science Laboratories in Government Schools, 2025'
 'PHYSICS LABORATORIES' 'SHORTAGE' 90.0 5.0 6.0 7.0 8.0 9.0 3 62 22 7 9]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  > Saving Laboratories files...
    -> Created 'Combined_Laboratories_All_G_NG.csv' (1840 rows).
    -> Created 'Combined_Laboratories_Govt.csv' (1840 rows).

=== Starting Extraction for: ICT_Equipment ===
  Processing BEST 2017.xlsx...
  Processing BEST 2018.xlsx...


  subset = subset.ffill(axis=1)
 'Table 2.43: Number of ICT Equipments  in Government Schools by Type, Region and Council, 2018'
 'Radio' 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  Processing BEST 2019.xlsx...
  Processing BEST 2020.xlsx...
  Processing BEST 2021.xlsx...
  Processing BEST 2022.xlsx...
  Processing BEST 2023.xlsx...
  Processing BEST 2024.xlsx...
  Processing BEST 2025.xlsx...
  > Saving ICT_Equipment files...
    -> Created 'Combined_ICT_All_G_NG.csv' (1657 rows).
    -> Created 'Combined_ICT_Govt.csv' (1650 rows).

=== All Tasks Complete ===


In [2]:
import pandas as pd
import os
import re

# ==========================================
# 1. Cleaning & Helper Functions
# ==========================================

def is_valid_text(value):
    """Checks if the value is Alphabetic or Alphanumeric."""
    s_val = str(value).strip()
    if not s_val:
        return False
    clean_val = s_val.replace(" ", "")
    if clean_val.isalnum():
        return True
    if re.search(r'[a-zA-Z]', s_val):
        return True
    return False

def normalize_merged_cells(df, header_rows=15):
    """
    Handles merged columns/rows in the header/label area.
    Duplicates text horizontally and vertically for merged cells.
    """
    if df.empty:
        return df

    limit = min(header_rows, len(df))
    subset = df.iloc[:limit].copy()

    # Forward fill horizontally and vertically
    subset = subset.ffill(axis=1)
    subset = subset.ffill(axis=0)

    df.iloc[:limit] = subset
    return df

def process_council_sheet(df):
    """
    Applies specific cleaning steps:
    1. Populate structural columns downwards.
    2. Remove 'Grand' and 'Total' rows.
    3. Drop sparse columns.
    """
    if df.empty:
        return df

    # Identify Region Column
    region_col = None
    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if "region" in val or "mkoa" in val:
                region_col = col
                break
        if region_col is not None:
            break

    if region_col is None and not df.empty:
        region_col = df.columns[0]

    # Populate Columns Downwards (Unmerge Vertical for structural cols)
    cols_to_fill = list(df.columns[:3])
    if region_col is not None and region_col not in cols_to_fill:
        cols_to_fill.append(region_col)

    for col in cols_to_fill:
        if col in df.columns:
            df[col] = df[col].replace({0: None, '0': None})
            df[col] = df[col].ffill()

    # "Grand" Logic
    if region_col is not None and region_col in df.columns:
        grand_mask = df[region_col].astype(str).str.contains("Grand", case=False, na=False)
        if grand_mask.any():
            cutoff_idx = grand_mask.idxmax()
            df = df.loc[:cutoff_idx-1]

    # "Total" Logic (Region)
    if region_col is not None and region_col in df.columns:
        total_mask = df[region_col].astype(str).str.contains("Total", case=False, na=False)
        df = df[~total_mask]

    # "Total" Logic (Council)
    council_col = None
    council_keywords = ['council', 'halmashauri', 'district', 'lga', 'wilaya', 'municipal', 'town council']

    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if any(kw in val for kw in council_keywords):
                council_col = col
                break
        if council_col is not None:
            break

    if council_col is not None and council_col in df.columns:
        pat = "Total|Sub-Total|Sub Total"
        council_total_mask = df[council_col].astype(str).str.contains(pat, case=False, na=False)
        df = df[~council_total_mask]

    # Sparsity Logic
    cols_to_keep = []
    threshold = 0.60
    for col in df.columns:
        is_missing = df[col].isna() | df[col].isin([0, '0', ''])
        missing_pct = is_missing.mean()
        if missing_pct <= threshold:
            cols_to_keep.append(col)
    df = df[cols_to_keep]

    # Final Cleanup: Remove rows with "Total" in first few columns
    target_indices = [0, 1, 2]
    for idx in target_indices:
        if idx < len(df.columns):
            col_name = df.columns[idx]
            mask = df[col_name].astype(str).str.contains("Total", case=False, na=False)
            df = df[~mask]

    # Duplicate Region Column Check
    if region_col is not None and region_col in df.columns:
        cols_to_drop = []
        for col in df.columns:
            if col == region_col: continue
            if df[col].equals(df[region_col]):
                cols_to_drop.append(col)
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)

    return df

# ==========================================
# 2. Main Extraction and Combination Logic
# ==========================================

def extract_and_combine_all():
    base_dir = "/content/drive/MyDrive/BEST"

    # --- DEFINITION: Data Categories and Mappings ---
    # NOTE: All keys in the lists below have had spaces REMOVED as per instruction.

    # 1. Laboratories
    lab_mapping = {
        2016: ["3.26Lab", "3.27LabGov"],
        2017: ["3.36LabRegCoun", "3.37LabGovtRegCoun"],
        2018: ["3.37Lab", "3.38LabGov"],
        2019: ["3.37Lab", "3.38LabGov"],
        2020: ["3.37Lab", "3.38LabGov"],
        2021: ["T3.37Lab", "T3.38LabGov"],
        2022: ["T3.38Lab", "T3.39LabGov"],
        2023: ["T3.39LabG&NG", "T3.40LabG"],
        2024: ["T3.39LabG&NG", "T3.40LabG"],
        2025: ["T3.40LabG&NG", "T3.41LabG"]
    }

    # 2. ICT Equipment
    # Notes: Spaces removed from "Table 170" -> "Table170", "T 3.46 ICT" -> "T3.46ICT" etc.
    ict_mapping = {
        2017: ["T3.42ICTAllRegCoun", "T3.43ICTGovRegCoun"],
        2018: ["T2.42_ICT_G&N", "T2.43_ICT_G"],
        2019: ["Table170", "Table169"],
        2020: ["Table147", "Table148"],
        2021: ["Table155", "Table156"],
        2022: ["T2.43_ICT_G&N", "T2.44_ICT_G"],
        2023: ["T2.44_ICT_G&N", "T2.45_ICT_G"],
        2024: ["T2.44_ICT_G&N", "T2.45_ICT_G"],
        2025: ["T3.46ICT", "T3.47ICT_Gov"]
    }

    # Grouping them for processing
    tasks = [
        ("Laboratories", lab_mapping, "Combined_Laboratories_All_G_NG.csv", "Combined_Laboratories_Govt.csv"),
        ("ICT_Equipment", ict_mapping, "Combined_ICT_All_G_NG.csv", "Combined_ICT_Govt.csv")
    ]

    # --- EXECUTION LOOP ---

    for category_name, mapping, left_out, right_out in tasks:
        print(f"\n=== Starting Extraction for: {category_name} ===")

        left_dfs = []
        right_dfs = []

        # Track if we have processed the "first file" yet for this specific category
        processed_first_file = False

        # Sort years to ensure chronological order
        for year, sheets in sorted(mapping.items()):
            filename = f"BEST {year}.xlsx"
            file_path = os.path.join(base_dir, filename)

            if not os.path.exists(file_path):
                print(f"  Skipping {year}: File not found at {file_path}")
                continue

            print(f"  Processing {filename}...")

            try:
                xls = pd.ExcelFile(file_path)
                available_sheets = xls.sheet_names

                # Helper to process a single sheet
                def process_sheet_data(sheet_name, is_first_time):
                    # Double check if sheet exists, or try matching without case if needed
                    if sheet_name in available_sheets:
                        # Load & Clean
                        df = pd.read_excel(xls, sheet_name=sheet_name, header=None)
                        df = normalize_merged_cells(df)
                        df = process_council_sheet(df)

                        # Add Year
                        df.insert(0, 'Source_Year', year)

                        # Handle Headers: Drop rows 0-3 (first 4 rows) if NOT the first file
                        if not is_first_time:
                            df = df.iloc[4:]

                        return df
                    else:
                        print(f"    [Warning] Sheet '{sheet_name}' missing in {filename}.")
                        return None

                # --- EXTRACT LEFT ---
                df_left = process_sheet_data(sheets[0], not processed_first_file)
                if df_left is not None:
                    left_dfs.append(df_left)

                # --- EXTRACT RIGHT ---
                if len(sheets) > 1:
                    df_right = process_sheet_data(sheets[1], not processed_first_file)
                    if df_right is not None:
                        right_dfs.append(df_right)

                # Mark that we have successfully processed at least one file
                processed_first_file = True

            except Exception as e:
                print(f"  Error processing {filename}: {e}")

        # --- SAVE FILES FOR THIS CATEGORY ---
        print(f"  > Saving {category_name} files...")

        if left_dfs:
            combined_left = pd.concat(left_dfs, ignore_index=True)
            combined_left.to_csv(left_out, index=False, header=False)
            print(f"    -> Created '{left_out}' ({len(combined_left)} rows).")
        else:
            print(f"    -> No Left sheets found for {category_name}.")

        if right_dfs:
            combined_right = pd.concat(right_dfs, ignore_index=True)
            combined_right.to_csv(right_out, index=False, header=False)
            print(f"    -> Created '{right_out}' ({len(combined_right)} rows).")

    print("\n=== All Tasks Complete ===")

if __name__ == "__main__":
    extract_and_combine_all()


=== Starting Extraction for: Laboratories ===
  Processing BEST 2016.xlsx...
  Processing BEST 2017.xlsx...


 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 82.69230769 90.19607843 40.625 30 73.68421053
 61.9047619 36.36363636 70.08547009 89.81481481 80 87.5]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 82.69230769 90.19607843 40.625 30 73.68421053
 61.9047619 36.36363636 70.08547009 89.81481481 80 87.5]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.36: Number of Laboratories in Government and Non Government Schools, 2017'
 '% of Available' 'SHORTAGE' 126.0 133.0 41.0 9.0 133.0 38.0 14.0 494.0
 295.0 48.0 214.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.37: Number of Laboratories in

  Processing BEST 2018.xlsx...


 'Table 3.37: Number of Laboratories in Government and Non-Government Schools, 2018'
 'PHYSICS LABORATORIES' 'council' 124.0 143.0 49.0 12.0 164.0 45.0 23.0
 560.0 301.0 61.0 216.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.38: Number of Laboratories in Government Schools, 2018'
 'PHYSICS LABORATORIES' 'SHORTAGE' 5 1 18 5 9 4 5 47 -1 0 3]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.38: Number of Laboratories in Government Schools, 2018'
 'PHYSICS LABORATORIES' 'council' 5 1 18 5 9 4 5 351 -1 0 3]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  Processing BEST 2019.xlsx...
  Processing BEST 2020.xlsx...
  Processing BEST 2021.xlsx...
  Processing BEST 2022.xlsx...
  Processing BEST 2023.xlsx...


 'Table 3.40: Number of Science Laboratories in Government Schools, 2023'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 58 25 3 13]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.40: Number of Science Laboratories in Government Schools, 2023'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 58 25 3 13]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  Processing BEST 2024.xlsx...


 'Table 3.40: Number of Science Laboratories in Government Schools, 2024'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 1 56 16 3 8]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.40: Number of Science Laboratories in Government Schools, 2024'
 'PHYSICS LABORATORIES' 'SHORTAGE' 87.0 5.0 6.0 7.0 8.0 9.0 1 56 16 3 8]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  Processing BEST 2025.xlsx...


 'Table 3.41: Number of Science Laboratories in Government Schools, 2025'
 'PHYSICS LABORATORIES' 'SHORTAGE' 4.0 5.0 6.0 7.0 8.0 9.0 3 62 22 7 9]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset
 'Table 3.41: Number of Science Laboratories in Government Schools, 2025'
 'PHYSICS LABORATORIES' 'SHORTAGE' 90.0 5.0 6.0 7.0 8.0 9.0 3 62 22 7 9]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  > Saving Laboratories files...
    -> Created 'Combined_Laboratories_All_G_NG.csv' (1840 rows).
    -> Created 'Combined_Laboratories_Govt.csv' (1840 rows).

=== Starting Extraction for: ICT_Equipment ===
  Processing BEST 2017.xlsx...
  Processing BEST 2018.xlsx...


  subset = subset.ffill(axis=1)
 'Table 2.43: Number of ICT Equipments  in Government Schools by Type, Region and Council, 2018'
 'Radio' 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.iloc[:limit] = subset


  Processing BEST 2019.xlsx...
  Processing BEST 2020.xlsx...
  Processing BEST 2021.xlsx...
  Processing BEST 2022.xlsx...
  Processing BEST 2023.xlsx...
  Processing BEST 2024.xlsx...
  Processing BEST 2025.xlsx...
  > Saving ICT_Equipment files...
    -> Created 'Combined_ICT_All_G_NG.csv' (1657 rows).
    -> Created 'Combined_ICT_Govt.csv' (1654 rows).

=== All Tasks Complete ===


In [5]:
import pandas as pd
import os
import re
import warnings

# ==========================================
# 0. Setup: Block Warnings
# ==========================================
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

# ==========================================
# 1. Cleaning & Helper Functions
# ==========================================

def is_valid_text(value):
    """Checks if the value is Alphabetic or Alphanumeric."""
    s_val = str(value).strip()
    if not s_val:
        return False
    clean_val = s_val.replace(" ", "")
    if clean_val.isalnum():
        return True
    if re.search(r'[a-zA-Z]', s_val):
        return True
    return False

def normalize_merged_cells(df, header_rows=15):
    """
    Handles merged columns/rows in the header/label area.
    Duplicates text horizontally and vertically for merged cells.
    """
    if df.empty:
        return df

    limit = min(header_rows, len(df))
    subset = df.iloc[:limit].copy()

    # Forward fill horizontally and vertically
    subset = subset.ffill(axis=1)
    subset = subset.ffill(axis=0)

    df.iloc[:limit] = subset
    return df

def process_council_sheet(df):
    """
    Applies specific cleaning steps:
    1. Populate structural columns downwards.
    2. Remove 'Grand' and 'Total' rows.
    (Sparsity and Row checks are now moved to the final stage)
    """
    if df.empty:
        return df

    # Identify Region Column
    region_col = None
    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if "region" in val or "mkoa" in val:
                region_col = col
                break
        if region_col is not None:
            break

    if region_col is None and not df.empty:
        region_col = df.columns[0]

    # Populate Columns Downwards (Unmerge Vertical for structural cols)
    cols_to_fill = list(df.columns[:3])
    if region_col is not None and region_col not in cols_to_fill:
        cols_to_fill.append(region_col)

    for col in cols_to_fill:
        if col in df.columns:
            df[col] = df[col].replace({0: None, '0': None})
            df[col] = df[col].ffill()

    # "Grand" Logic
    if region_col is not None and region_col in df.columns:
        grand_mask = df[region_col].astype(str).str.contains("Grand", case=False, na=False)
        if grand_mask.any():
            cutoff_idx = grand_mask.idxmax()
            df = df.loc[:cutoff_idx-1]

    # "Total" Logic (Region)
    if region_col is not None and region_col in df.columns:
        total_mask = df[region_col].astype(str).str.contains("Total", case=False, na=False)
        df = df[~total_mask]

    # "Total" Logic (Council)
    council_col = None
    council_keywords = ['council', 'halmashauri', 'district', 'lga', 'wilaya', 'municipal', 'town council']

    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if any(kw in val for kw in council_keywords):
                council_col = col
                break
        if council_col is not None:
            break

    if council_col is not None and council_col in df.columns:
        pat = "Total|Sub-Total|Sub Total"
        council_total_mask = df[council_col].astype(str).str.contains(pat, case=False, na=False)
        df = df[~council_total_mask]

    # Final Cleanup: Remove rows with "Total" in first few columns
    target_indices = [0, 1, 2]
    for idx in target_indices:
        if idx < len(df.columns):
            col_name = df.columns[idx]
            mask = df[col_name].astype(str).str.contains("Total", case=False, na=False)
            df = df[~mask]

    # Duplicate Region Column Check
    if region_col is not None and region_col in df.columns:
        cols_to_drop = []
        for col in df.columns:
            if col == region_col: continue
            if df[col].equals(df[region_col]):
                cols_to_drop.append(col)
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)

    return df

def row_has_numeric(series):
    """Returns True if the row contains any numeric value."""
    for val in series:
        if isinstance(val, (int, float)) and not isinstance(val, bool):
            return True
        if isinstance(val, str):
            s = val.strip().replace(',', '')
            if s.replace('.', '', 1).isdigit():
                return True
    return False

def perform_final_cleanup(df):
    """
    Executes the final requested operations:
    1. Delete rows with only 1 cell of value (excluding the Source_Year column).
    2. Delete columns with >15% empty cells.
    """
    if df.empty:
        return df

    # --- 1. ROW CLEANUP ---
    # We check non-null count on all columns EXCEPT 'Source_Year'
    # If a row has <= 1 valid data cell, we drop it.
    cols_to_check = [c for c in df.columns if c != 'Source_Year']

    # Calculate non-nulls row-wise for data columns
    # We treat empty strings '' as Null here just in case
    temp_df = df[cols_to_check].replace('', None)
    row_counts = temp_df.notna().sum(axis=1)

    # Keep rows where we have MORE than 1 data value
    initial_rows = len(df)
    df = df[row_counts > 1]
    dropped_rows = initial_rows - len(df)
    if dropped_rows > 0:
        print(f"    (Cleaned {dropped_rows} rows having <= 1 data value)")

    # --- 2. COLUMN SPARSITY (15% Threshold) ---
    # Remove columns which have MORE than 15% empty cells
    threshold = 0.15
    initial_cols = len(df.columns)

    # Calculate null percentage
    # We treat 0 and '0' as valid values here? Usually yes, 0 is data.
    # But often empty strings are loaded as objects. Let's stick to standard NaNs/None.
    # If you consider '0' as empty, uncomment the replacement line below.
    # df_check = df.replace({0: None, '0': None, '': None})

    missing_pct = df.isna().mean()
    cols_to_keep = missing_pct[missing_pct <= threshold].index

    df = df[cols_to_keep]
    dropped_cols = initial_cols - len(df.columns)
    if dropped_cols > 0:
        print(f"    (Dropped {dropped_cols} columns with >15% empty cells)")

    return df

# ==========================================
# 2. Main Extraction and Combination Logic
# ==========================================

def extract_and_combine_all():
    base_dir = "/content/drive/MyDrive/BEST"

    # --- DEFINITION: Data Categories and Mappings ---

    # 1. Laboratories
    lab_mapping = {
        2016: ["3.26Lab", "3.27LabGov"],
        2017: ["3.36LabRegCoun", "3.37LabGovtRegCoun"],
        2018: ["3.37Lab", "3.38LabGov"],
        2019: ["3.37Lab", "3.38LabGov"],
        2020: ["3.37Lab", "3.38LabGov"],
        2021: ["T3.37Lab", "T3.38LabGov"],
        2022: ["T3.38Lab", "T3.39LabGov"],
        2023: ["T3.39LabG&NG", "T3.40LabG"],
        2024: ["T3.39LabG&NG", "T3.40LabG"],
        2025: ["T3.40LabG&NG", "T3.41LabG"]
    }

    # 2. ICT Equipment
    ict_mapping = {
        2017: ["T3.42ICTAllRegCoun", "T3.43ICTGovRegCoun"],
        2018: ["T2.42_ICT_G&N", "T2.43_ICT_G"],
        2019: ["Table170", "Table169"],
        2020: ["Table147", "Table148"],
        2021: ["Table155", "Table156"],
        2022: ["T2.43_ICT_G&N", "T2.44_ICT_G"],
        2023: ["T2.44_ICT_G&N", "T2.45_ICT_G"],
        2024: ["T2.44_ICT_G&N", "T2.45_ICT_G"],
        2025: ["T3.46ICT", "T3.47ICT_Gov"]
    }

    # 3. Electricity
    elec_mapping = {
        2017: ["T3.40SchElecAllRegCoun", "T3.41SchElecGovRegCoun"],
        2018: ["T2.44_Elect_G&N", "T2.45_Elect_G"],
        2019: ["Table165", "Table167"],
        2020: ["Table145", "Table146"],
        2021: ["Table152", "Table153"],
        2022: ["T2.41_Elect_G&N", "T2.42_Elect_G"],
        2023: ["T2.42_Elect_G&N", "T2.43_Elect_G"],
        2024: ["T2.42_Elect_G&N", "T2.43_Elect_G"],
        2025: ["T2.42_Elect_G&N", "T2.43_Elect_G"]
    }

    tasks = [
        ("Laboratories", lab_mapping, "Combined_Laboratories_All_G_NG.csv", "Combined_Laboratories_Govt.csv"),
        ("ICT_Equipment", ict_mapping, "Combined_ICT_All_G_NG.csv", "Combined_ICT_Govt.csv"),
        ("Electricity", elec_mapping, "Combined_Electricity_All_G_NG.csv", "Combined_Electricity_Govt.csv")
    ]

    for category_name, mapping, left_out, right_out in tasks:
        print(f"\n=======================================================")
        print(f" PROCESSING CATEGORY: {category_name}")
        print(f"=======================================================")

        left_dfs = []
        right_dfs = []

        processed_first_file = False

        for year, sheets in sorted(mapping.items()):
            filename = f"BEST {year}.xlsx"
            file_path = os.path.join(base_dir, filename)

            if not os.path.exists(file_path):
                print(f"Year {year}: File not found ({filename})")
                continue

            try:
                xls = pd.ExcelFile(file_path)
                available_sheets = xls.sheet_names

                # Normalize sheet names for robust lookup (ignore spaces)
                normalized_lookup = {s.replace(" ", ""): s for s in available_sheets}

                # Helper to process a single sheet
                def process_sheet_data(target_name_clean, is_first_time):
                    if target_name_clean in normalized_lookup:
                        real_sheet_name = normalized_lookup[target_name_clean]

                        # Load & Clean
                        df = pd.read_excel(xls, sheet_name=real_sheet_name, header=None)
                        df = normalize_merged_cells(df)
                        df = process_council_sheet(df)

                        # --- SMART ROW DELETION (Before adding Year) ---
                        status_suffix = ""
                        if not is_first_time:
                            check_limit = min(4, len(df))
                            top_slice = df.iloc[:check_limit]
                            rest_slice = df.iloc[check_limit:]

                            rows_to_keep = []
                            for idx in range(len(top_slice)):
                                row_data = top_slice.iloc[idx]
                                if row_has_numeric(row_data):
                                    rows_to_keep.append(top_slice.iloc[[idx]])

                            if rows_to_keep:
                                df = pd.concat(rows_to_keep + [rest_slice])
                                dropped_count = check_limit - len(rows_to_keep)
                                status_suffix = f"(Dropped {dropped_count} header rows, kept {len(rows_to_keep)} numeric rows)"
                            else:
                                df = rest_slice
                                status_suffix = f"(Dropped top {check_limit} header rows)"
                        else:
                            status_suffix = "(First file: All rows kept)"

                        # --- ADD YEAR COLUMN ---
                        df.insert(0, 'Source_Year', year)

                        print(f"  {year}: [FOUND] '{real_sheet_name}' {status_suffix}")
                        return df
                    else:
                        print(f"  {year}: [MISSING] '{target_name_clean}'")
                        return None

                # --- EXTRACT LEFT ---
                df_left = process_sheet_data(sheets[0], not processed_first_file)
                if df_left is not None:
                    left_dfs.append(df_left)

                # --- EXTRACT RIGHT ---
                if len(sheets) > 1:
                    df_right = process_sheet_data(sheets[1], not processed_first_file)
                    if df_right is not None:
                        right_dfs.append(df_right)

                # Mark success
                if df_left is not None or (len(sheets) > 1 and df_right is not None):
                    processed_first_file = True

            except Exception as e:
                print(f"  {year}: [ERROR] Processing file: {e}")

        # --- SAVE FILES FOR THIS CATEGORY ---
        print(f"\n--- Finalizing & Saving {category_name} ---")

        if left_dfs:
            combined_left = pd.concat(left_dfs, ignore_index=True)
            # PERFORM FINAL CLEANUP
            combined_left = perform_final_cleanup(combined_left)
            combined_left.to_csv(left_out, index=False, header=False)
            print(f"  -> Saved '{left_out}' ({len(combined_left)} rows)")
        else:
            print(f"  -> No data for '{left_out}'")

        if right_dfs:
            combined_right = pd.concat(right_dfs, ignore_index=True)
            # PERFORM FINAL CLEANUP
            combined_right = perform_final_cleanup(combined_right)
            combined_right.to_csv(right_out, index=False, header=False)
            print(f"  -> Saved '{right_out}' ({len(combined_right)} rows)")

    print("\n=== All Tasks Complete ===")

if __name__ == "__main__":
    extract_and_combine_all()


 PROCESSING CATEGORY: Laboratories
  2016: [FOUND] '3.26Lab' (First file: All rows kept)
  2016: [FOUND] '3.27LabGov' (First file: All rows kept)
  2017: [FOUND] '3.36LabRegCoun' (Dropped top 4 header rows)
  2017: [FOUND] '3.37LabGovtRegCoun' (Dropped top 4 header rows)
  2018: [FOUND] '3.37Lab' (Dropped top 4 header rows)
  2018: [FOUND] '3.38LabGov' (Dropped top 4 header rows)
  2019: [FOUND] '3.37Lab' (Dropped 2 header rows, kept 2 numeric rows)
  2019: [FOUND] '3.38LabGov' (Dropped 2 header rows, kept 2 numeric rows)
  2020: [FOUND] '3.37Lab' (Dropped 2 header rows, kept 2 numeric rows)
  2020: [FOUND] '3.38LabGov' (Dropped 2 header rows, kept 2 numeric rows)
  2021: [FOUND] 'T3.37Lab' (Dropped 2 header rows, kept 2 numeric rows)
  2021: [FOUND] 'T3.38LabGov' (Dropped 2 header rows, kept 2 numeric rows)
  2022: [FOUND] 'T3.38Lab' (Dropped top 4 header rows)
  2022: [FOUND] 'T3.39LabGov' (Dropped top 4 header rows)
  2023: [FOUND] 'T3.39LabG&NG' (Dropped top 4 header rows)
  2023:

In [None]:
import pandas as pd
import os
import re
import warnings

# ==========================================
# 0. Setup: Block Warnings
# ==========================================
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

# ==========================================
# 1. Cleaning & Helper Functions
# ==========================================

def is_valid_text(value):
    """Checks if the value is Alphabetic or Alphanumeric."""
    s_val = str(value).strip()
    if not s_val:
        return False
    clean_val = s_val.replace(" ", "")
    if clean_val.isalnum():
        return True
    if re.search(r'[a-zA-Z]', s_val):
        return True
    return False

def normalize_merged_cells(df, header_rows=15):
    """
    Handles merged columns/rows in the header/label area.
    Duplicates text horizontally and vertically for merged cells.
    """
    if df.empty:
        return df

    limit = min(header_rows, len(df))
    subset = df.iloc[:limit].copy()

    # Forward fill horizontally and vertically
    subset = subset.ffill(axis=1)
    subset = subset.ffill(axis=0)

    df.iloc[:limit] = subset
    return df

def process_council_sheet(df):
    """
    Applies specific cleaning steps:
    1. Populate structural columns downwards.
    2. Remove 'Grand' and 'Total' rows.
    (Sparsity and Row checks are now moved to the final stage)
    """
    if df.empty:
        return df

    # Identify Region Column
    region_col = None
    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if "region" in val or "mkoa" in val:
                region_col = col
                break
        if region_col is not None:
            break

    if region_col is None and not df.empty:
        region_col = df.columns[0]

    # Populate Columns Downwards (Unmerge Vertical for structural cols)
    cols_to_fill = list(df.columns[:3])
    if region_col is not None and region_col not in cols_to_fill:
        cols_to_fill.append(region_col)

    for col in cols_to_fill:
        if col in df.columns:
            df[col] = df[col].replace({0: None, '0': None})
            df[col] = df[col].ffill()

    # "Grand" Logic
    if region_col is not None and region_col in df.columns:
        grand_mask = df[region_col].astype(str).str.contains("Grand", case=False, na=False)
        if grand_mask.any():
            cutoff_idx = grand_mask.idxmax()
            df = df.loc[:cutoff_idx-1]

    # "Total" Logic (Region)
    if region_col is not None and region_col in df.columns:
        total_mask = df[region_col].astype(str).str.contains("Total", case=False, na=False)
        df = df[~total_mask]

    # "Total" Logic (Council)
    council_col = None
    council_keywords = ['council', 'halmashauri', 'district', 'lga', 'wilaya', 'municipal', 'town council']

    for i, row in df.head(5).iterrows():
        for col in df.columns:
            val = str(row[col]).lower()
            if any(kw in val for kw in council_keywords):
                council_col = col
                break
        if council_col is not None:
            break

    if council_col is not None and council_col in df.columns:
        pat = "Total|Sub-Total|Sub Total"
        council_total_mask = df[council_col].astype(str).str.contains(pat, case=False, na=False)
        df = df[~council_total_mask]

    # Final Cleanup: Remove rows with "Total" in first few columns
    target_indices = [0, 1, 2]
    for idx in target_indices:
        if idx < len(df.columns):
            col_name = df.columns[idx]
            mask = df[col_name].astype(str).str.contains("Total", case=False, na=False)
            df = df[~mask]

    # Duplicate Region Column Check
    if region_col is not None and region_col in df.columns:
        cols_to_drop = []
        for col in df.columns:
            if col == region_col: continue
            if df[col].equals(df[region_col]):
                cols_to_drop.append(col)
        if cols_to_drop:
            df = df.drop(columns=cols_to_drop)

    return df

def row_has_numeric(series):
    """Returns True if the row contains any numeric value."""
    for val in series:
        if isinstance(val, (int, float)) and not isinstance(val, bool):
            return True
        if isinstance(val, str):
            s = val.strip().replace(',', '')
            if s.replace('.', '', 1).isdigit():
                return True
    return False

def perform_final_cleanup(df):
    """
    Executes the final requested operations:
    1. Delete rows with only 1 cell of value (excluding the Source_Year column).
    2. Delete columns with >15% empty cells.
    """
    if df.empty:
        return df

    # --- 1. ROW CLEANUP ---
    # We check non-null count on all columns EXCEPT 'Source_Year'
    # If a row has <= 1 valid data cell, we drop it.
    cols_to_check = [c for c in df.columns if c != 'Source_Year']

    # Calculate non-nulls row-wise for data columns
    # We treat empty strings '' as Null here just in case
    temp_df = df[cols_to_check].replace('', None)
    row_counts = temp_df.notna().sum(axis=1)

    # Keep rows where we have MORE than 1 data value
    initial_rows = len(df)
    df = df[row_counts > 1]
    dropped_rows = initial_rows - len(df)
    if dropped_rows > 0:
        print(f"    (Cleaned {dropped_rows} rows having <= 1 data value)")

    # --- 2. COLUMN SPARSITY (15% Threshold) ---
    # Remove columns which have MORE than 15% empty cells
    threshold = 0.15
    initial_cols = len(df.columns)

    # Calculate null percentage
    # We treat 0 and '0' as valid values here? Usually yes, 0 is data.
    # But often empty strings are loaded as objects. Let's stick to standard NaNs/None.
    # If you consider '0' as empty, uncomment the replacement line below.
    # df_check = df.replace({0: None, '0': None, '': None})

    missing_pct = df.isna().mean()
    cols_to_keep = missing_pct[missing_pct <= threshold].index

    df = df[cols_to_keep]
    dropped_cols = initial_cols - len(df.columns)
    if dropped_cols > 0:
        print(f"    (Dropped {dropped_cols} columns with >15% empty cells)")

    return df

# ==========================================
# 2. Main Extraction and Combination Logic
# ==========================================

def extract_and_combine_all():
    base_dir = "/content/drive/MyDrive/BEST"

    # --- DEFINITION: Data Categories and Mappings ---

    # 1. Laboratories
    lab_mapping = {
        2016: ["3.26Lab", "3.27LabGov"],
        2017: ["3.36LabRegCoun", "3.37LabGovtRegCoun"],
        2018: ["3.37Lab", "3.38LabGov"],
        2019: ["3.37Lab", "3.38LabGov"],
        2020: ["3.37Lab", "3.38LabGov"],
        2021: ["T3.37Lab", "T3.38LabGov"],
        2022: ["T3.38Lab", "T3.39LabGov"],
        2023: ["T3.39LabG&NG", "T3.40LabG"],
        2024: ["T3.39LabG&NG", "T3.40LabG"],
        2025: ["T3.40LabG&NG", "T3.41LabG"]
    }

    # 2. ICT Equipment
    ict_mapping = {
        2017: ["T3.42ICTAllRegCoun", "T3.43ICTGovRegCoun"],
        2018: ["T2.42_ICT_G&N", "T2.43_ICT_G"],
        2019: ["Table170", "Table169"],
        2020: ["Table147", "Table148"],
        2021: ["Table155", "Table156"],
        2022: ["T2.43_ICT_G&N", "T2.44_ICT_G"],
        2023: ["T2.44_ICT_G&N", "T2.45_ICT_G"],
        2024: ["T2.44_ICT_G&N", "T2.45_ICT_G"],
        2025: ["T3.46ICT", "T3.47ICT_Gov"]
    }

    # 3. Electricity
    elec_mapping = {
        2017: ["T3.40SchElecAllRegCoun", "T3.41SchElecGovRegCoun"],
        2018: ["T2.44_Elect_G&N", "T2.45_Elect_G"],
        2019: ["Table165", "Table167"],
        2020: ["Table145", "Table146"],
        2021: ["Table152", "Table153"],
        2022: ["T2.41_Elect_G&N", "T2.42_Elect_G"],
        2023: ["T2.42_Elect_G&N", "T2.43_Elect_G"],
        2024: ["T2.42_Elect_G&N", "T2.43_Elect_G"],
        2025: ["T2.42_Elect_G&N", "T2.43_Elect_G"]
    }

    tasks = [
        ("Laboratories", lab_mapping, "Combined_Laboratories_All_G_NG.csv", "Combined_Laboratories_Govt.csv"),
        ("ICT_Equipment", ict_mapping, "Combined_ICT_All_G_NG.csv", "Combined_ICT_Govt.csv"),
        ("Electricity", elec_mapping, "Combined_Electricity_All_G_NG.csv", "Combined_Electricity_Govt.csv")
    ]

    for category_name, mapping, left_out, right_out in tasks:
        print(f"\n=======================================================")
        print(f" PROCESSING CATEGORY: {category_name}")
        print(f"=======================================================")

        left_dfs = []
        right_dfs = []

        processed_first_file = False

        for year, sheets in sorted(mapping.items()):
            filename = f"BEST {year}.xlsx"
            file_path = os.path.join(base_dir, filename)

            if not os.path.exists(file_path):
                print(f"Year {year}: File not found ({filename})")
                continue

            try:
                xls = pd.ExcelFile(file_path)
                available_sheets = xls.sheet_names

                # Normalize sheet names for robust lookup (ignore spaces)
                normalized_lookup = {s.replace(" ", ""): s for s in available_sheets}

                # Helper to process a single sheet
                def process_sheet_data(target_name_clean, is_first_time):
                    if target_name_clean in normalized_lookup:
                        real_sheet_name = normalized_lookup[target_name_clean]

                        # Load & Clean
                        df = pd.read_excel(xls, sheet_name=real_sheet_name, header=None)
                        df = normalize_merged_cells(df)
                        df = process_council_sheet(df)

                        # --- SMART ROW DELETION (Before adding Year) ---
                        status_suffix = ""
                        if not is_first_time:
                            check_limit = min(4, len(df))
                            top_slice = df.iloc[:check_limit]
                            rest_slice = df.iloc[check_limit:]

                            rows_to_keep = []
                            for idx in range(len(top_slice)):
                                row_data = top_slice.iloc[idx]
                                if row_has_numeric(row_data):
                                    rows_to_keep.append(top_slice.iloc[[idx]])

                            if rows_to_keep:
                                df = pd.concat(rows_to_keep + [rest_slice])
                                dropped_count = check_limit - len(rows_to_keep)
                                status_suffix = f"(Dropped {dropped_count} header rows, kept {len(rows_to_keep)} numeric rows)"
                            else:
                                df = rest_slice
                                status_suffix = f"(Dropped top {check_limit} header rows)"
                        else:
                            status_suffix = "(First file: All rows kept)"

                        # --- ADD YEAR COLUMN ---
                        df.insert(0, 'Source_Year', year)

                        print(f"  {year}: [FOUND] '{real_sheet_name}' {status_suffix}")
                        return df
                    else:
                        print(f"  {year}: [MISSING] '{target_name_clean}'")
                        return None

                # --- EXTRACT LEFT ---
                df_left = process_sheet_data(sheets[0], not processed_first_file)
                if df_left is not None:
                    left_dfs.append(df_left)

                # --- EXTRACT RIGHT ---
                if len(sheets) > 1:
                    df_right = process_sheet_data(sheets[1], not processed_first_file)
                    if df_right is not None:
                        right_dfs.append(df_right)

                # Mark success
                if df_left is not None or (len(sheets) > 1 and df_right is not None):
                    processed_first_file = True

            except Exception as e:
                print(f"  {year}: [ERROR] Processing file: {e}")

        # --- SAVE FILES FOR THIS CATEGORY ---
        print(f"\n--- Finalizing & Saving {category_name} ---")

        if left_dfs:
            combined_left = pd.concat(left_dfs, ignore_index=True)
            # PERFORM FINAL CLEANUP
            combined_left = perform_final_cleanup(combined_left)
            combined_left.to_csv(left_out, index=False, header=False)
            print(f"  -> Saved '{left_out}' ({len(combined_left)} rows)")
        else:
            print(f"  -> No data for '{left_out}'")

        if right_dfs:
            combined_right = pd.concat(right_dfs, ignore_index=True)
            # PERFORM FINAL CLEANUP
            combined_right = perform_final_cleanup(combined_right)
            combined_right.to_csv(right_out, index=False, header=False)
            print(f"  -> Saved '{right_out}' ({len(combined_right)} rows)")

    print("\n=== All Tasks Complete ===")

if __name__ == "__main__":
    extract_and_combine_all()


 PROCESSING CATEGORY: Laboratories
  2016: [FOUND] '3.26Lab' (First file: All rows kept)
  2016: [FOUND] '3.27LabGov' (First file: All rows kept)
  2017: [FOUND] '3.36LabRegCoun' (Dropped top 4 header rows)
  2017: [FOUND] '3.37LabGovtRegCoun' (Dropped top 4 header rows)
  2018: [FOUND] '3.37Lab' (Dropped top 4 header rows)
  2018: [FOUND] '3.38LabGov' (Dropped top 4 header rows)
  2019: [FOUND] '3.37Lab' (Dropped 2 header rows, kept 2 numeric rows)
  2019: [FOUND] '3.38LabGov' (Dropped 2 header rows, kept 2 numeric rows)
  2020: [FOUND] '3.37Lab' (Dropped 2 header rows, kept 2 numeric rows)
  2020: [FOUND] '3.38LabGov' (Dropped 2 header rows, kept 2 numeric rows)
  2021: [FOUND] 'T3.37Lab' (Dropped 2 header rows, kept 2 numeric rows)
  2021: [FOUND] 'T3.38LabGov' (Dropped 2 header rows, kept 2 numeric rows)
  2022: [FOUND] 'T3.38Lab' (Dropped top 4 header rows)
  2022: [FOUND] 'T3.39LabGov' (Dropped top 4 header rows)
  2023: [FOUND] 'T3.39LabG&NG' (Dropped top 4 header rows)
  2023: