In [4]:
# Requires openpyxl to read .xlsx files; if it is missing, run once: %pip install openpyxl

In [8]:
import pandas as pd
from itertools import combinations_with_replacement

# Names of the worksheets to process (one entry per sheet)
sheets = ["Data"]
# Relative path of the Excel workbook to read; change it to match your folder layout
input_path = "../data/import/excel/MNI_Data_KLi_edited_foranalysis.xlsx"

# Canonical form of a header name, used to compare headers across sheets
def norm(s):
    """Return ``s`` as a lower-case string without surrounding whitespace.

    The value is first converted with ``str()``, so non-string headers
    (numbers, dates, ...) are accepted. Only leading and trailing whitespace
    is removed; whitespace inside the name is left untouched.
    """
    text = str(s)
    trimmed = text.strip()
    return trimmed.lower()

# 1) Open the workbook once. The context manager closes the underlying file
#    handle as soon as the headers have been read (a bare pd.ExcelFile(...)
#    would keep the file open for the lifetime of the kernel).
with pd.ExcelFile(input_path) as xls:
    # To compare every sheet of the workbook instead of the configured list,
    # set here:  sheets = xls.sheet_names

    # 2) Read only the header row of each sheet
    headers = {}
    for sh in sheets:
        # nrows=0 reads just the header row; no data is loaded
        df = pd.read_excel(xls, sheet_name=sh, nrows=0)
        headers[sh] = set(norm(c) for c in df.columns)

# 3) Build the pairwise cross tables (sheet x sheet):
#    same_mat - number of headers present in both sheets
#    diff_mat - number of headers present in exactly one of the two sheets
#    jaccard  - |A & B| / |A | B|, a similarity score between 0 and 1
same_mat = pd.DataFrame(0, index=sheets, columns=sheets, dtype=int)
diff_mat = pd.DataFrame(0, index=sheets, columns=sheets, dtype=int)
jaccard = pd.DataFrame(0.0, index=sheets, columns=sheets)

for a, b in combinations_with_replacement(sheets, 2):
    A, B = headers[a], headers[b]
    same = len(A & B)               # intersection
    different = len(A ^ B)          # symmetric difference (in A or B, but not both)
    union = len(A | B) or 1         # avoid 0/0 when neither sheet has any header
    jac = same / union

    # All three measures are symmetric, so fill both (a, b) and (b, a)
    for row, col in ((a, b), (b, a)):
        same_mat.loc[row, col] = same
        diff_mat.loc[row, col] = different
        jaccard.loc[row, col] = jac

# 4) Per-sheet coverage vs the union of all headers seen in any sheet
all_headers = set().union(*headers.values())
coverage = pd.DataFrame({
    "n_headers": [len(headers[sh]) for sh in sheets],
    "n_in_union": [len(all_headers)] * len(sheets),
    "n_missing_from_union": [len(all_headers - headers[sh]) for sh in sheets],
    "n_extra_beyond_union": [0] * len(sheets)  # always 0: every header is in the union
}, index=sheets)

# Show results
same_mat, diff_mat, jaccard.round(3), coverage

Unnamed: 0,Date,Year,Month,Property,Sector,Block,Species,Total,Category,Remarks,ObjectId,GlobalID,CreationDate,Creator,EditDate,Editor


In [10]:
# Collect one DataFrame per requested sheet
df_list = []
for sh in sheets:
    df = pd.read_excel(input_path, sheet_name=sh)
    try:
        # A purely numeric sheet name (e.g. "2011") is used as the year of
        # every row on that sheet
        df["Year"] = int(sh)
    except ValueError:
        # Non-numeric sheet name (e.g. "Data"): no year can be derived from it.
        # Keep a "Year" column the sheet already provides instead of blanking
        # it; only create an empty column when the sheet has none.
        if "Year" not in df.columns:
            df["Year"] = None
    df_list.append(df)

# Concatenate everything into a single table with a fresh 0..n-1 index
all_data = pd.concat(df_list, ignore_index=True)

# Persist the combined table as a pickle file
all_data.to_pickle("../data/pkl/df_a2_bone_cencus.pkl")

# Show preview
all_data.head()

Unnamed: 0,Date,Year,Month,Property,Sector,Block,Species,Total,Category,Remarks,ObjectId,GlobalID,CreationDate,Creator,EditDate,Editor
0,2011-07-08,,,Ol Pejeta,Eastern,48,Eudorcas thomsonii,1,Unknown-adult-1,grass closed,,,2025-08-20,Kari Lintulaakso,,
1,2011-07-08,,,Ol Pejeta,Eastern,48,Equus burchellii,1,Unknown-adult-1,grass closed,,,2025-08-20,Kari Lintulaakso,,
2,2011-07-08,,,Ol Pejeta,Eastern,48,Equus burchellii,1,Unknown-adult-1-2,grass closed,,,2025-08-20,Kari Lintulaakso,,
3,2011-07-08,,,Ol Pejeta,Eastern,48,Equus burchellii,1,Unknown-adult-2,grass closed,,,2025-08-20,Kari Lintulaakso,,
4,2011-07-10,,,Ol Pejeta,Eastern,49,Alcelaphus buselaphus,2,Male-adult-3,grass closed,,,2025-08-20,Kari Lintulaakso,,
