In [4]:
# %pip install openpyxl  # run once if openpyxl is missing; pandas needs it to read .xlsx files

In [12]:
import pandas as pd
from itertools import combinations_with_replacement

# Location of the aerial-census workbook, relative to the notebook's working
# directory (adjust folder and filename as needed).
input_path = "../data/import/excel/Aerial_Census_Data_BPedited_foranalysis.xlsx"

# Names of the worksheets to process, one per survey year.
# Note: "2011" and "2018" are not in this list.
sheets = [
    "2005", "2006", "2007", "2008", "2009", "2010",
    "2012", "2013", "2014", "2015", "2016", "2017",
    "2019", "2020", "2021", "2022", "2023",
]

# Helper to normalise header names (case/space tolerant)
def norm(s):
    """Return a canonical form of a header name for comparison.

    The value is converted to ``str``, leading/trailing whitespace is
    stripped and the result is lower-cased. Whitespace inside the name is
    left untouched.
    """
    text = str(s)
    trimmed = text.strip()
    return trimmed.lower()

# 1) Open the workbook once and reuse the handle for every sheet.
#    The `with` block closes the underlying file as soon as the header scan
#    is finished, instead of leaving it open for the life of the kernel.
# 2) Read only the header row of each sheet and store the normalised column
#    names as a set: headers[sheet name] -> {normalised header, ...}
headers = {}
with pd.ExcelFile(input_path) as xls:
    # To scan every sheet instead of the hand-picked list:
    #sheets = xls.sheet_names  # or your custom list
    for sh in sheets:
        # nrows=0 reads just the header row; no data is loaded
        df = pd.read_excel(xls, sheet_name=sh, nrows=0)
        headers[sh] = {norm(c) for c in df.columns}

# 3) Pairwise header comparison between sheets (both matrices are symmetric):
#    same_mat[a, b] -> number of normalised headers found in both a and b
#    diff_mat[a, b] -> number of headers found in exactly one of a and b
same_mat = pd.DataFrame(0, index=sheets, columns=sheets, dtype=int)
diff_mat = same_mat.copy()

# combinations_with_replacement yields each unordered pair once, including
# (a, a), so every value is written to both mirrored cells.
for a, b in combinations_with_replacement(sheets, 2):
    n_shared = len(headers[a] & headers[b])     # intersection
    n_unshared = len(headers[a] ^ headers[b])   # symmetric difference
    for row, col in ((a, b), (b, a)):
        same_mat.loc[row, col] = n_shared
        diff_mat.loc[row, col] = n_unshared

# (Optional) Jaccard similarity matrix: |A ∩ B| / |A ∪ B| for every pair of
# sheets, stored symmetrically as floats.
jaccard = pd.DataFrame(0.0, index=sheets, columns=sheets)
for a, b in combinations_with_replacement(sheets, 2):
    left, right = headers[a], headers[b]
    n_union = len(left | right)
    # An empty union (both sheets have no headers) would divide by zero;
    # dividing by 1 instead yields a similarity of 0.0 for that pair.
    jac = len(left & right) / (n_union if n_union else 1)
    jaccard.loc[a, b] = jaccard.loc[b, a] = jac

# 4) Coverage of each sheet against the global header set, i.e. the union of
#    the normalised headers of every scanned sheet.
all_headers = set()
for sheet_headers in headers.values():
    all_headers |= sheet_headers

n_sheets = len(sheets)
n_union = len(all_headers)
coverage = pd.DataFrame(
    {
        # number of distinct normalised headers in the sheet
        "n_headers": [len(headers[name]) for name in sheets],
        # size of the global header set (same value on every row)
        "n_in_union": [n_union] * n_sheets,
        # headers that exist in some sheet but not in this one
        "n_missing_from_union": [len(all_headers - headers[name]) for name in sheets],
        # always 0 by definition: a sheet cannot hold a header outside the union
        "n_extra_beyond_union": [0] * n_sheets,
    },
    index=sheets,
)

# Show results
(same_mat, diff_mat, jaccard.round(3), coverage)






#dfs = {}  # dictionary to store DataFrames
#for sheet in sheets:
#    globals()[f"sheet_{sheet}_df"] = pd.read_excel(input_path, sheet_name=sheet)

# Now you can use:
#print(sheet_2005_df.head())
#print(sheet_2006_df.head())






(      2005  2006  2007  2008  2009  2010  2012  2013  2014  2015  2016  2017  \
 2005    16    16    16    16    16    16    16    16    16    16    16    16   
 2006    16    16    16    16    16    16    16    16    16    16    16    16   
 2007    16    16    16    16    16    16    16    16    16    16    16    16   
 2008    16    16    16    16    16    16    16    16    16    16    16    16   
 2009    16    16    16    16    16    16    16    16    16    16    16    16   
 2010    16    16    16    16    16    16    16    16    16    16    16    16   
 2012    16    16    16    16    16    16    16    16    16    16    16    16   
 2013    16    16    16    16    16    16    16    16    16    16    16    16   
 2014    16    16    16    16    16    16    16    16    16    16    16    16   
 2015    16    16    16    16    16    16    16    16    16    16    16    16   
 2016    16    16    16    16    16    16    16    16    16    16    16    16   
 2017    16    16    16    1

In [20]:
# Read every requested sheet in a single pass. Passing a list to `sheet_name`
# makes pandas open the workbook once and return a {sheet name: DataFrame}
# dict, instead of re-opening and re-parsing the .xlsx once per sheet.
sheet_frames = pd.read_excel(input_path, sheet_name=sheets)

# Collect one DataFrame per sheet, tagged with the year taken from its name.
df_list = []
for sh in sheets:
    df = sheet_frames[sh]
    try:
        # Sheet names are expected to be integer years such as "2005".
        year = int(sh)
    except ValueError:
        # Non-numeric sheet name: keep the sheet but leave its Year empty.
        year = None
    # Assigning to "Year" adds the column, or overwrites it in place when the
    # sheet already contains a column with that name.
    df["Year"] = year
    df_list.append(df)

# Concatenate everything into one frame with a fresh 0..n-1 index
all_data = pd.concat(df_list, ignore_index=True)

# Persist the combined table for later use.
# NOTE(review): presumably the ../data/pkl directory must already exist,
# since to_pickle is not expected to create missing folders; confirm.
all_data.to_pickle("../data/pkl/df_a2_aerial_cencus.pkl")

# Show preview
all_data.head()

Unnamed: 0,Date,Year,Month,Property,Sector,Block,Species,Total,Category,Remarks,ObjectId,GlobalID,CreationDate,Creator,EditDate,Editor
0,2005-09-01,2005,September,Ol Pejeta,Eastern,,Elephant,40,Mega herbivore,,9513,69a31103-37bd-4189-93c7-0de8643687d5,2021-01-22 12:56:33.973,alfred.kibungei_Olpejeta,2021-01-22 12:56:33.973,alfred.kibungei_Olpejeta
1,2005-09-01,2005,September,Ol Pejeta,Eastern,,Buffalo,243,Mega herbivore,,9514,e8c6e83a-4de6-46b5-8a9e-299fb0a9077f,2021-01-22 12:56:33.973,alfred.kibungei_Olpejeta,2021-01-22 12:56:33.973,alfred.kibungei_Olpejeta
2,2005-09-01,2005,September,Ol Pejeta,Eastern,,Black rhino,10,Mega herbivore,,9515,3da8c74b-657d-4c5d-8abe-8a80ec37f859,2021-01-22 12:56:33.973,alfred.kibungei_Olpejeta,2021-01-22 12:56:33.973,alfred.kibungei_Olpejeta
3,2005-09-01,2005,September,Ol Pejeta,Eastern,,Plains zebra,431,Mega herbivore,,9516,36b1a748-cd5f-4cde-8980-4c523d252c5e,2021-01-22 12:56:33.973,alfred.kibungei_Olpejeta,2021-01-22 12:56:33.973,alfred.kibungei_Olpejeta
4,2005-09-01,2005,September,Ol Pejeta,Eastern,,Grevy's zebra,3,Mega herbivore,,9517,2cbbb7f2-69ac-44cc-9ee4-c3a8b605192e,2021-01-22 12:56:33.973,alfred.kibungei_Olpejeta,2021-01-22 12:56:33.973,alfred.kibungei_Olpejeta
