# MNI Calculation

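This notebook prepares the occurrence and response data for MNI calculation, exports the intermediate pivot table to `pivot_df.xlsx`, and writes the per-transect results to the `Transects` sheet of `source_data.xlsx`.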

In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.append('..')
from scripts.mni import calculate_mni

# Load source data and normalise column names
df_occurrences = pd.read_pickle('../data/pkl/df_occurrences.pkl')
df_briana_with_responses = pd.read_pickle('../data/pkl/df_briana_with_responses.pkl')
df_occurrences.columns = df_occurrences.columns.str.strip()
df_briana_with_responses.columns = df_briana_with_responses.columns.str.strip()

# Coerce identifiers to integers
df_occurrences['ID'] = pd.to_numeric(df_occurrences['ID'], errors='coerce').astype('Int64')
df_briana_with_responses['OccurrenceID'] = pd.to_numeric(df_briana_with_responses['OccurrenceID'], errors='coerce').astype('Int64')

# Ensure a populated 'Taxon Label' column exists
if 'Taxon Label' not in df_occurrences.columns or df_occurrences['Taxon Label'].isna().all():
    df_occurrences['Taxon Label'] = pd.NA
    for c in ['Post: Taxon Guess?', 'Pre: Taxon']:
        if c in df_occurrences.columns:
            df_occurrences['Taxon Label'] = df_occurrences['Taxon Label'].fillna(df_occurrences[c])

# Select required columns from each table
columns_occurrences = ['ID', 'TransectUID', 'Taxon Label', 'Pre: Sex', 'Pre: Age']
columns_briana = ['OccurrenceID', 'Weathering class', 'What element is this?', 'Side', 'Complete', 'Complete?']
df_occurrences = df_occurrences.reindex(columns=columns_occurrences)
df_briana_with_responses = df_briana_with_responses.reindex(columns=columns_briana)

# Fill missing 'Complete?' values with fallback from 'Complete'
df_briana_with_responses['Complete?'] = (
    df_briana_with_responses['Complete?']
    .replace(r'^\s*$', pd.NA, regex=True)
    .fillna(df_briana_with_responses['Complete'])
)
df_briana_with_responses = df_briana_with_responses.drop(columns=['Complete'])

# Merge and keep OccurrenceID for later grouping
df = (
    df_occurrences
    .merge(
        df_briana_with_responses,
        left_on='ID',
        right_on='OccurrenceID',
        how='left',
    )
    .drop(columns=['ID'])
)
df['TransectUID'] = pd.to_numeric(df['TransectUID'], errors='coerce').astype('Int64')

# For each OccurrenceID, retain the highest weathering class
import re
def _wx_score(val):
    nums = re.findall(r'\d+', str(val))
    return max(map(int, nums)) if nums else -1
df['Weathering class'] = (
    df.groupby('OccurrenceID')['Weathering class']
    .transform(lambda s: s.loc[s.map(_wx_score).idxmax()])
)

# Keep all completed rows; for incomplete surveys, drop duplicate entries across key fields
non_no = df[df['Complete?'] != 'No']
subset_cols = [
    'TransectUID',
    'OccurrenceID',
    'Taxon Label',
    'Pre: Sex',
    'Pre: Age',
    'Weathering class',
    'What element is this?',
    'Side',
]
no_rows = df[df['Complete?'] == 'No'].drop_duplicates(subset=subset_cols)
df = pd.concat([non_no, no_rows], ignore_index=True)

# Drop helper columns
df = df.drop(columns=['OccurrenceID', 'Complete?'])

# Remove high-level taxa not needed for MNI
df = df[~df['Taxon Label'].str.lower().isin(['mammalia indet', 'ungulate'])]


In [None]:
# Pivot side counts by transect, taxon, sex, age, weathering class, and element
pivot_df = (
    df.pivot_table(
        index=["TransectUID", "Taxon Label", "Pre: Sex", "Pre: Age", "Weathering class", "What element is this?"],
        columns="Side",
        aggfunc="size",
        fill_value=0,
    )
    .rename_axis(columns=None)
    .reset_index()
)
pivot_df["TransectUID"] = pd.to_numeric(pivot_df["TransectUID"], errors="coerce").astype("Int64")

# Split counts for observations with unknown side evenly between left and right
if "unknown" in pivot_df.columns:
    pivot_df["unknown"] = np.ceil(pivot_df["unknown"] / 2).astype(int)

pivot_df.head()
from pathlib import Path
pivot_output_path = Path("../data/export/excel/pivot_df.xlsx")
pivot_output_path.parent.mkdir(parents=True, exist_ok=True)
pivot_df.to_excel(pivot_output_path, index=False)


In [None]:
# Compute MNI from the pivoted side counts using the project helper imported
# from scripts.mni. The next cell reads a 'TransectUID' column from the result
# and renames its 'MNI' column.
mni_per_transect = calculate_mni(pivot_df)
# Bare trailing expression: shown as the cell output in the notebook
mni_per_transect

In [None]:
from pathlib import Path

output_path = Path("../data/export/excel/source_data.xlsx")
if output_path.exists():
    transect_sheet = pd.read_excel(output_path, sheet_name="Transects")
else:
    transect_sheet = pd.DataFrame(columns=["TransectUID"])

if "TransectUID" not in transect_sheet.columns:
    if "UID" in transect_sheet.columns:
        transect_sheet = transect_sheet.rename(columns={"UID": "TransectUID"})
    else:
        transect_sheet["TransectUID"] = pd.NA

transect_sheet["TransectUID"] = pd.to_numeric(transect_sheet["TransectUID"], errors="coerce").astype("Int64")
mni_per_transect["TransectUID"] = pd.to_numeric(mni_per_transect["TransectUID"], errors="coerce").astype("Int64")

transect_sheet = transect_sheet.merge(mni_per_transect, on="TransectUID", how="left")
transect_sheet = transect_sheet.rename(columns={"MNI": "MNI_calc"})

output_path.parent.mkdir(parents=True, exist_ok=True)
if output_path.exists():
    with pd.ExcelWriter(output_path, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
        transect_sheet.to_excel(writer, sheet_name="Transects", index=False)
else:
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        transect_sheet.to_excel(writer, sheet_name="Transects", index=False)
