In [1]:
# conda create -n physical_capital_england pandas python-docx ipykernel

# External packages
import pandas as pd
import docx

# Standard library
from pathlib import Path
from dataclasses import dataclass, asdict
from collections import defaultdict
from typing import List, Dict, Tuple, Literal, DefaultDict
import re
import warnings
from math import isnan

# Extract grain info from Manorial Records (English)

## Monetary Classes

In [2]:
@dataclass(slots=True)
class PoundsShillingsPence:
    pounds: float = 0.0
    shillings: float = 0.0
    pence: float = 0.0
    
    ## Methods
    def __add__(self, other:'PoundsShillingsPence') -> 'PoundsShillingsPence':
        if isinstance(other, PoundsShillingsPence):
            return PoundsShillingsPence(self.pounds + other.pounds, self.shillings + other.shillings, self.pence + other.pence)
        else:
            raise TypeError("Unsupported operand type(s) for +: 'PoundsShillingsPence' and '{}'".format(type(other).__name__))
    
    def __iadd__(self, other:'PoundsShillingsPence') -> 'PoundsShillingsPence':
        if isinstance(other, PoundsShillingsPence):
            self.pounds += other.pounds
            self.shillings += other.shillings
            self.pence += other.pence
            return self
        else:
            raise TypeError("Unsupported operand type(s) for +: 'PoundsShillingsPence' and '{}'".format(type(other).__name__))
    
    def __mul__(self, other:int|float) -> 'PoundsShillingsPence':
        if isinstance(other, (int, float)):
            return PoundsShillingsPence(self.pounds * other, self.shillings * other, self.pence * other)
        else:
            raise TypeError("Unsupported operand type(s) for *: 'PoundsShillingsPence' and '{}'".format(type(other).__name__))
        
    def is_empty(self) -> bool:
        return self.pounds == self.shillings == self.pence == 0.0
        
        
    def return_total_amount(self, output_format:Literal["pounds", "shillings", "pence"] = 'pounds') -> float:
        """Converts coinage to the specified format (pounds, shillings, or pence), see: https://w.wiki/DL3J."""
        total_amount_pounds = self.pounds + (self.shillings / 20) + (self.pence / 240)
        assert total_amount_pounds >= 0, f"Total monetary amount should be a non-negative number, currently it is {total_amount_pounds}."
        
        if output_format == 'pounds':
            return total_amount_pounds
        elif output_format == 'shillings':
            return total_amount_pounds * 20
        elif output_format == 'pence':
            return total_amount_pounds * 240
        else:
            raise ValueError("Invalid output format. Choose 'pounds', 'shillings', or 'pence'.")
    
    # Regex for currencies in pounds, shillings and pence
    _currency_regex = re.compile(r"""
    £(?P<pounds>\d+(\.\d+)?)\s |
    (?P<shillings>\d+(\.\d+)?)s\. |
    (?P<pence>\d+(\.\d+)?)d\.
    """, re.VERBOSE)
    
    # Create PoundsShillingsPence object from a string
    @staticmethod
    def create_from_string(string: str):
        """Extracts currency from a string."""
        currency = PoundsShillingsPence()
        # Find all matches
        matches = PoundsShillingsPence._currency_regex.finditer(string)
        for match in matches:
            coinage_type = match.lastgroup
            setattr(currency, coinage_type, float(match.group(coinage_type)))
        return currency

# Quick demonstration of PoundsShillingsPence; the printed results are shown in the cell output.
if __name__ == "__main__":
    # Parsing: every component is present in the first string.
    print(PoundsShillingsPence.create_from_string("£69.7 5.2s. 11.26d."))
    # Parsing: '£' is not directly followed by a number here, so pounds stays at 0.0.
    print(PoundsShillingsPence.create_from_string("£ 5.02s. 11.26d."))
    
    # Scaling by an int and by a float multiplies each component separately.
    print(PoundsShillingsPence(69, 7, 5.2) * 2)
    print(PoundsShillingsPence(69, 7, 5.2) * 2.5)
    # Collapse to a single number (defaults to pounds).
    print(PoundsShillingsPence(69, 7, 5.2).return_total_amount())
    # Addition is component-wise; components are not carried over into larger units.
    print(PoundsShillingsPence(5, 5, 5) + PoundsShillingsPence(10, 10, 10))
    

PoundsShillingsPence(pounds=69.7, shillings=5.2, pence=11.26)
PoundsShillingsPence(pounds=0.0, shillings=5.02, pence=11.26)
PoundsShillingsPence(pounds=138, shillings=14, pence=10.4)
PoundsShillingsPence(pounds=172.5, shillings=17.5, pence=13.0)
69.37166666666666
PoundsShillingsPence(pounds=15, shillings=15, pence=15)


## General Grain Classes

In [3]:
class GrainTypes:
    VALID_GRAINS = ["wheat", "bere", "curall", "barley", "oats", "peas", "vetches", "rye", "maslin", "dredge", "beans", "mancorn", "unknown"]

    @classmethod
    def contains_valid_grain(cls, str):
        """Check if the string contains any valid grain types."""
        lower_str = str.lower()
        return any(grain in lower_str for grain in cls.VALID_GRAINS)
    
    @classmethod
    def is_valid_grain(cls, grain_type: str) -> bool:
        """Check if the grain type is valid."""
        return grain_type in cls.VALID_GRAINS
    
    @classmethod
    def get_valid_grain(cls, str: str) -> str|None:
        """Returns the first valid grain type found in the string. Or None if no valid grain type is found."""
        lower_str = str.lower()
        for grain in cls.VALID_GRAINS:
            if grain in lower_str:
                return grain
        return None
    
# Quick demonstration of GrainTypes; the printed results are shown in the cell output further down.
if __name__ == "__main__":
    # Example usage of the GrainTypes class
    test_string = "This is a test string with wheat and barley."
    print(GrainTypes.contains_valid_grain(test_string))  # Output: True
    print(GrainTypes.is_valid_grain("wheat"))  # Output: True
    
    
@dataclass(slots=True)
class GrainAmount:
    bushels: float = 0.0
    quarters: float = 0.0
    
    # Regex for bushels and quarters
    AMOUNT_REGEX = re.compile(
        r"""
        (?P<quarters>\d+(\.\d+)?)\s?qrs? |  # Matches "<number> qrs?" with named group 'quarters'.
        (?P<bushels>\d+(\.\d+)?)\s?bus     # Matches "<number> bus" with named group 'bushels'.
        """, 
        re.VERBOSE
    )
    
    def __add__(self, other:'GrainAmount') -> 'GrainAmount':
        if isinstance(other, GrainAmount):
            return GrainAmount(self.bushels + other.bushels, self.quarters + other.quarters)
        else:
            raise TypeError("Unsupported operand type(s) for +: 'GrainAmount' and '{}'".format(type(other).__name__))
        
    def __iadd__(self, other: 'GrainAmount') -> 'GrainAmount':
        if isinstance(other, GrainAmount):
            self.bushels += other.bushels
            self.quarters += other.quarters
            return self
        else:
            raise TypeError("Unsupported operand type(s) for +=: 'GrainAmount' and '{}'".format(type(other).__name__))
    
    def __imul__(self, other: int | float) -> 'GrainAmount':
        if isinstance(other, (int, float)):
            self.bushels *= other
            self.quarters *= other
            return self
        else:
            raise TypeError("Unsupported operand type(s) for *=: 'GrainAmount' and '{}'".format(type(other).__name__))
        
    def __lt__(self, other: 'GrainAmount') -> bool:
        return self.return_total_bushels() < other.return_total_bushels()
    
    def __gt__(self, other: 'GrainAmount') -> bool:
        return self.return_total_bushels() > other.return_total_bushels()
    
    def return_total_quarters(self) -> float:
        """Convert bushels and quarters to quarters see: https://www.nottingham.ac.uk/manuscriptsandspecialcollections/researchguidance/weightsandmeasures/volumes.aspx."""
        total_quarters = self.quarters + (self.bushels / 8)
        assert total_quarters > 0 or isnan(total_quarters), f"Total quarters should be a positive number or nan, currently it is: {total_quarters}."
        return total_quarters
    
    def return_total_quarters_unsafe(self) -> float:
        """Convert bushels and quarters to quarters without checking for negative values."""
        total_quarters = self.quarters + (self.bushels / 8)
        return total_quarters
    
    def return_total_bushels(self) -> float:
        """Convert bushels and quarters to bushels see: https://www.nottingham.ac.uk/manuscriptsandspecialcollections/researchguidance/weightsandmeasures/volumes.aspx."""
        total_bushels = self.bushels + (self.quarters * 8)
        assert total_bushels > 0 or isnan(total_bushels), f"Total bushels should be a positive number or nan, currently it is: {total_bushels}."
        return total_bushels
    
    def is_empty(self) -> bool:
        """Check if the grain amount is empty."""
        return self.bushels == 0 and self.quarters == 0
    
    def set_nan(self) -> None:
        """Set the bushels and quarters to nan."""
        self.bushels = float('nan')
        self.quarters = float('nan')
    
    @classmethod
    def from_string(cls, string: str) -> 'GrainAmount':
        """Extracts bushels and quarters from a string, and returns a GrainAmount object.
        NOTE: Returns an empty GrainAmount object if no bushels or quarters are found."""
        
        # If string contains "1⁄2" replace it with ".5"
        string = string.replace("1⁄2", ".5")
        
        quarters = 0.0
        bushels = 0.0
        for match in cls.AMOUNT_REGEX.finditer(string):
            if match.group("quarters"):
                if quarters != 0.0:
                    raise ValueError(f"Multiple quarters found in string: {string}.")
                quarters = float(match.group("quarters"))
            elif match.group("bushels"):
                if bushels != 0.0:
                    raise ValueError(f"Multiple bushels found in string: {string}.")
                bushels = float(match.group("bushels"))
        
        return cls(bushels, quarters)
    
    @classmethod
    def from_string_bias_first(cls, string:str) -> 'GrainAmount':
        """Extracts bushels and quarters from a string, and returns a GrainAmount object. If both bushels and quarters are found, it will bias towards the first one."""
        
        # If string contains "1⁄2" replace it with ".5"
        string = string.replace("1⁄2", ".5")
        
        quarters = 0.0
        bushels = 0.0
        for match in cls.AMOUNT_REGEX.finditer(string):
            if match.group("quarters"):
                if quarters != 0.0:
                    continue
                quarters = float(match.group("quarters"))
            elif match.group("bushels"):
                if bushels != 0.0:
                    continue
                bushels = float(match.group("bushels"))
        
        return cls(bushels, quarters)
    
    @classmethod
    def from_string_return_all(cls, string:str) -> List['GrainAmount']:
        """Extracts bushels and quarters from a string, and returns a list of GrainAmount objects."""
        
        # If string contains "1⁄2" replace it with ".5"
        string = string.replace("1⁄2", ".5")
        
        grain_amounts = []
        temp_grain_amount = GrainAmount()
        previous_end = len(string)  # Ensure that the first match does not automatically start a new GrainAmount
        
        # Find all matches
        for match in GrainAmount.AMOUNT_REGEX.finditer(string):
            # If this match is not adjacent to the previous one, start a new GrainAmount
            if match.start() - previous_end > 1:
                if temp_grain_amount.quarters is not None or temp_grain_amount.bushels is not None:
                    grain_amounts.append(temp_grain_amount)
                temp_grain_amount = GrainAmount()

            # Set the appropriate attribute
            if match.group("quarters"):
                temp_grain_amount.quarters = float(match.group("quarters"))
            elif match.group("bushels"):
                temp_grain_amount.bushels = float(match.group("bushels"))

            previous_end = match.end()

        # Append the final grain amount if it contains data
        if not temp_grain_amount.is_empty():
            grain_amounts.append(temp_grain_amount)
        
        return grain_amounts
    
# Quick demonstration of GrainAmount; the printed results are shown in the cell output.
if __name__ == "__main__":
    # Only the first quarters and first bushels amounts are kept; the later "3 bus." is ignored.
    print(GrainAmount.from_string_bias_first("Sown over 55 acres measured by the perch in the fields of Ynlond', Schepelond' and la Combe, 20 qrs 5 bus., that is, 3 bus. an acre.."))
    # max() works because GrainAmount defines __lt__/__gt__ (comparison by total bushels).
    print(max([GrainAmount(1, 2), GrainAmount(2, 2)]))
    # Adjacent amounts are merged, the separate "3 bus." becomes its own GrainAmount.
    print(GrainAmount.from_string_return_all("Sown over 55 acres measured by the perch in the fields of Ynlond', Schepelond' and la Combe, 20 qrs 5 bus., that is, 3 bus. an acre.."))
    print(sum([GrainAmount(2,3), GrainAmount(1,2), GrainAmount(3,4)], GrainAmount())) # Example of summing GrainAmount objects

True
True
GrainAmount(bushels=5.0, quarters=20.0)
GrainAmount(bushels=2, quarters=2)
[GrainAmount(bushels=5.0, quarters=20.0), GrainAmount(bushels=3.0, quarters=0.0)]
GrainAmount(bushels=6.0, quarters=9.0)


## Grain Sales classes

In [4]:
@dataclass(slots=True)
class GrainSale:
    """A single priced sale of one grain type: a per-unit price together with the amount sold."""
    grain_type: str
    unit_sale_type: str # Unit the price applies to: 'bus' or 'qrs' (GrainSales also passes 'unknown' when it cannot be determined)
    per_unit_sale_price: PoundsShillingsPence
    grain_amount: GrainAmount
    
    def __init__(self, grain_type: str, unit_sale_type: str, per_unit_pounds:float, per_unit_shillings:float, per_unit_pence:float, bushels: float, quarters: float):
        """Build the sale from plain numbers; price and amount are wrapped in their own classes.

        Raises:
            ValueError: If grain_type is not one of GrainTypes.VALID_GRAINS.
        """
        self.grain_type, self.unit_sale_type = grain_type, unit_sale_type
        self.per_unit_sale_price = PoundsShillingsPence(per_unit_pounds, per_unit_shillings, per_unit_pence)
        self.grain_amount = GrainAmount(bushels, quarters)
        # A hand-written __init__ is not followed by an automatic __post_init__, so call it explicitly.
        self.__post_init__()

    def __post_init__(self):
        """Validate the grain type of the freshly built sale."""
        if GrainTypes.is_valid_grain(self.grain_type):
            return
        raise ValueError(f"{self.grain_type} is not a valid grain type.")

    def return_total_quarters(self) -> float:
        """Amount sold, expressed in quarters (delegates to the GrainAmount)."""
        amount = self.grain_amount
        return amount.return_total_quarters()
    
    def return_total_bushels(self) -> float:
        """Amount sold, expressed in bushels (delegates to the GrainAmount)."""
        amount = self.grain_amount
        return amount.return_total_bushels()
    

@dataclass(slots=True)
class GrainStatistics: # TODO: Rename to GrainSaleStatistics
    """Summary figures for one grain type of one manorial record.

    Instances are built by GrainSales.return_grain_sales_statistics. "Written" values are the
    figures stated in the record; "calculated" values are derived from the individual sales.
    Fields may be nan when a value could not be determined.
    """
    # Calculated gross sales divided by the calculated total amount (in price_unit per amount_unit).
    average_price_per_unit: float
    # Sum of the amounts of the individual sales, in amount_unit.
    calculated_total_amount: float
    # Total amount as stated in the record, in amount_unit.
    written_total_amount: float
    # written_total_amount minus calculated_total_amount.
    amount_error: float
    # Unit of the three amount fields above.
    amount_unit: Literal["bus", "qrs"]
    # Sum over the sales of per-unit price times amount, in price_unit.
    calculated_gross_sales: float
    # Gross sales as stated in the record, in price_unit.
    written_gross_sales: float
    # written_gross_sales minus calculated_gross_sales.
    sale_error: float
    # Unit of the price and sales fields.
    price_unit: Literal["pounds", "shillings", "pence"]
    

@dataclass(slots=True)
class GrainSales:
    """Grain sales for a single manorial record"""
    # Attributes
    manor_name: str
    total_sales: PoundsShillingsPence # Gross return for all grain sales.
    
    # Default Dictionary with grain_type as key and a list of GrainSale objects as values.
    # NOTE: We use a default dict because in some cases there are multiple lines per grain type.
    sales: DefaultDict[str, List[GrainSale]]
    
    # Dictionaries with grain_type as key, storing the gross sales (in both price and amount).
    written_gross_sales: DefaultDict[str, PoundsShillingsPence]
    written_gross_amount: DefaultDict[str, GrainAmount]
    
    def __init__(self, manor_name:str):
        """Start an empty record for the given manor: no sales yet and a zero overall total."""
        # Per-grain-type containers; missing keys are created on first access.
        self.sales = defaultdict(list)
        self.written_gross_amount = defaultdict(GrainAmount)
        self.written_gross_sales = defaultdict(PoundsShillingsPence)
        
        # Record-wide values
        self.total_sales = PoundsShillingsPence()
        self.manor_name = manor_name
    
    ## Private methods (mainly helper functions)
    @staticmethod
    def _initialize_grain_sale_variables():
        """Internal helper: blank starting values for one grain sale, i.e. five numeric zeros followed by an empty unit string."""
        numeric_defaults = (0.0,) * 5
        unit_default = ("",)
        return numeric_defaults + unit_default
    
    @staticmethod
    def _split_string_on_total(string: str) -> Tuple[str, str]:
        """Split a string on the first occurrence of 'Total' (case-sensitive).

        Args:
            string (str): The text to split.

        Returns:
            Tuple[str, str]: The text before and the text after the first 'Total'. If 'Total' does
            not occur, the whole string and an empty string are returned.
        """
        # partition() never raises when the marker is missing, unlike indexing the result of split().
        before_total, _marker, after_total = string.partition("Total")
        return before_total, after_total
    
    @staticmethod
    def __init_grainsales_table(number_of_grain_types:int) -> pd.DataFrame:
        """Build the empty, pre-sized table used by `to_grain_sales_df`: one row per grain type."""
        price_columns = ["average_price_per_bushel_pounds", "average_price_per_quarter_pounds"]
        sales_columns = ["written_gross_sales_pounds", "calculated_gross_sales_pounds", "gross_sales_error"]
        bushel_columns = ["written_gross_amount_bushels", "calculated_gross_amount_bushels", "gross_amount_error_bushels"]
        quarter_columns = ["written_gross_amount_quarters", "calculated_gross_amount_quarters", "gross_amount_error_quarters"]
        column_names = ["grain_type", *price_columns, *sales_columns, *bushel_columns, *quarter_columns]
        
        # Columns and index are handed to the constructor up front, so the frame is allocated once.
        row_index = range(number_of_grain_types)
        empty_table = pd.DataFrame(columns=column_names, index=row_index)
        return empty_table.astype({"grain_type": "str"})
    
    # Regex to help extract data from a grain sale.
    # Compiled in verbose mode, so the whitespace and '#' comments inside the pattern are ignored.
    # Every alternative has its own named group; add_grain_sale reads `match.lastgroup` to tell which one matched.
    # The two unit markers ("a qr" / "a bus") carry no number; add_grain_sale maps them to the units "qrs" / "bus".
    _SALE_OF_CORN_REGEX = re.compile(r"""
        (?P<units_qrs>\ba\ qr\b) |                         # Matches "a qr".
        (?P<units_bus>\ba\ bus\b) |                        # Matches "a bus".
        £(?P<pounds>\d+(\.\d+)?)\s |                       # Matches "£<number> ".
        (?P<shillings>\d+(\.\d+)?)s\. |                    # Matches "<number>s.".
        (?P<pence>\d+(\.\d+)?)d\. |                        # Matches "<number>d.".
        (?P<quarters>\d+(\.\d+)?)\s?qrs? |                 # Matches "<number> qrs?".
        (?P<bushels>\d+(\.\d+)?)\s?bus                     # Matches "<number> bus".
        """, re.VERBOSE)
    
    ## Public methods
    def add_grain_sale(self, string:str) -> None:
        """Adds a grain sale to the manorial record, from a line in the manorial records.

        The text before the first 'from' is read as the written gross sales of the grain type, the text
        after an optional 'Total' as the total sales of the whole record, and the text in between is
        scanned for amounts, prices and units from which the individual GrainSale objects are built.
        Updates `written_gross_sales`, `written_gross_amount`, `sales` and (when 'Total' is present) `total_sales`.

        Args:
            string (str): A sentence from a 'sale of corn' paragraph in the manorial records. Should be pre-split on ';'.

        Raises:
            ValueError: If gross sales could not be extracted from the string. Caused by missing 'from', in the string.
            ValueError: If the regex has an unknown match type. Should not happen, as all match types are predefined.
            ValueError: If the grain type is rejected by GrainSale (raised by its constructor).

        Warns:
            UserWarning: If no valid grain type is found in the string (the grain type is then set to 'unknown').
            UserWarning: If no sale was found and the unit sale type could not be imputed (the sale is then
                stored with unit 'unknown' and nan values).
        """
        
        # Extract grain type
        grain_type = GrainTypes.get_valid_grain(string)
        if grain_type is None:
            grain_type = "unknown"
            warnings.warn(f"Setting grain type to unknown, for string: {string}.")
        
        ## Extracting gross grain sales (per grain type)
        # NOTE: 'from' is looked up as a plain substring, and only the first occurrence is used for the split.
        from_marker:bool = False
        if 'from' in string:
            from_marker = True
            gross_grain_sales_string , string = string.split("from", 1)
            written_gross_grain_sales_graintype = PoundsShillingsPence.create_from_string(gross_grain_sales_string)
            # Added in place, because the sales of one grain type can be spread over several lines.
            self.written_gross_sales[grain_type] += written_gross_grain_sales_graintype
        else: 
            raise ValueError("Could not extract gross grain sales due to missing 'from' in string.")
        
        ## Extracting gross grain sales for manor, and identifying grain type       
        # If 'Total' is in the string split on it and extract the total for all grain types
        if "Total" in string:
            # Extract total for all grain types
            string, total_string = self._split_string_on_total(string)
            # Extract the total for all grain types (replaces any previously stored total)
            self.total_sales = PoundsShillingsPence.create_from_string(total_string)
        
        ## Sale of corn regex matching
        # Extract type and value of the matches. NOTE: These are returned in order found from the start of the string. There should be no overlaps.
        matches: List[Tuple[str, str]] = []

        # Loop through all matches and append the type and value to the matches list.
        # The match_type is the name of the group in the regex pattern.
        for match in self._SALE_OF_CORN_REGEX.finditer(string):
            match_type = match.lastgroup
            # The two unit markers are stored under the common type 'units', with the unit as value.
            if match_type == "units_qrs":
                matches.append(("units", "qrs"))
                continue
            elif match_type == "units_bus":
                matches.append(("units", "bus"))
                continue
            else:
                value = match.group(f"{match_type}")
            matches.append((match_type, value))
        
        # Initialize variables
        grainsales_list = []
        quarters, bushels, pence, pounds, shillings, per_unit = self._initialize_grain_sale_variables()
        
        # Loop through the matches in reverse order
        # This is done to ensure that we always have a amount, unit and price for a grain sale.
        for match_type, value in reversed(matches):
            
            # If the match_type is 'quarters' or 'bushels' and per_unit is not empty we are starting at a new block (or sale)
            # NOTE: The sale is built from the values collected so far, i.e. from matches that come AFTER the current
            # amount in the text. The current amount itself is only stored below, after the reset.
            if match_type in ["quarters", "bushels"] and per_unit != "":
                
                # Create a new GrainSale object with the extracted values
                grainsales_list.append(GrainSale(grain_type, per_unit, pounds, shillings, pence, bushels, quarters))
                # Reset the variables
                quarters, bushels, pence, pounds, shillings, per_unit = self._initialize_grain_sale_variables()
            
            # Set the values for the variables based on the match match_type:
            match match_type:
                case "quarters":
                    quarters = float(value)
                case "bushels":
                    bushels = float(value)
                case "units":
                    per_unit = value
                case "pounds":
                    pounds = float(value)
                case "shillings":
                    shillings = float(value)
                case "pence":
                    pence = float(value)
                case _:
                    raise ValueError(f"Unknown match type: {match_type}")

        # Get the total amount of grain sold (is always at the start of a string after from see: "£7 from 30 qrs of barley sold, at 4s. 8d. a qr")
        # These are the amounts left over in the variables after the reverse loop.
        written_gross_amount_graintype = GrainAmount(bushels, quarters)
        
        # If the written gross amount is empty, we set it to 'nan'. In some cases this amount is not known therefore nan instead of 0.
        if written_gross_amount_graintype.is_empty():
            written_gross_amount_graintype.set_nan()
        
        # Set the written gross amount for the grain type (because this can be split across lines add in place)
        # NOTE: Adding a nan amount makes the stored written amount for this grain type nan as well.
        self.written_gross_amount[grain_type] += written_gross_amount_graintype
        
        ## Edge cases handling        
        # If the length of the list is 1, that sale amount is equal to the total amount of that grain sold.
        if len(grainsales_list) == 1:
            grainsales_list[0].grain_amount = written_gross_amount_graintype
            
        # In some cases the length is 0, example: "5s. 4d. from 2 qrs of dredge sold" or "4s. 1d. from 7 bus. of curall sold"
        # This means that a grain type was identified, but no concrete full sale was found, as the price for the amount is before the 'from'.
        # In this case we impute the price per unit, by dividing the gross price by the total amount of grain sold.
        if len(grainsales_list) == 0:
            
            # Depending if we have quarters or bushels noted we can impute per what unit type the price is.
            if quarters > 0 and bushels == 0:
                imputed_unit_sale = "qrs"
            elif bushels > 0 and quarters == 0:
                imputed_unit_sale = "bus"
            elif bushels > 0 and quarters > 0:
                # Convert to smallest unit (Bushels)
                bushels = bushels + (quarters * 8)
                quarters = 0.0
                imputed_unit_sale = "bus"
            else:
                # If the unit sale type can still not be imputed, we set it to 'unknown', with the price per unit as float('nan') for pounds, shillings, and pence.
                warnings.warn(f"Could not impute unit sale type for string: {string}, with grain type: {grain_type}, for manor: {self.manor_name}.")
                imputed_unit_sale = "unknown"
                pounds, shillings, pence, bushels, quarters = float('nan'), float('nan'), float('nan'), float('nan'), float('nan')
            
            # Calculate the price per unit (if we can!), by dividing the gross price by the total amount of grain sold.
            if imputed_unit_sale in ["bus", "qrs"]:
                
                # Extract the gross price from the written gross sales
                # NOTE(review): from_marker is always True at this point (a missing 'from' raises above),
                # so the else branch below is never taken.
                if from_marker:
                    gross_price:PoundsShillingsPence = written_gross_grain_sales_graintype
                else:
                    gross_price:PoundsShillingsPence = self.written_gross_sales[grain_type]
                
                # Calculate the price per unit in pence (pounds and shillings keep the values from the scan above)
                if imputed_unit_sale == "qrs":
                    pence = gross_price.return_total_amount("pence") / quarters
                else:
                    pence = gross_price.return_total_amount("pence") / bushels
            
            # Create and append a new GrainSale object with the imputed values
            grainsales_list.append(GrainSale(grain_type, imputed_unit_sale, pounds, shillings, pence, bushels, quarters))
        
        # Add the grain sales to the sales dictionary
        self.sales[grain_type].extend(grainsales_list)
        
    def add_grain_sale_paragraph(self, paragraph:str) -> None:
        """Add a paragraph with grain sales to the manorial record.

        The paragraph is split into one line per grain type and every non-blank line is passed to
        `add_grain_sale`. Blank segments (e.g. after a trailing ';') are skipped, as they
        contain no sale.

        Args:
            paragraph (str): A paragraph with grain sales from the manorial records.

        Raises:
            ValueError: Propagated from `add_grain_sale`, e.g. when a line does not contain 'from'.
        """
        
        # Replace 'a qr,' with 'a qr;' to ensure that the paragraph is split correctly.
        paragraph = paragraph.replace("a qr,", "a qr;")
        
        # Split the paragraph on ';', as sales per type of grain are separated by ';'.
        lines = paragraph.split(";")
        
        # Loop through the lines (containing the sales of corn)
        for line in lines:
            # A segment without any text cannot hold a sale; passing it on would only raise.
            if not line.strip():
                continue
            self.add_grain_sale(line)
    
    def return_grain_sales_statistics(self, grain_type: str, output_unit:Literal["bushels", "quarters"] = "bushels",
                                      prices_as:Literal["pounds", "shillings", "pence"] = "pounds") -> GrainStatistics:
        """Returns statistics for a specific grain type. Includes, average price per unit, total amount of grain sold, total amount of grain sold according to the written records,
        error in amount, output unit, written gross sales, calculated gross sales, and error in sales.
        Both errors are computed as written value minus calculated value.

        Args:
            grain_type (str): The grain type to return statistics for.
            output_unit (Literal["bushels", "quarters"], optional): Output amounts in bushels or quarters. Defaults to "bushels".
            prices_as (Literal["pounds", "shillings", "pence"], optional): Output sales as pounds, shillings or pence. Defaults to "pounds".

        Raises:
            KeyError: If the grain_type is not in the sales dictionary.
            ValueError: If the output_unit is not a valid output unit.
            ValueError: If the prices_as is not a valid price unit.
            AssertionError: Propagated from the amount/price conversions, e.g. when a total amount is zero.

        Returns:
            GrainStatistics: A dataclass with the statistics for the grain type, from this Manorial record.
        """
        
        # Input checks
        if grain_type not in self.sales:
            raise KeyError(f"{grain_type} is not in the sales dictionary.")
        # Translate the public unit name into the short form ('bus' / 'qrs') used in the result.
        match output_unit:
            case "bushels":
                output_unit = "bus"
            case "quarters":
                output_unit = "qrs"
            case _:
                raise ValueError(f"{output_unit} is not a valid output unit.")
        if prices_as not in ["pounds", "shillings", "pence"]:
            raise ValueError(f"{prices_as} is not a valid price unit.")
        
        # If length of sales is 0, return a mostly empty GrainStatistics object.
        # NOTE(review): This is reported with print(), whereas the rest of the class uses warnings.warn.
        if len(self.sales[grain_type]) == 0:
            print("This should not get triggered, but it did, check it.")
            return GrainStatistics(0.0, 0.0, 0.0, 0.0, output_unit, 0.0, 0.0, 0.0, prices_as)
        
        # In case the length of the grain sales is 1, we might encounter 'nan' values, so we handle this case separately.
        # We set the 'calculated' values to 'nan', as well as the amounts and return the written values for gross sales.
        if len(self.sales[grain_type]) == 1 and self.sales[grain_type][0].unit_sale_type == "unknown":
            return GrainStatistics(average_price_per_unit=float('nan'), calculated_total_amount=float('nan'), written_total_amount=float('nan'),
                                   amount_error=float('nan'), amount_unit=output_unit, calculated_gross_sales=float('nan'),
                                   written_gross_sales=self.written_gross_sales[grain_type].return_total_amount(prices_as), sale_error=float('nan'), price_unit=prices_as) 
            
        # Initialize variables
        total_sales = PoundsShillingsPence()
        total_grain_sold = GrainAmount()
        
        # loop through the sales (for the specific grain type) and calculate the total sales and total amount of grain sold. 
        for sale in self.sales[grain_type]:
            
            # Extract the amount of grain sold
            # NOTE(review): The amount is added before the skip checks below, so a sale that is skipped
            # for its 'unknown' unit still contributes its amount to the total — confirm this is intended.
            sale_grain_amount = sale.grain_amount
            total_grain_sold = sale_grain_amount + total_grain_sold
            
            # If sale grain amount is empty, we skip the sale. NOTE: This really only occurs in case the paragraph is not properly split for the grain sales per type.
            # For now this is fixed by also splitting on 'a qr,', but we leave this here in case it becomes an issue again.
            if sale_grain_amount.is_empty():
                warnings.warn(f"Skipping sale: {sale}, for manor {self.manor_name}, due to empty grain amount (bus. and qrs, == 0).")
                continue
            
            # Extract the amount in bushels or quarters and multiply with sale price per unit. Gives the total (calculated) sale value.
            # NOTE(review): There is no default case; any unit other than 'bus', 'qrs' or 'unknown' leaves sale_value untouched.
            sale_unit_sale_type = sale.unit_sale_type
            match sale_unit_sale_type:
                case "bus":
                    sale_value:PoundsShillingsPence = sale.per_unit_sale_price * sale_grain_amount.return_total_bushels() 
                case "qrs":
                    sale_value:PoundsShillingsPence = sale.per_unit_sale_price * sale_grain_amount.return_total_quarters()
                case "unknown":
                    # If the sale unit is unknown it is impossible to calculate statistics, so we skip this sale.
                    warnings.warn(f"Skipping sale: {sale}, for manor {self.manor_name}, due to unknown unit sale type.")
                    continue
                
            # Add the sale value to the total sales
            total_sales += sale_value
          
        # Calculate the average price (per unit)
        # The checked conversions assert a positive (or nan) total, so a zero total raises before the division.
        match output_unit:
            case "bus":
                total_amount = total_grain_sold.return_total_bushels()
            case "qrs":
                total_amount = total_grain_sold.return_total_quarters()

        average_price_per_unit = total_sales.return_total_amount(prices_as) / total_amount
        
        # Extract written gross sales, compare with calculated gross sales output difference as error
        # NOTE: We do not use a simple '-' here as subtracting multiple currencies is very difficult to implement.
        total_sales_written = self.written_gross_sales[grain_type]
        written_gross_sales = total_sales_written.return_total_amount(prices_as)
        calculated_gross_sales = total_sales.return_total_amount(prices_as)
        sale_error = written_gross_sales - calculated_gross_sales
        
        # Extract written gross amount, compare with calculated gross amount output difference as error
        total_amount_written_grain_amount = self.written_gross_amount[grain_type]
        match output_unit:
            case "bus":
                total_amount_written = total_amount_written_grain_amount.return_total_bushels()
            case "qrs":
                total_amount_written = total_amount_written_grain_amount.return_total_quarters()
        amount_error = total_amount_written - total_amount
        return GrainStatistics(average_price_per_unit, total_amount, total_amount_written, amount_error, output_unit, calculated_gross_sales,  written_gross_sales, sale_error, prices_as)
    
    def to_grain_sales_df(self) -> Tuple[pd.DataFrame, float]:
        """Build the per-grain sales table of this manor and sum the calculated gross sales.

        One row is written per grain type in ``self.sales``. The statistics of every grain type are
        requested twice, once with "bushels" and once with "quarters" as output unit; prices and
        gross sales are read from the bushel-based statistics.

        Returns:
            Tuple[pd.DataFrame, float]: The grain sales table, and the sum of the calculated gross
                sales over all grain types (NaN values are left out of the sum). No currency is
                passed to the statistics method, so its default applies - Pounds (£) going by the
                column names.
        """
        # Table with one preallocated row per grain type
        sales_table = self.__init_grainsales_table(len(self.sales))

        # Running sum of the calculated gross sales
        calculated_total = 0.0

        for row_number, grain_type in enumerate(self.sales):
            # Same sales, expressed once per bushel and once per quarter
            per_bushel: GrainStatistics = self.return_grain_sales_statistics(grain_type, "bushels")
            per_quarter: GrainStatistics = self.return_grain_sales_statistics(grain_type, "quarters")

            row = {
                "grain_type": grain_type,

                # Average price per unit
                "average_price_per_bushel_pounds": per_bushel.average_price_per_unit,
                "average_price_per_quarter_pounds": per_quarter.average_price_per_unit,

                # Gross sales as written in the account, as recalculated, and their difference
                "written_gross_sales_pounds": per_bushel.written_gross_sales,
                "calculated_gross_sales_pounds": per_bushel.calculated_gross_sales,
                "gross_sales_error": per_bushel.sale_error,

                # Gross amount in bushels: written, recalculated, difference
                "written_gross_amount_bushels": per_bushel.written_total_amount,
                "calculated_gross_amount_bushels": per_bushel.calculated_total_amount,
                "gross_amount_error_bushels": per_bushel.amount_error,

                # Gross amount in quarters: written, recalculated, difference
                "written_gross_amount_quarters": per_quarter.written_total_amount,
                "calculated_gross_amount_quarters": per_quarter.calculated_total_amount,
                "gross_amount_error_quarters": per_quarter.amount_error
            }
            sales_table.loc[row_number] = row

            # The bushel-based value is used for the total, as this reduces floating point errors
            # caused by dividing by 8. NaN results are not added.
            gross_sales = per_bushel.calculated_gross_sales
            if isnan(gross_sales):
                continue
            calculated_total += gross_sales

        return sales_table, calculated_total
                
if __name__ == "__main__":
    # Smoke test for GrainSales: parse one complete sales paragraph and print the parsed sales.
    test_grain_sales = GrainSales("Test Manor")

    # --- Earlier experiments, kept commented out for reference ---
    # Single sale:
    # test_grain_sales.add_grain_sale("£3 3s. 9d. from 19 qrs 1bus. of peas sold, at 3s. 4d. a qr")
    # print(test_grain_sales.sales['peas'])
    # Paragraph with several price tiers per grain type:
    # test_grain_sales.add_grain_sale_paragraph("""The same render account for £22 18s. from 71 qrs 2 bus. of wheat sold, at 6s. a qr for 25.5 qrs, and 6s. 8d. a qr for 45 qrs 6 bus.; 15s. from 3 qrs of rye sold, at 5s. a qr; 17s. 6d. from 4 qrs 3 bus. of barley sold, at 4s. a qr; £5 3s. 10d. from 44.5 qrs of oats sold, at 2s. 4d. a qr; £1 4s. 11.5d. from 8 qrs 5 bus. of beans sold, at 2s. 8d. a qr for 2 qrs 6 bus., and 3s. a qr for 5 qrs 7 bus.; £1 4s. 3d. from 7 qrs 5 bus. of peas sold, at 1 s. 4d. for .5 qr, 3s. a qr for 2.5 qrs, and 3s. 4d. a qr for 4 qrs 5 bus.; 12s. 3d. from 6 qrs 1 bus. of vetches sold, at 2s. a qr; 1s. 6d. from 2 bus. of wheat sold at the audit; 6d. from 1 bus. of barley sold at the audit. Total, £32 17s. 9.5d."""
    # )
    # print(test_grain_sales.return_grain_sales_statistics('wheat', 'quarters'))
    # print(test_grain_sales.sales)

    # Active sample: five grain types, multi-tier prices for rye and dredge, an audit sale of rye
    # without a stated unit price, and a closing "Total" clause.
    test_grain_sales.add_grain_sale_paragraph("The same renders account for £3 5s. from 19 qrs of wheat sold, at 3s. 4d. a qr; £14 19s. 7d. from 111 qrs of rye sold, at 3s. a qr for 62.5 qrs, 2s. 4d. a qr for 42 qrs, and 2s. 2d. a qr for 6.5 qrs; £11 14s. 7d. from 85 qrs of dredge sold, at 3s. a qr for 60.5 qrs, and 2s. 2d. a qr for 24.5 qrs; 8s. 10d. from 6 qrs 5 bus. of peas sold, at 1s. 4d. a qr; £5 6s. from 53 qrs of oats sold, at 2s. a qr; 1s. 10.5d. from 5 bus. of rye sold at the audit. Total, £35 15s. 10.5d.")
    # Prints the `sales` mapping: grain type -> list of GrainSale objects.
    print(test_grain_sales.sales)
    
    # # Examples for add_grain_sale NOTE: The strings are read back to front! 
    # text = "The same render account for £13 15s. 3d. from 45.454455 qrs 2.54985 bus. of wheat sold, at 5.23s. a qr for 5 qrs, \
    #     5s. 6d. a qr for 13 qrs, 6s. 4d. a qr for 6 qrs, 6s. 6d. a qr for 3 qrs, and 6s. 8d. a qr for 18 qrs"
    # test_grain_sales.add_grain_sale(text)
    # text = "£3 8s. 6d. from 17 qrs 1 bus. of barley sold, at 4s. a qr"
    # test_grain_sales.add_grain_sale(text)
    # text = " 9s. 6d. from 4 qrs 7 bus. of vetches sold, at 2s. a qr for 4.5 qrs, and 2d. a bus. for 4 bus. Total, £7 4d."
    # test_grain_sales.add_grain_sale(text)
    # test_grain_sales.add_grain_sale("4s. 1d. from 7 bus. of curall sold")
    
    # # Inspecting the parsed state and the derived statistics:
    # print("Printing results from sample data run for GrainSales:")
    # print(f"Manor Name: {test_grain_sales.manor_name}")
    # print(f"Total Sales: {test_grain_sales.total_sales}")
    # print(f"Written Gross Sales: {test_grain_sales.written_gross_sales}")
    # print(f"Written Gross Amount: {test_grain_sales.written_gross_amount}")
    # print(f"Sales: {test_grain_sales.sales}")
    # print()
    # print("Printing results from sample data run for GrainStatistics vetches:")
    # print(test_grain_sales.return_grain_sales_statistics("vetches", "quarters"))
    # print(test_grain_sales.return_grain_sales_statistics("vetches", "quarters", "shillings"))
    # print(test_grain_sales.to_grain_sales_df()[0])

defaultdict(<class 'list'>, {'wheat': [GrainSale(grain_type='wheat', unit_sale_type='qrs', per_unit_sale_price=PoundsShillingsPence(pounds=0.0, shillings=3.0, pence=4.0), grain_amount=GrainAmount(bushels=0.0, quarters=19.0))], 'rye': [GrainSale(grain_type='rye', unit_sale_type='qrs', per_unit_sale_price=PoundsShillingsPence(pounds=0.0, shillings=2.0, pence=2.0), grain_amount=GrainAmount(bushels=0.0, quarters=6.5)), GrainSale(grain_type='rye', unit_sale_type='qrs', per_unit_sale_price=PoundsShillingsPence(pounds=0.0, shillings=2.0, pence=4.0), grain_amount=GrainAmount(bushels=0.0, quarters=42.0)), GrainSale(grain_type='rye', unit_sale_type='qrs', per_unit_sale_price=PoundsShillingsPence(pounds=0.0, shillings=3.0, pence=0.0), grain_amount=GrainAmount(bushels=0.0, quarters=62.5)), GrainSale(grain_type='rye', unit_sale_type='bus', per_unit_sale_price=PoundsShillingsPence(pounds=0.0, shillings=0.0, pence=4.5), grain_amount=GrainAmount(bushels=5.0, quarters=0.0))], 'dredge': [GrainSale(grain

## Structured Document class

In [5]:
@dataclass(slots=True)
class StructuredDocument:
    """A manorial account read from a .docx file and split into titled sections.

    Bold docx paragraphs act as section titles; the text up to the next title is stored as one
    paragraph, so that ``titles[i]`` is the heading of ``paragraphs[i]``. After reading, the
    positions of the sale-of-corn section and of the grain sections of the issues of the grange
    are located.
    """
    
    # Instance variables    
    document_title: str  # Stem of the .docx file name.
    titles: List[str]  # Section titles, parallel to `paragraphs` ("" when a section has no title).
    paragraphs: List[str]  # Section bodies; the docx paragraphs of one section are joined with a newline.
    issue_of_the_grange_index: int|None  # Last title index at which the issues of the grange can start, or None.
    issue_of_the_grange_indices: List[int]  # Indices (from issue_of_the_grange_index on) of titles containing a valid grain.
    sale_of_corn_index: int|None  # Index of the 'Sale of corn' section (or its grain-title fallback), or None.
    
    # Class variables
    SENTENCE_END_REGEX = re.compile(r'(?<=[.,])\s+(?=[A-Z])') # Regex to split the paragraph into sentences. NOTE: this finds a full stop or comma followed by a space and a capital letter.
    
    def __init__(self, docx_path:Path):
        """Read the document and locate the sections of interest.

        Args:
            docx_path (Path): Path to the .docx file; its stem becomes the document title.
        """
        self._extract_paragraphs(docx_path)
        self._get_issues_of_the_grange_indices()
        self._get_sale_of_corn_index()
    
    # Internal methods        
    def _is_bold(self, paragraph:docx.text.paragraph.Paragraph) -> bool:
        """Check if an entire paragraph is bold.

        Only runs that carry text are inspected. A paragraph without any such run also returns
        True (``all`` over an empty sequence); `_extract_paragraphs` skips those because their
        title is empty. Runs that only inherit boldness from a style (``run.bold`` is None) count
        as not bold.
        """
        return all(run.bold for run in paragraph.runs if run.text.strip())  # Ensure non-empty bold text

    def _is_partly_bold(self, paragraph:docx.text.paragraph.Paragraph) -> bool:
        """Check if a paragraph contains any bold text."""
        return any(run.bold for run in paragraph.runs if run.text.strip())

    @staticmethod
    def _split_runs_by_boldness(paragraph) -> Tuple[str, str]:
        """Return the (bold text, regular text) of a paragraph that mixes both.

        Run texts are concatenated unchanged and only the two results are stripped. Stripping each
        run separately would glue words together whenever the word processor splits a sentence
        into several runs ("Wheat " + "from tithes" -> "Wheatfrom tithes").
        """
        bold_parts: List[str] = []
        regular_parts: List[str] = []
        target = regular_parts  # Bucket of the most recent run that carried text.
        for run in paragraph.runs:
            if run.text.strip():
                target = bold_parts if run.bold else regular_parts
            # Whitespace-only runs follow the preceding text run, whatever their own formatting,
            # so the space between two words of the same kind is kept.
            target.append(run.text)
        return "".join(bold_parts).strip(), "".join(regular_parts).strip()

    def _store_section(self, title: str|None, body_lines: List[str]) -> None:
        """Append one finished section (title and joined body lines) to `titles` and `paragraphs`."""
        self.titles.append(title if title else "")
        self.paragraphs.append('\n'.join(body_lines))
    
    def _extract_paragraphs(self, doc_path: Path) -> None:
        """
        Extracts paragraphs from a .docx file, using bold lines as titles.
        A bold line without text is skipped, so the text that follows it stays in the current section.
        A line that is only partly bold starts a new section: its bold text is the title, the rest
        the first line of the section body.
        Sets two lists in self: one for titles and one for paragraphs.
        """
        doc = docx.Document(doc_path)
        self.document_title = doc_path.stem  # Use the file name as the document title
        self.titles = []
        self.paragraphs = []
        
        current_title = None
        current_paragraph = []

        # Loop through the docx paragraphs and create lists for titles and paragraphs
        for paragraph in doc.paragraphs:
            # If the paragraph is bold, treat it as a title
            if self._is_bold(paragraph): 
                new_title = paragraph.text.strip()
                if not new_title:  # If the bold title is empty, skip this docx paragraph
                    continue

                if current_paragraph:  # Save the previous section
                    self._store_section(current_title, current_paragraph)
                    current_paragraph = []
                current_title = new_title  # Store the new title
            
            # If part of the paragraph is bold save it as a paragraph. See EastMeonChurch.docx "Wheat from tithes" section.
            elif self._is_partly_bold(paragraph):
                if current_paragraph:  # Save the previous section
                    self._store_section(current_title, current_paragraph)
                    current_paragraph = []
                
                # The bold text becomes the title, the regular text opens the new section body.
                current_title, regular_text = self._split_runs_by_boldness(paragraph)
                current_paragraph.append(regular_text)
                
            # If a regular line is encountered, add it to the current paragraph
            else:
                line = paragraph.text.strip()
                if line:  # Avoid empty lines
                    current_paragraph.append(line)

        # Append the last section if it exists
        if current_paragraph:
            self._store_section(current_title, current_paragraph)
            
        assert len(self.titles) == len(self.paragraphs), "Mismatch between titles and paragraphs lengths."

    def _get_issue_of_the_grange_index(self) -> None:
        """Finds the index of where the issue of the grange paragraphs can start.

        Sets `issue_of_the_grange_index` to the LAST title that equals one of the known headers,
        or to None when no such title exists.
        """
        # Start from None explicitly instead of probing the (possibly unset) slot afterwards.
        self.issue_of_the_grange_index = None
        for i, title in enumerate(self.titles):
            if title in {"Total of all receipts", "Issues of the grange", "Total of all expenses"}:
                self.issue_of_the_grange_index = i 
                # NOTE: While breaking here appears smart, it causes issues in for instance the Adderbury file. Where all three of the headers can be found.
                # The issue is that the _get_issues_of_the_grange_indices() method will start looking at the first index, and find the Barley expenses paragraph.
                # We avoid this by looping until the end of the titles, so we can find the last possible starting index to look for the issues of the grange.
    
    def _get_issues_of_the_grange_indices(self) -> None:
        """Finds the indices of the issues of the grange paragraphs.

        Collects every title index, from `issue_of_the_grange_index` onwards, whose title contains
        a valid grain. The list stays empty when no start index was found.
        """
        
        # Get issue of the grange index
        self._get_issue_of_the_grange_index()
        
        # Initialize the list of indices
        self.issue_of_the_grange_indices = []
        
        # Without a start index there is no issues-of-the-grange section to scan
        if self.issue_of_the_grange_index is None:
            return None
        start_index = self.issue_of_the_grange_index
        
        # Loop through the titles and find all indices of the issue of the grange section
        for i, title in enumerate(self.titles[start_index:], start=start_index):
            if GrainTypes.contains_valid_grain(title):
                self.issue_of_the_grange_indices.append(i)   
            
    def _get_sale_of_corn_index(self) -> None:
        """Finds the index of the 'Sale of corn' title in the list of titles.

        When that exact title is absent, the first title containing a valid grain that lies before
        the issues of the grange section is used instead; None when neither exists.
        """
        
        # Base case set sale_of_corn_index to None
        self.sale_of_corn_index = None
        
        try:
            self.sale_of_corn_index = self.titles.index("Sale of corn")
        except ValueError:
            # If we cannot find 'Sale of corn', we check for grain types in the titles before the 'Issue of the grange' section.
            limit_index = len(self.titles) if self.issue_of_the_grange_index is None else self.issue_of_the_grange_index
            for i, title in enumerate(self.titles[:limit_index]):
                # The first title containing a grain type is taken as the sale of corn section.
                if GrainTypes.contains_valid_grain(title):
                    self.sale_of_corn_index = i
                    break
    
    # External methods
    @staticmethod
    def split_sentences(paragraph:str) -> List[str]:
        """Splits a paragraph into sentences, using SENTENCE_END_REGEX as the separator."""
        return StructuredDocument.SENTENCE_END_REGEX.split(paragraph)

## Issues of the grange classes

In [6]:
@dataclass(slots=True)
class GrainYield:
    """Grain yield for a specific grain type."""
    
    ## Instance variables
    raw_string: str
    fold: float  # How many times more the yield is than the sown amount.
    fold_modifier: 'GrainAmount'  # How many more/less grain in bushels or quarters there is than the fold calculation indicates.
    
    ## Class variables
    WORD_TO_NUMBER = {
        "half": 0.5, 
        "one": 1,
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "seven": 7,
        "eight": 8,
        "nine": 9,
        "ten": 10,
        "fold": 0.0  # Placeholder for fold, as this is not a number.
    }
    
    # Class regex patterns
    NUMBER_REGEX = re.compile(r"^\d+(\.\d+)?$") # Regex to match numbers (including decimals).
    PERCENTAGE_REGEX = re.compile(r'(\d)(%)|%') # Regex to replace % with 0.5, unless there is a number before it then replace it with .5. OCR fix.
    HALF_REGEX = re.compile(r'(\d)(1⁄2)|1⁄2')  # Regex to replace '1⁄2' with 0.5, unless there is a number before it then replace it with .5. OCR fix.
    YIELD_REGEX = re.compile(r'[\(\{\[]yield:\s*(.*?)[\)\}\]]') # Regex to extract the yield from the string.

    def __init__(self, string: str):
        """Initializes the GrainYield object and extracts the fold from the yield substring."""
        
        # Set the raw string to the input string, and initialize fold.
        self.raw_string = string
        self.fold = 0.0
        
        # If the string is empty set fold_modifier to an empty GrainAmount object, and return as is.
        if string == "":
            self.fold_modifier = GrainAmount()
            return
        
        # If the string contains 'as sown' we can set 1.0 as the fold, with empty fold_modifier.
        if 'as sown' in string:
            self.fold += 1.0
            self.fold_modifier = GrainAmount()
            return
        
        # Continue on to the most common cases where we have a fold and a modifier.
        
        # Replace '%' or '1⁄2' with '0.5' or '.5' depending on the context. Fixes OCR errors.
        string = self._replace_percent(string)
        string = self._replace_half(string)
        
        # Split the string on '-', and extract the fold amount.
        for part in string.split(" ")[0].split("-"):
            if self.NUMBER_REGEX.match(part):
                self.fold += float(part)
            else:
                self.fold += self.WORD_TO_NUMBER.get(part, 0.0)
        
        # Find the modifier in the string, and extract the amount of bushels or quarters.
        self.fold_modifier = GrainAmount.from_string_bias_first(string)
    
        # Check if the string contains 'less' and modify the modifier accordingly.
        if 'less' in string and not self.fold_modifier.is_empty():
            self.fold_modifier *= -1.0
    
    ## Internal methods
    def _replace_percent(self, text):
        return self.PERCENTAGE_REGEX.sub(lambda m: f"{m.group(1)}.5" if m.group(1) else '0.5', text)
    def _replace_half(self, text):
        return self.HALF_REGEX.sub(lambda m: f"{m.group(1)}.5" if m.group(1) else '0.5', text)

if __name__ == "__main__":
    # Print the parsed GrainYield for a set of representative yield notes: plain folds,
    # spelled-out and decimal folds, OCR artefacts for a half ('%' and '1⁄2'), strings that still
    # carry the 'yield:' prefix, and a note with trailing remarks.
    for yield_note in (
        "as sown",
        "two-fold less 5.5 qrs",
        "",
        "five-fold plus 2 qrs 61⁄2 bus.",
        "five-fold less 2 qrs 1⁄2 bus.",
        "two-fold less 5.5 qrs 0.5 bus.",
        "five-and-a-half-fold less 2% qrs",
        "7.5-fold less % bus. with the remainder in the stack",
        "yield: three-and-a-half-fold plus 2 qrs 7 bus.",
        "[yield: 23 qrs 1 bus. more than sown)",
        "seven-fold less 1 qr with 35 qrs by estimation remaining in the stack",
    ):
        print(GrainYield(yield_note))

GrainYield(raw_string='as sown', fold=1.0, fold_modifier=GrainAmount(bushels=0.0, quarters=0.0))
GrainYield(raw_string='two-fold less 5.5 qrs', fold=2.0, fold_modifier=GrainAmount(bushels=-0.0, quarters=-5.5))
GrainYield(raw_string='', fold=0.0, fold_modifier=GrainAmount(bushels=0.0, quarters=0.0))
GrainYield(raw_string='five-fold plus 2 qrs 61⁄2 bus.', fold=5.0, fold_modifier=GrainAmount(bushels=6.5, quarters=2.0))
GrainYield(raw_string='five-fold less 2 qrs 1⁄2 bus.', fold=5.0, fold_modifier=GrainAmount(bushels=-0.5, quarters=-2.0))
GrainYield(raw_string='two-fold less 5.5 qrs 0.5 bus.', fold=2.0, fold_modifier=GrainAmount(bushels=-0.5, quarters=-5.5))
GrainYield(raw_string='five-and-a-half-fold less 2% qrs', fold=5.5, fold_modifier=GrainAmount(bushels=-0.0, quarters=-2.5))
GrainYield(raw_string='7.5-fold less % bus. with the remainder in the stack', fold=7.5, fold_modifier=GrainAmount(bushels=-0.5, quarters=-0.0))
GrainYield(raw_string='yield: three-and-a-half-fold plus 2 qrs 7 bus.

In [18]:
@dataclass(slots=True)
class IssuesOfTheGrange:
    """Stores issues of the grange for a specific manor, including yields, issues, bought amounts, inbound and outbound amounts, and balances.

    Every grain paragraph is added once through `add_issue_from_paragraph`; all dictionaries are
    keyed by the lower-cased grain type.
    """
    
    # Instance variables
    manor_name: str
    yields: Dict[str, GrainYield] # Yields for each grain type, stored as a GrainYield object.
    issues: Dict[str, GrainAmount]  # Amount grown on the manor stored per grain type, stored as GrainAmount.
    
    # Variables to store all the raw encountered amounts in a list per grain type.
    inbound_raw: DefaultDict[str, List[GrainAmount]]  # Inbound amounts that are not issues or bought, stored per grain type, stored as list of GrainAmount.
    outbound_raw: DefaultDict[str, List[GrainAmount]]  # Outbound amounts (one per sentence after the first), stored per grain type, stored as list of GrainAmount.
    
    # Stats of interest for the manor per grain type.
    inbound_total: Dict[str, GrainAmount]  # Total amount of grain inbound, stored per grain type, stored as GrainAmount.
    inbound_other: Dict[str, GrainAmount]  # Amount that is not issues or bought, stored per grain type, stored as single GrainAmount.
    inbound_bought: Dict[str, GrainAmount]  # Amount bought by the manor stored per grain type, stored as GrainAmount.
    inbound_net: Dict[str, float] # Net inbound amount of grain for each grain type, stored as float (number of quarters).
    outbound_total: Dict[str, GrainAmount]  # Total amount of grain outbound, stored per grain type, stored as GrainAmount.
    
    def __init__(self, manor_name:str):
        """Create an empty record for one manor.

        Args:
            manor_name (str): Name of the manor, used in warnings and error messages.
        """
        self.manor_name = manor_name
        self.yields = {}
        self.issues = {}
        
        self.inbound_raw = defaultdict(list)
        self.outbound_raw = defaultdict(list)
        
        self.inbound_total = {}
        self.inbound_other = {}
        self.inbound_bought = {}
        self.inbound_net = {}
        self.outbound_total = {}
        
    ## Internal methods
    def _extract_yield(self, paragraph:str, grain_type:str) -> str:
        """Extracts the yield from a paragraph and returns the modified paragraph.

        Args:
            paragraph (str): Paragraph with the yield.
            grain_type (str): Grain type.

        Returns:
            str: The text after the yield note, or the unchanged paragraph when it has no yield note.
            
        Raises:
            ValueError: If the grain type is already in the yields dictionary.
        """
        
        # Raise ValueError if the GrainType is already in the yields dictionary.
        if grain_type in self.yields:
            raise ValueError(f"{grain_type} is already in the yields dictionary.")
        
        # Without a yield note an empty GrainYield is stored and the paragraph is returned as is.
        yield_match = GrainYield.YIELD_REGEX.search(paragraph)
        if yield_match is None:
            self.yields[grain_type] = GrainYield("")
            return paragraph
        
        # Store the parsed yield, and keep only the text that follows the yield note.
        self.yields[grain_type] = GrainYield(yield_match.group(1))
        return paragraph[yield_match.end():]
    
    def _extract_issue_from_first_sentence(self, first_sentence:str, grain_type:str) -> None:
        """Extracts the issue from the first sentence and stores it in the issues dictionary. Also extracts any bought amounts present.
        Finally stores any remaining inbound amounts in the inbound dictionary.

        Args:
            first_sentence (str): The first sentence of the paragraph.
            grain_type (str): The grain type.

        Raises:
            ValueError: If no grain amounts are found in the first sentence and it does not state 'nothing'.
        """
        
        # Extract all grain amounts from the first sentence
        first_sentence_amounts = GrainAmount.from_string_return_all(first_sentence)

        if not first_sentence_amounts:
            # A sentence without amounts is only accepted when it states that nothing came in;
            # the empty defaults at the end of this method then apply.
            if 'nothing' not in first_sentence.lower():
                raise ValueError(
                    f"No grain amounts found in first sentence: {first_sentence} "
                    f"in manor: {self.manor_name} for grain type: {grain_type}."
                )

        # Case 1: Single amount and sentence mentions 'issue' — it's the issue amount.
        elif len(first_sentence_amounts) == 1 and 'issue' in first_sentence:
            self.issues[grain_type] = first_sentence_amounts[0]

        # Case 2: Several amounts, or a single amount that is not an issue — classify every part of the sentence.
        else:
            # Split sentence by ';' if present, otherwise by ','
            delimiter = ';' if ';' in first_sentence else ','
            for part in first_sentence.split(delimiter):
                amount = GrainAmount.from_string_bias_first(part)
                if 'issue' in part:
                    self.issues[grain_type] = amount
                elif 'bought' in part:
                    self.inbound_bought[grain_type] = amount
                # Parts without any amount are not recorded, in line with the outbound amounts.
                elif not amount.is_empty():
                    self.inbound_raw[grain_type].append(amount)
                    
        # Case 3: No 'issue' in the first sentence. Either grain from previous years or just bought. Set issue to empty.
        # The same defaults apply to the bought and other inbound amounts.
        self.issues.setdefault(grain_type, GrainAmount())
        self.inbound_bought.setdefault(grain_type, GrainAmount())
        self.inbound_raw.setdefault(grain_type, [])
        
    ## Public methods
    def add_issue_from_paragraph(self, title:str, paragraph:str) -> None:
        """Adds issues of the grange from a paragraph to inbound and outbound dictionaries.

        Titles that are not a valid grain type, or that were already added, are skipped with a warning.

        Args:
            title (str): Paragraph title.
            paragraph (str): Paragraph text.

        Raises:
            ValueError: If no sentences are found in the paragraph, if the first sentence holds no
                grain amounts (and does not state 'nothing'), or if a yield is already stored for
                the grain type.
        """
        
        # Strip title and transform into lowercase.
        grain_type = title.strip().lower()
        if not GrainTypes.is_valid_grain(grain_type):
            warnings.warn(f"Skipping {grain_type} in {self.manor_name} as it is not a valid grain type.")
            return
        if grain_type in self.issues:
            warnings.warn(f"Skipping {grain_type} in {self.manor_name} as it has already been encountered.")
            return
        
        # Extract the yield from the paragraph and store it in the yields dictionary. Return paragraph without the yield part.
        paragraph = self._extract_yield(paragraph, grain_type)
        
        # Split the sentences
        split_sentences = StructuredDocument.split_sentences(paragraph)
        if len(split_sentences) == 0:  # Defensive: the first sentence is required below.
            raise ValueError(f"No sentences found in paragraph: {paragraph} in manor: {self.manor_name}.")
        
        # NOTE: We split the methodology on the number of sentences, in most cases the first sentence is inbound and everything after is outbound.
        
        ## Inbound
        # Extract the issue from the first sentence and store it in the issues dictionary.
        self._extract_issue_from_first_sentence(split_sentences[0], grain_type)
        
        # If the second sentence contains 'total' this is the total inbound amount of grain for that year.
        if len(split_sentences) > 1 and 'total' in split_sentences[1].lower():
            self.inbound_total[grain_type] = GrainAmount.from_string(split_sentences[1])
        # Else we calculate the total inbound from combining the inbound, bought, and issues amounts.
        else:
            self.inbound_total[grain_type] = self.issues[grain_type] + self.inbound_bought[grain_type]
            for amount in self.inbound_raw[grain_type]:
                self.inbound_total[grain_type] += amount
        
        ## Outbound
        for sentence in split_sentences[1:]:
            # If the sentence contains 'total' we skip it, as it is already handled (or it is the total of outbound which we do not need).
            if 'total' in sentence.lower():
                continue
            
            # Extract the amount from the sentence and store it in the outbound dictionary.
            outbound_amount = GrainAmount.from_string_bias_first(sentence)
            if outbound_amount.is_empty():
                continue
            self.outbound_raw[grain_type].append(outbound_amount)
            
        # Calculate the total outbound amount of grain for that year.
        self.outbound_total[grain_type] = sum(self.outbound_raw[grain_type], start=GrainAmount())
        
        # If outbound_total is empty, and the paragraph contains 'all sold' or 'none remains', set it to the total inbound amount.
        lowered_paragraph = paragraph.lower()
        if self.outbound_total[grain_type].is_empty() and ('all sold' in lowered_paragraph or 'none remains' in lowered_paragraph):
            # Adding an empty amount yields a separate object with the same value, so a later
            # in-place change of one total cannot leak into the other.
            self.outbound_total[grain_type] = self.inbound_total[grain_type] + GrainAmount()
        
        # Calculate inbound other
        self.inbound_other[grain_type] = sum(self.inbound_raw[grain_type], start=GrainAmount())
        
        # Calculate the net inbound amount of grain for that year: total minus other minus bought, in quarters.
        self.inbound_net[grain_type] = self.inbound_total[grain_type].return_total_quarters_unsafe() \
            - self.inbound_other[grain_type].return_total_quarters_unsafe() \
            - self.inbound_bought[grain_type].return_total_quarters_unsafe()

        
if __name__ == "__main__":
    # Smoke tests for IssuesOfTheGrange: each sample is parsed into a fresh object, after which
    # every field of that object is printed.

    # Sample 1 (bere): yield note in round brackets, first sentence with an issue, two other
    # inbound amounts and a bought amount, followed by a written total and several deliveries.
    grange_issues_test = IssuesOfTheGrange('Test Manor')
    grange_issues_test.add_issue_from_paragraph(title="bere", paragraph="""
                                                (yield: two-fold less 3.5 qrs)
The same render account for 17 qrs 1 bus. from the whole issue of bere by struck measure; 4 bus. by estimation in 67 sheaves delivered for the reaping of 33.5 acres by labour service, at 2 sheaves an acre; 10 qrs 6 bus. bought; 5 bus. of wheat mixed for
sowing. Total, 29 qrs.
Sown over 81 acres as they lie in Hathfeld, 15 qrs 1 bus.; that is, 1 bus. an acre, less' bus. in all. Delivered to 1 carter, who takes a quarter every 10 weeks for the year, 5 qrs 1.5 bus. Delivered to 1 oxherd from Michaelmas (29 September) until Sunday after the feast of St Matthew the Apostle (25 February), for 21 weeks 3 days, 2 qrs 1 bus. Delivered to 1 shepherd from Michaelmas until the Purification (2 February), for 18 weeks, 1 qr 6 bus. Delivered to 1 dairymaid for the year, who takes a quarter every 12 weeks, 4 qrs 2.5 bus. Delivered to the reapers by estimation in 67 sheaves, 4 bus.
"""
    )
    
    for key, value in asdict(grange_issues_test).items():
        print(f"{key}: {value}")

    # Sample 2 (oats): yield note with mismatched brackets ('{' ... ')'), amounts in whole quarters.
    grange_issues_test = IssuesOfTheGrange('Test Manor')
    grange_issues_test.add_issue_from_paragraph("oats", """{yield: two-fold less 10.5 qrs)
The same render account for 164 qrs from the whole issue of oats by struck measure; 26 qrs by estimation in sheaves given to oxen; 15 qrs bought. Total, 205 qrs.
Sown over 195 acres 1 perch which lie in the great field, 97 qrs 5 bus.; that is, 0.5 qr an acre measured by the perch. In fodder of 4 cart-horses, 28 qrs. In fodder of 12 plough-horses between Christmas and Saturday before the feast of St George (21 April), 9 qrs 1 bus. In fodder of oxen by estimation in sheaves, 26 qrs. In fodder of the steward's horses for his visits, 2 bus. In fodder of the bailiff's horses for his visits, 2 qrs. Sold, 42 qrs.
""")
    print()
    for key, value in asdict(grange_issues_test).items():
        print(f"{key}: {value}")
    
    # Sample 3 (peas): '%' as OCR artefact in the yield note, grain remaining from the previous
    # account next to this year's issue, and a closing remark after "Total as above".
    grange_issues_test = IssuesOfTheGrange('Test Manor')
    grange_issues_test.add_issue_from_paragraph("peas", """{yield: 7.5-fold less % bus. with the remainder in the stack}
The same render account for 6 qrs of peas remaining from the previous account; 1 qr 5 bus. from this year's issue. Total, 7 qrs 5 bus.
Sown over 2 acres, 2 bus. In supporting pigs in winter, 1.5 qrs. Sold within, 5 qrs. Total as above. 3 qrs by estimation remain in the stack.
""")
    print()
    for key, value in asdict(grange_issues_test).items():
        print(f"{key}: {value}")
    
    # Sample 4 (oats): no yield note, only a bought amount, "None remains", and trailing text
    # about horses that belongs to a different part of the account.
    grange_issues_test = IssuesOfTheGrange('Test Manor')
    grange_issues_test.add_issue_from_paragraph("oats", """The same render account for 22 qrs 6 bus. of oats bought as noted within. Total, 22 qrs 6 bus. All in fodder of the mill-horse. None remains.
Horses: 1 remains. Total, 1. 1 remains.
""")
    print()
    for key, value in asdict(grange_issues_test).items():
        print(f"{key}: {value}")
        
    # Sample 5 (wheat): decimal amounts (".5 bus.", "6.5 qrs 2.5 bus.") and four kinds of inbound grain.
    grange_issues_test = IssuesOfTheGrange('Test Manor')
    grange_issues_test.add_issue_from_paragraph("wheat", """{yield: three-fold plus 1.5 bus.}
The same render account for 12 qrs 7 bus. from the whole issue of wheat by struck measure; 1 qr by estimation in 2 acres delivered to the reeve and hayward according to custom; 2 qrs 5.5 bus. received from churchscot; 4.5 qrs bought. Total, 21 qrs .5 bus.
Sown over 48 acres as they lie in the field of la Worth', 6.5 qrs 2.5 bus., that is, 1 bus. an acre, plus 5.5 bus. in all. Delivered to 1 reeve and 1 hayward by estimation in 2 acres according to custom, 1 qr. In customary payment to the hayward, 1 bus. 
Sold, 13 qrs 1 bus.
""")
    print()
    for key, value in asdict(grange_issues_test).items():
        print(f"{key}: {value}")

manor_name: Test Manor
yields: {'bere': {'raw_string': 'two-fold less 3.5 qrs', 'fold': 2.0, 'fold_modifier': {'bushels': -0.0, 'quarters': -3.5}}}
issues: {'bere': {'bushels': 1.0, 'quarters': 17.0}}
inbound_raw: defaultdict(<class 'list'>, {'bere': [{'bushels': 4.0, 'quarters': 0.0}, {'bushels': 5.0, 'quarters': 0.0}]})
outbound_raw: defaultdict(<class 'list'>, {'bere': [{'bushels': 1.0, 'quarters': 15.0}, {'bushels': 1.5, 'quarters': 5.0}, {'bushels': 1.0, 'quarters': 2.0}, {'bushels': 6.0, 'quarters': 1.0}, {'bushels': 2.5, 'quarters': 4.0}, {'bushels': 4.0, 'quarters': 0.0}]})
inbound_total: {'bere': {'bushels': 0.0, 'quarters': 29.0}}
inbound_other: {'bere': {'bushels': 9.0, 'quarters': 0.0}}
inbound_bought: {'bere': {'bushels': 6.0, 'quarters': 10.0}}
inbound_net: {'bere': 17.125}
outbound_total: {'bere': {'bushels': 16.0, 'quarters': 27.0}}

manor_name: Test Manor
yields: {'oats': {'raw_string': 'two-fold less 10.5 qrs', 'fold': 2.0, 'fold_modifier': {'bushels': -0.0, 'quarters

## Reader Class

In [None]:
@dataclass(slots=True)
class ManorialRecordsReader:
    """Class to read data from OCRed manorial records in .docx format.

    On construction the reader collects the .docx files found in ``folder`` and wraps
    each one in a ``StructuredDocument``. ``extract_and_save_data`` then extracts the
    'Sale of corn' and 'Issues of the grange' data from every document and writes the
    results to two Excel workbooks.

    Note: the class defines its own ``__init__``, so the dataclass decorator does not
    generate one; the annotated fields below mainly declare the slots.
    """
    
    parent_folder: Path  # Folder containing the .docx files
    docx_files: List[Path]  # .docx files selected from parent_folder
    structured_documents: List[StructuredDocument]  # One StructuredDocument per entry in docx_files
    
    # Grain sales data
    manor_tables_sales:Dict[str, pd.DataFrame]  # manor name -> per-manor grain sales table
    overview_table_sales: pd.DataFrame  # One row per document: written vs. calculated sales total
    raw_table_sales: pd.DataFrame  # One row per document: the GrainSales object itself
    
    # Issues of the grange data
    manor_issues_tables: Dict[str, pd.DataFrame]  # manor name -> per-grain issues table
    
    def __init__(self, folder: str|Path):
        """Initialize the ManorialRecordsReader with a folder containing .docx files.

        Args:
            folder (str | Path): Path to the folder containing .docx files.

        Raises:
            AssertionError: If no usable .docx file is found in ``folder``.
        """
        # Init paths
        self.parent_folder = Path(folder)
        self._extract_structured_documents(self.parent_folder)
        
        ## Init structures to store extracted data
        # Initialize dict to store the dataframes, and two dataframes to store the overview and raw data for the grain sales.
        self.manor_tables_sales:Dict[str, pd.DataFrame] = dict()
        self.overview_table_sales = pd.DataFrame(columns=["manor_name", "total_sales_written_pounds", "total_sales_calculated_pounds",
                                            "total_sales_error"]).astype({"manor_name": "str"})
        self.raw_table_sales = pd.DataFrame(columns=["manor_name", "raw_sales"]).astype({"manor_name": "str"})
        
        # Structures for issues of the grange data.
        self.manor_issues_tables: Dict[str, pd.DataFrame] = dict()


    ## Internal methods
    def _get_docx_files(self, folder: Path) -> List[Path]:
        """Get all .docx files in a folder, sorted by path.

        Files with '_' in their names and files whose names start with '~$'
        (presumably Word's temporary lock files) are excluded.

        Args:
            folder (Path): Folder to scan (not recursive).

        Returns:
            List[Path]: The selected files in sorted order.
        """
        # Path.iterdir() yields entries in arbitrary order; sorting keeps the row indices
        # of the overview table and the sheet order of the workbooks reproducible.
        return sorted(
            file for file in folder.iterdir()
            if file.suffix == ".docx" and "_" not in file.name and not file.name.startswith("~$")
        )
    
    def _extract_structured_documents(self, folder: Path) -> None:
        """Extracts structured documents from the .docx files in the folder.

        Populates ``self.docx_files`` and ``self.structured_documents``.

        Raises:
            AssertionError: If the folder contains no usable .docx file.
        """
        self.docx_files = self._get_docx_files(folder)
        assert len(self.docx_files) > 0, f"No .docx files found in {folder}."
        self.structured_documents = [StructuredDocument(docx_file) for docx_file in self.docx_files]
     
    def _extract_grain_sales(self, doc: StructuredDocument, table_index:int) -> None:
        """Method to extract grain sales from a structured document.
        This method is called for each document in the list of structured documents.

        Stores the per-manor sales table in ``self.manor_tables_sales`` and adds one row
        (at label ``table_index``) to both ``self.overview_table_sales`` and
        ``self.raw_table_sales``. Documents without a 'Sale of corn' paragraph are
        skipped with a warning.

        Args:
            doc (StructuredDocument): StructuredDocument object containing the document title, titles, and paragraphs.
            table_index (int): Index of the document in the list of structured documents.
        """
        
        # Get the manor_name
        manor_name = doc.document_title
                    
        # If no grain sales paragraph was found, we skip the file
        if doc.sale_of_corn_index is None:
            warnings.warn(f"Sale of corn not found for manor: {manor_name} using grain type matching method or searching for 'Sale of corn'. Skipping file.")
            return None
        
        # Create GrainSales object using the document title as the manor name
        manor_grainsales = GrainSales(manor_name)
        
        # Add the data from the Sale of corn paragraph to the manor_grainsales object
        manor_grainsales.add_grain_sale_paragraph(doc.paragraphs[doc.sale_of_corn_index])
        
        # Extract the data to a DataFrame, and get the total sales
        manor_df, total_sales_calculated = manor_grainsales.to_grain_sales_df()
        self.manor_tables_sales[manor_name] = manor_df
        
        # Get total sales written in Pounds (£)
        total_sales_written = manor_grainsales.total_sales.return_total_amount()
        
        # Calculate the difference between the calculated and written total sales
        total_sales_error = total_sales_written - total_sales_calculated
        
        # Add the overview to the overview_table
        self.overview_table_sales.loc[table_index] = {
            "manor_name": manor_grainsales.manor_name,
            "total_sales_written_pounds": total_sales_written,
            "total_sales_calculated_pounds": total_sales_calculated,
            "total_sales_error": total_sales_error
        }    
        # Add the raw data to the raw_table
        self.raw_table_sales.loc[table_index] = {
            "manor_name": manor_grainsales.manor_name,
            "raw_sales": manor_grainsales
        }
        return None
    
    def _extract_issues_of_the_grange(self, doc: StructuredDocument) -> None:
        """Method to extract issues of the grange from a structured document. Stores inbound, outbound, balance, issue (amount grown), and bought amounts in quarters per grain type.

        The resulting table is stored in ``self.manor_issues_tables`` under the document
        title. Documents without any 'Issues of the grange' paragraph are skipped with a
        warning.

        Args:
            doc (StructuredDocument): StructuredDocument object containing the document title, titles, and paragraphs.

        """
        
        # Get the manor_name
        manor_name = doc.document_title
        
        # If no issues of the grange starting paragraph was found, we skip the file
        if len(doc.issue_of_the_grange_indices) == 0:
            warnings.warn(f"Issues of the grange not found for manor: {manor_name}. Skipping file.")
            return None
            
        # Create IssuesOfTheGrange object using the document title as the manor name
        manor_issues = IssuesOfTheGrange(manor_name)
        
        # Add the data from the issues of the grange paragraphs to the manor_issues object
        for i in doc.issue_of_the_grange_indices:
            manor_issues.add_issue_from_paragraph(doc.titles[i], doc.paragraphs[i])

        # Column name -> dtype of the per-manor table. The key order is also the column order.
        column_dtypes = {
            "grain_type": "str",
            "inbound_total": "float",
            "inbound_other": "float",
            "inbound_bought": "float",
            "inbound_net": "float",
            "outbound_total": "float",
            "yield": "object"  # Keep yield as object to store GrainYield objects
        }
            
        # Collect one record per grain type
        issues_data = []
        for grain_type in manor_issues.inbound_total.keys():
            issues_data.append({
                "grain_type": grain_type,
                "inbound_total": manor_issues.inbound_total[grain_type].return_total_quarters_unsafe(),
                "inbound_other": manor_issues.inbound_other[grain_type].return_total_quarters_unsafe(),
                "inbound_bought": manor_issues.inbound_bought[grain_type].return_total_quarters_unsafe(),
                "inbound_net": manor_issues.inbound_net[grain_type],
                "outbound_total": manor_issues.outbound_total[grain_type].return_total_quarters_unsafe(),
                # NOTE(review): presumably a paragraph without a '{yield: ...}' marker leaves no
                # entry in `yields` -- confirm against IssuesOfTheGrange. `.get` keeps the row
                # (with an empty yield) instead of raising KeyError in that case.
                "yield": manor_issues.yields.get(grain_type)
            })

        # Create DataFrame. The columns are passed explicitly so that an empty record list
        # still yields a frame with the expected columns; otherwise `astype` raises KeyError
        # because the dtype mapping names columns that do not exist.
        self.manor_issues_tables[manor_name] = pd.DataFrame(
            issues_data, columns=list(column_dtypes)
        ).astype(column_dtypes)

                
    def _write_to_excel(self, output_path: Path) -> None:
        """Writes the overview, raw data and Manorial data to an Excel file.

        Two workbooks are created inside ``output_path``: 'sales_of_corn.xlsx' (sheets
        'overview', 'raw_data' and one sheet per manor) and 'issues_of_the_grange.xlsx'
        (one sheet per manor). The second workbook is skipped, with a warning, when no
        issues data was extracted.

        Args:
            output_path (Path): Existing folder to write the workbooks into.
        """
        # NOTE(review): manor names are used unmodified as sheet names. Excel limits sheet
        # names to 31 characters and forbids some characters, so unusual document titles
        # may be rejected by the writer engine.
        
        # Write grain sales data to an Excel file    
        with pd.ExcelWriter(output_path / "sales_of_corn.xlsx") as writer:
            self.overview_table_sales.to_excel(writer, sheet_name='overview', index=False)
            self.raw_table_sales.to_excel(writer, sheet_name='raw_data', index=False)
            for manor_name, df in self.manor_tables_sales.items():
                df.to_excel(writer, sheet_name=manor_name, index=False)
        
        # A workbook without any sheet cannot be saved by every writer engine, so an
        # empty result is reported instead of crashing after the sales file was written.
        if not self.manor_issues_tables:
            warnings.warn("No issues of the grange data was extracted. Skipping issues_of_the_grange.xlsx.")
            return None
        
        with pd.ExcelWriter(output_path / "issues_of_the_grange.xlsx") as writer:
            for manor_name, df in self.manor_issues_tables.items():
                df.to_excel(writer, sheet_name=manor_name, index=False)
    
    ## External methods
    def extract_and_save_data(self, output_path:str|Path):
        """Extracts data from the structured documents and saves it to an Excel file.

        Args:
            output_path (str | Path): Path to an output folder.

        Raises:
            ValueError: If the output path is not a valid folder.
        """
        
        # Initialize output path
        output_path = Path(output_path)
        # If output path is not an existing folder raise an error.
        if not output_path.is_dir():
            raise ValueError(f"Output path folder {output_path} does not exist.")
        
        # Loop through each structured document
        for i, doc in enumerate(self.structured_documents):
            
            # Extract the grain sales from the document
            self._extract_grain_sales(doc, i)
            
            # Extract 'Issues of the grange' from the document
            self._extract_issues_of_the_grange(doc)
            
        # Write the data to an Excel file
        self._write_to_excel(output_path)

if __name__ == "__main__":
    # Example usage of the ManorialRecordsReader class.
    # Change this to the folder containing your .docx files!
    input_folder = r"C:\Users\kubak\OneDrive - Wageningen University & Research\WUR\2024-2025\research_assistant_ox\workfolder\input\OCR 1301\\"
    # Change this to your desired output folder!
    output_folder = r"C:\Users\kubak\Documents\GitHub\student_assistant_manorial_records\output\OCR 1301-1302\python transcription\issue_of_the_grange"

    # Read every record in the input folder, then export the extracted tables.
    reader = ManorialRecordsReader(folder=input_folder)
    reader.extract_and_save_data(output_folder)

