Commit

Merge 8e2faa1 into 5748edd

mbafford committed Oct 25, 2020
2 parents 5748edd + 8e2faa1 commit 9b88d43
Showing 9 changed files with 1,827 additions and 30 deletions.
6 changes: 4 additions & 2 deletions beancount_import/amount_parsing.py
@@ -25,18 +25,20 @@ def parse_number(x):
sign, number_str = parse_possible_negative(x)
return sign * D(number_str)

def parse_amount(x):
def parse_amount(x, assumed_currency=None):
"""Parses a number and currency."""
if not x:
return None
sign, amount_str = parse_possible_negative(x)
m = re.fullmatch(r'([\$€£])?((?:[0-9](?:,?[0-9])*|(?=\.))(?:\.[0-9]+)?)(?:\s+([A-Z]{3}))?', amount_str)
m = re.fullmatch(r'(?:[(][^)]+[)])?\s*([\$€£])?((?:[0-9](?:,?[0-9])*|(?=\.))(?:\.[0-9]+)?)(?:\s+([A-Z]{3}))?', amount_str)
if m is None:
raise ValueError('Failed to parse amount from %r' % amount_str)
if m.group(1):
currency = {'$': 'USD', '€': 'EUR', '£': 'GBP'}[m.group(1)]
elif m.group(3):
currency = m.group(3)
elif assumed_currency is not None:
currency = assumed_currency
else:
raise ValueError('Failed to determine currency from %r' % amount_str)
number = D(m.group(2))
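The fallback currency mainly matters for invoice lines that carry neither a currency symbol nor an ISO code. A minimal usage sketch, assuming `parse_amount` is imported from `beancount_import.amount_parsing`; the input strings are made up, and the parenthesized-prefix case follows from the new regex rather than from documented behavior:

    from beancount_import.amount_parsing import parse_amount

    # A currency symbol still determines the currency, e.g. 12.34 USD here.
    print(parse_amount('$12.34'))
    # With no symbol or ISO code, the caller-supplied currency is used instead.
    print(parse_amount('12.34', assumed_currency='EUR'))
    # A leading parenthesized note is skipped before the amount itself is read.
    print(parse_amount('(promotion) $12.34', assumed_currency='USD'))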
106 changes: 99 additions & 7 deletions beancount_import/source/amazon.py
@@ -35,6 +35,7 @@
dict(module='beancount_import.source.amazon',
directory=os.path.join(journal_dir, 'data/amazon'),
pickle_dir=os.path.join(journal_dir, 'data/amazon/.pickle'),
amazon_account='name@domain.com',
posttax_adjustment_accounts={
'Gift Card Amount': 'Assets:Gift-Cards:Amazon',
@@ -236,12 +237,33 @@
`beancount_import.source.amazon_invoice` module, which has a command-line
interface, to try to debug it.
Caching Parsing Results
=======================
Parsing the HTML files can be slow, so this module can use the Python pickle
module/file format to cache the result of parsing each individual HTML file.
A cached result is loaded as long as its mtime is more recent than that of the
corresponding HTML file. To enable this functionality, pass a path as the
`pickle_dir` parameter when initializing this class (see the configuration
sketch at the end of this docstring).
Skipping Older Invoices
=======================
If you only want to process invoices issued after a certain date, pass the
earliest date you want to process as the `earliest_date` configuration
parameter when initializing this class.
Determining the date of an invoice requires parsing its HTML file, so if your
data folder contains a large number of invoices that are not accounted for in
your journal, it is recommended to also use the caching/pickling mechanism
described above.
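Putting these options together, a hypothetical configuration entry enabling
both the pickle cache and the date cutoff might look like the following
(assuming `os`, `datetime`, and `journal_dir` as in the example near the top
of this docstring; the paths, account name, and date are placeholders rather
than defaults):

    dict(module='beancount_import.source.amazon',
         directory=os.path.join(journal_dir, 'data/amazon'),
         amazon_account='name@domain.com',
         # cache parsed invoices as pickle files
         pickle_dir=os.path.join(journal_dir, 'data/amazon/.pickle'),
         # skip invoices ordered before this date
         earliest_date=datetime.date(2019, 1, 1),
         posttax_adjustment_accounts={
             'Gift Card Amount': 'Assets:Gift-Cards:Amazon',
         },
    )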
"""

import collections
from typing import Dict, List, Tuple, Optional
import os
import sys
import pickle

from beancount.core.data import Transaction, Posting, Balance, Commodity, Price, EMPTY_SET, Directive
from beancount.core.amount import Amount
@@ -256,6 +278,8 @@
from . import ImportResult, Source, SourceResults, InvalidSourceReference, AssociatedData
from ..journal_editor import JournalEditor

import datetime

ITEM_DESCRIPTION_KEY = 'amazon_item_description'
ITEM_URL_KEY = 'amazon_item_url'
ITEM_BY_KEY = 'amazon_item_by'
@@ -452,12 +476,67 @@ def get_order_ids_seen(journal: JournalEditor,
order_ids.setdefault(order_id, []).append(entry)
return order_ids

class AmazonPickler():
def __init__( self, pickle_dir: str ):
self.pickle_dir = pickle_dir
if pickle_dir is not None and not os.access(pickle_dir, os.W_OK):
raise Exception("Amazon pickled invoice path is not writable: %s" % pickle_dir)

@staticmethod
def try_get_mtime( path: str ):
try:
return os.stat(path).st_mtime
except:
return None

def _build_pickle_path( self, invoice_path: str ):
invoice_dir, invoice_file = os.path.split(invoice_path)
pickle_file = invoice_file.replace(".html", ".order.p")
return os.path.join(self.pickle_dir, pickle_file)

def load( self, results: SourceResults, invoice_path: str ):
if not self.pickle_dir: return None

try:
pickle_path = self._build_pickle_path( invoice_path )
invoice_mtime = AmazonPickler.try_get_mtime( invoice_path )
pickle_mtime = AmazonPickler.try_get_mtime( pickle_path )

if invoice_mtime is None or pickle_mtime is None: return None
if pickle_mtime < invoice_mtime: return None

with open(pickle_path, "rb") as f:
return pickle.load( f )
except:
results.add_error('Failed to load pickled invoice %s: %s' % (
pickle_path, sys.exc_info()))

def dump( self, results: SourceResults, invoice_path: str, invoice: Order):
if not self.pickle_dir: return None

try:
pickle_path = self._build_pickle_path( invoice_path )

if invoice is None:
# remove existing pickles if invoice couldn't be parsed
pickle_mtime = AmazonPickler.try_get_mtime( pickle_path )
if pickle_mtime: os.remove( pickle_path )
return

with open(pickle_path, "wb") as f:
return pickle.dump( invoice, f )

except:
results.add_error('Failed to save pickled invoice %s: %s' % (
pickle_path, sys.exc_info()))
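
As a rough sketch of how this cache is intended to be used around `parse_invoice`
(mirroring the updated `_get_invoice` further below; `results` is the current
`SourceResults` and `invoice_path` is a hypothetical path to an invoice HTML file):

    pickler = AmazonPickler(pickle_dir)

    invoice = pickler.load(results, invoice_path)      # None on a cache miss or a stale pickle
    if invoice is None:
        invoice = parse_invoice(invoice_path)          # the slow HTML parse
        pickler.dump(results, invoice_path, invoice)   # refresh (or clear) the cached result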

class AmazonSource(Source):
def __init__(self,
directory: str,
amazon_account: str,
posttax_adjustment_accounts: Dict[str, str] = {},
pickle_dir: str = None,
earliest_date: datetime.date = None,
**kwargs) -> None:
super().__init__(**kwargs)
self.directory = directory
@@ -467,6 +546,9 @@ def __init__(self,
self.example_posting_key_extractors[CREDIT_CARD_DESCRIPTION_KEY] = None
self.example_posting_key_extractors[POSTTAX_DESCRIPTION_KEY] = None
self.example_transaction_key_extractors[AMAZON_ACCOUNT_KEY] = None
self.pickler = AmazonPickler(pickle_dir)

self.earliest_date = earliest_date

self.invoice_filenames = [] # type: List[Tuple[str, str]]
for filename in os.listdir(self.directory):
@@ -478,14 +560,19 @@ def __init__(self,
self._cached_invoices = {
} # type: Dict[str, Tuple[Optional[Order], str]]

def _get_invoice(self, invoice_filename: str):
def _get_invoice(self, results: SourceResults, order_id: str, invoice_filename: str):
if invoice_filename in self._cached_invoices:
return self._cached_invoices.get(invoice_filename)
path = os.path.realpath(os.path.join(self.directory, invoice_filename))
self.log_status('amazon: processing %s' % (path, ))
invoice = parse_invoice(path) # type: Optional[Order]
self._cached_invoices[invoice_filename] = invoice, path
return invoice, path
invoice_path = os.path.realpath(os.path.join(self.directory, invoice_filename))

invoice = self.pickler.load(results, invoice_path)
if invoice is None:
self.log_status('amazon: processing %s: %s' % (order_id, invoice_path, ))
invoice = parse_invoice(invoice_path) # type: Optional[Order]
self.pickler.dump( results, invoice_path, invoice )

self._cached_invoices[invoice_filename] = invoice, invoice_path
return invoice, invoice_path

def prepare(self, journal: JournalEditor, results: SourceResults):
credit_card_accounts = get_credit_card_accounts(journal)
@@ -500,13 +587,18 @@ def prepare(self, journal: JournalEditor, results: SourceResults):
for order_id, invoice_filename in self.invoice_filenames:
if order_id in order_ids_seen: continue
try:
invoice, path = self._get_invoice(invoice_filename)
invoice, path = self._get_invoice(results, order_id, invoice_filename)
except:
results.add_error('Failed to parse invoice %s: %s' % (
invoice_filename, sys.exc_info()))
continue
if invoice is None:
continue

if self.earliest_date is not None and invoice.order_date < self.earliest_date:
self.log_status("Skipping order with date [%s] before [%s]" % ( str(invoice.order_date), self.earliest_date ) )
continue

transaction = make_amazon_transaction(
invoice=invoice,
posttax_adjustment_accounts=self.posttax_adjustment_accounts,
85 changes: 64 additions & 21 deletions beancount_import/source/amazon_invoice.py
@@ -80,7 +80,7 @@
'(?:.*) Discount',
'Gift[ -]Wrap',
]) + ') *:')
posttax_adjustment_fields_pattern = r'Gift Card Amount:|Rewards Points:|Recycle Fee \$X'
posttax_adjustment_fields_pattern = r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X'


def to_json(obj):
@@ -127,12 +127,13 @@ def predicate(node):
return results


def get_adjustments_in_table(table, pattern):
def get_adjustments_in_table(table, pattern, assumed_currency=None):
adjustments = []
for label, amount_str in get_field_in_table(
table, pattern, allow_multiple=True, return_label=True):
adjustments.append(
Adjustment(amount=parse_amount(amount_str), description=label))
Adjustment(amount=parse_amount(amount_str, assumed_currency),
description=label))
return adjustments


@@ -153,6 +154,7 @@ def parse_shipments(soup) -> List[Shipment]:
'Service completed',
'Preparing for Shipment',
'Not Yet Shipped',
'Shipping now',
}

def is_shipment_header_table(node):
@@ -198,15 +200,41 @@ def is_items_ordered_header(node):
price_node = tds[1]
price = price_node.text.strip()

pattern_without_condition = r'^\s*(?P<quantity>[0-9]+)\s+of:(?P<description>.*)\n\s*(?:Sold|Provided) by:? (?P<sold_by>[^\n]+)'
price = parse_amount(price)
if price is None:
price = Amount(D(0), 'USD')

# 1 of: 365 Everyday Value, Potato Yellow Bag Organic, 48 Ounce
# 2 (1.04 lb) of: Broccoli Crowns Conventional, 1 Each
# 2.07 lb of: Pork Sausage Link Italian Mild Step 1

pattern_quantity = r'^\s*(?:(?P<quantity>[0-9]+)|(?P<weight1>[0-9.]+\s+(?:lb|kg))|(?:(?P<quantityIgnore>[0-9.]+) [(](?P<weight2>[^)]+)[)]))\s+of:'
m = re.match(pattern_quantity, description_node.text, re.UNICODE|re.DOTALL)
quantity = 1
if m is not None:
# Amazon may list, e.g., 2 broccoli crowns at $1.69/lb, but in that case the
# price column shows the total price (whereas for other items it shows the
# per-unit price). Multiplying the quantity by that price would overstate the
# item total, so if both a quantity and a weight are present, ignore the
# quantity and treat it as 1. (Alternately, the weight and the per-unit price
# could be captured and multiplied out.)
quantity = m.group("quantity") # ignore quantity for weight items

if quantity is None:
#print("Unable to extract quantity, using 1: %s" % description_node.text)
quantity = D(1)
else:
quantity = D(quantity)

text = description_node.text.split("of:",1)[1]

pattern_without_condition = r'(?P<description>.*)\n\s*(?:Sold|Provided) by:? (?P<sold_by>[^\n]+)'
pattern_with_condition = pattern_without_condition + r'\n.*\n\s*Condition: (?P<condition>[^\n]+)'

m = re.match(pattern_with_condition, description_node.text,
re.UNICODE | re.DOTALL)
m = re.match(pattern_with_condition, text, re.UNICODE | re.DOTALL)
if m is None:
m = re.match(pattern_without_condition, description_node.text,
re.UNICODE | re.DOTALL)
assert m is not None
m = re.match(pattern_without_condition, text, re.UNICODE | re.DOTALL)
if m is None:
raise Exception("Could not extract item from row", text)

description = re.sub(r'\s+', ' ', m.group('description').strip())
sold_by = re.sub(r'\s+', ' ', m.group('sold_by').strip())
try:
@@ -218,11 +246,11 @@ def is_items_ordered_header(node):
sold_by = sold_by[:-len(suffix)]
items.append(
Item(
quantity=D(m.group('quantity')),
quantity=quantity,
description=description,
sold_by=sold_by,
condition=condition,
price=parse_amount(price),
price=price,
))

items_subtotal = parse_amount(
@@ -358,27 +386,38 @@ def parse_regular_order_invoice(path: str) -> Order:
output_fields = dict()
output_fields['pretax_adjustments'] = get_adjustments_in_table(
payment_table, pretax_adjustment_fields_pattern)
amount = reduce_amounts(
a.amount for a in output_fields['pretax_adjustments'])
payment_adjustments = collections.OrderedDict() # type: Dict[str, Amount]

# older invoices put pre-tax amounts on a per-shipment basis
# new invoices only put pre-tax amounts on the overall payments section
# detect which this is
pretax_amount = reduce_amounts(
a.amount for a in output_fields['pretax_adjustments'])
shipments_pretax_amount = None

if any(s.pretax_adjustments for s in shipments):
expected_amount = reduce_amounts(
a.amount
shipments_pretax_amount = reduce_amounts(a.amount
for shipment in shipments
for a in shipment.pretax_adjustments)
if expected_amount != amount:
for a in shipment.pretax_adjustments)

if shipments_pretax_amount != pretax_amount:
errors.append(
'expected total pretax adjustment to be %s, but parsed total is %s'
% (expected_amount, amount))
% (shipments_pretax_amount, pretax_amount))

payments_total_adjustments = []
shipments_total_adjustments = []

# parse first to get an idea of the working currency
grand_total = parse_amount(
get_field_in_table(payment_table, 'Grand Total:'))

def resolve_posttax_adjustments():
payment_adjustments.update(
reduce_adjustments(
get_adjustments_in_table(payment_table,
posttax_adjustment_fields_pattern)))
posttax_adjustment_fields_pattern,
assumed_currency=grand_total.currency)))
all_shipments_adjustments = collections.OrderedDict(
reduce_adjustments(
sum((x.posttax_adjustments for x in shipments), [])))
@@ -419,11 +458,15 @@ def resolve_posttax_adjustments():

payments_total_adjustment = reduce_amounts(payments_total_adjustments)
shipments_total_adjustment = reduce_amounts(shipments_total_adjustments)
grand_total = parse_amount(
get_field_in_table(payment_table, 'Grand Total:'))

expected_total = add_amount(shipments_total_adjustment,
reduce_amounts(x.total for x in shipments))

# if no shipments pre-tax section, then the expected total isn't accounting
# for the pre-tax adjustments yet since they are only in the grand total section
if shipments_pretax_amount is None:
expected_total = add_amount(expected_total, pretax_amount)

adjusted_grand_total = add_amount(payments_total_adjustment, grand_total)
if expected_total != adjusted_grand_total:
errors.append('expected grand total is %s, but parsed value is %s' %
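The three item-heading formats quoted in the comments inside `parse_shipments`
above ("1 of: ...", "2 (1.04 lb) of: ...", "2.07 lb of: ...") are what the new
quantity pattern is meant to distinguish. A small standalone sketch of that
pattern against the same sample headings (the annotations are a reading of the
named groups, not verified program output):

    import re

    pattern_quantity = (r'^\s*(?:(?P<quantity>[0-9]+)'
                        r'|(?P<weight1>[0-9.]+\s+(?:lb|kg))'
                        r'|(?:(?P<quantityIgnore>[0-9.]+) [(](?P<weight2>[^)]+)[)]))\s+of:')

    samples = [
        '1 of: 365 Everyday Value, Potato Yellow Bag Organic, 48 Ounce',  # quantity='1'
        '2 (1.04 lb) of: Broccoli Crowns Conventional, 1 Each',           # weight2='1.04 lb', quantity ignored
        '2.07 lb of: Pork Sausage Link Italian Mild Step 1',              # weight1='2.07 lb'
    ]
    for text in samples:
        m = re.match(pattern_quantity, text, re.UNICODE | re.DOTALL)
        print(m.group('quantity'), m.group('weight1'), m.group('weight2'))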
