Commit

Merge 8e2faa1 into 5748edd

mbafford committed Oct 25, 2020
2 parents 5748edd + 8e2faa1 commit 9b88d43
Showing 9 changed files with 1,827 additions and 30 deletions.
6 changes: 4 additions & 2 deletions beancount_import/amount_parsing.py
@@ -25,18 +25,20 @@ def parse_number(x):
sign, number_str = parse_possible_negative(x)
return sign * D(number_str)

def parse_amount(x):
def parse_amount(x, assumed_currency=None):
"""Parses a number and currency."""
if not x:
return None
sign, amount_str = parse_possible_negative(x)
m = re.fullmatch(r'([\$€£])?((?:[0-9](?:,?[0-9])*|(?=\.))(?:\.[0-9]+)?)(?:\s+([A-Z]{3}))?', amount_str)
m = re.fullmatch(r'(?:[(][^)]+[)])?\s*([\$€£])?((?:[0-9](?:,?[0-9])*|(?=\.))(?:\.[0-9]+)?)(?:\s+([A-Z]{3}))?', amount_str)
if m is None:
raise ValueError('Failed to parse amount from %r' % amount_str)
if m.group(1):
currency = {'$': 'USD', '€': 'EUR', '£': 'GBP'}[m.group(1)]
elif m.group(3):
currency = m.group(3)
elif assumed_currency is not None:
currency = assumed_currency
else:
raise ValueError('Failed to determine currency from %r' % amount_str)
number = D(m.group(2))
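The fallback currency mainly matters for invoice lines that carry neither a currency symbol nor an ISO code. A minimal usage sketch, assuming `parse_amount` is imported from `beancount_import.amount_parsing`; the input strings are made up, and the parenthesized-prefix case follows from the new regex rather than from documented behavior:

    from beancount_import.amount_parsing import parse_amount

    # A currency symbol still determines the currency, e.g. 12.34 USD here.
    print(parse_amount('$12.34'))
    # With no symbol or ISO code, the caller-supplied currency is used instead.
    print(parse_amount('12.34', assumed_currency='EUR'))
    # A leading parenthesized note is skipped before the amount itself is read.
    print(parse_amount('(promotion) $12.34', assumed_currency='USD'))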
106 changes: 99 additions & 7 deletions beancount_import/source/amazon.py
@@ -35,6 +35,7 @@
dict(module='beancount_import.source.amazon',
directory=os.path.join(journal_dir, 'data/amazon'),
pickle_dir=os.path.join(journal_dir, 'data/amazon/.pickle'),
amazon_account='name@domain.com',
posttax_adjustment_accounts={
'Gift Card Amount': 'Assets:Gift-Cards:Amazon',
@@ -236,12 +237,33 @@
`beancount_import.source.amazon_invoice` module, which has a command-line
interface, to try to debug it.
Caching Parsing Results
=======================
Parsing the HTML files can be slow, so this module can use the Python pickle
module/file format to cache the result of parsing each individual HTML file.
A cached result is loaded as long as its mtime is more recent than that of the
corresponding HTML file. To enable this functionality, pass a path as the
`pickle_dir` parameter when initializing this class (see the configuration
sketch at the end of this docstring).
Skipping Older Invoices
=======================
If you only want to process invoices issued after a certain date, pass the
earliest date you want to process as the `earliest_date` configuration
parameter when initializing this class.
Determining the date of an invoice requires parsing its HTML file, so if your
data folder contains a large number of invoices that are not accounted for in
your journal, it is recommended to also use the caching/pickling mechanism
described above.
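Putting these options together, a hypothetical configuration entry enabling
both the pickle cache and the date cutoff might look like the following
(assuming `os`, `datetime`, and `journal_dir` as in the example near the top
of this docstring; the paths, account name, and date are placeholders rather
than defaults):

    dict(module='beancount_import.source.amazon',
         directory=os.path.join(journal_dir, 'data/amazon'),
         amazon_account='name@domain.com',
         # cache parsed invoices as pickle files
         pickle_dir=os.path.join(journal_dir, 'data/amazon/.pickle'),
         # skip invoices ordered before this date
         earliest_date=datetime.date(2019, 1, 1),
         posttax_adjustment_accounts={
             'Gift Card Amount': 'Assets:Gift-Cards:Amazon',
         },
    )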
"""

import collections
from typing import Dict, List, Tuple, Optional
import os
import sys
import pickle

from beancount.core.data import Transaction, Posting, Balance, Commodity, Price, EMPTY_SET, Directive
from beancount.core.amount import Amount
@@ -256,6 +278,8 @@
from . import ImportResult, Source, SourceResults, InvalidSourceReference, AssociatedData
from ..journal_editor import JournalEditor

import datetime

ITEM_DESCRIPTION_KEY = 'amazon_item_description'
ITEM_URL_KEY = 'amazon_item_url'
ITEM_BY_KEY = 'amazon_item_by'
@@ -452,12 +476,67 @@ def get_order_ids_seen(journal: JournalEditor,
order_ids.setdefault(order_id, []).append(entry)
return order_ids

class AmazonPickler():
def __init__( self, pickle_dir: str ):
self.pickle_dir = pickle_dir
if pickle_dir is not None and not os.access(pickle_dir, os.W_OK):
raise Exception("Amazon pickled invoice path is not writable: %s" % pickle_dir)

@staticmethod
def try_get_mtime( path: str ):
try:
return os.stat(path).st_mtime
except:
return None

def _build_pickle_path( self, invoice_path: str ):
invoice_dir, invoice_file = os.path.split(invoice_path)
pickle_file = invoice_file.replace(".html", ".order.p")
return os.path.join(self.pickle_dir, pickle_file)

def load( self, results: SourceResults, invoice_path: str ):
if not self.pickle_dir: return None

try:
pickle_path = self._build_pickle_path( invoice_path )
invoice_mtime = AmazonPickler.try_get_mtime( invoice_path )
pickle_mtime = AmazonPickler.try_get_mtime( pickle_path )

if invoice_mtime is None or pickle_mtime is None: return None
if pickle_mtime < invoice_mtime: return None

with open(pickle_path, "rb") as f:
return pickle.load( f )
except:
results.add_error('Failed to load pickled invoice %s: %s' % (
pickle_path, sys.exc_info()))

def dump( self, results: SourceResults, invoice_path: str, invoice: Order):
if not self.pickle_dir: return None

try:
pickle_path = self._build_pickle_path( invoice_path )

if invoice is None:
# remove existing pickles if invoice couldn't be parsed
pickle_mtime = AmazonPickler.try_get_mtime( pickle_path )
if pickle_mtime: os.remove( pickle_path )
return

with open(pickle_path, "wb") as f:
return pickle.dump( invoice, f )

except:
results.add_error('Failed to save pickled invoice %s: %s' % (
pickle_path, sys.exc_info()))
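
As a rough sketch of how this cache is intended to be used around `parse_invoice`
(mirroring the updated `_get_invoice` further below; `results` is the current
`SourceResults` and `invoice_path` is a hypothetical path to an invoice HTML file):

    pickler = AmazonPickler(pickle_dir)

    invoice = pickler.load(results, invoice_path)      # None on a cache miss or a stale pickle
    if invoice is None:
        invoice = parse_invoice(invoice_path)          # the slow HTML parse
        pickler.dump(results, invoice_path, invoice)   # refresh (or clear) the cached result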

class AmazonSource(Source):
def __init__(self,
directory: str,
amazon_account: str,
posttax_adjustment_accounts: Dict[str, str] = {},
pickle_dir: str = None,
earliest_date: datetime.date = None,
**kwargs) -> None:
super().__init__(**kwargs)
self.directory = directory
@@ -467,6 +546,9 @@ def __init__(self,
self.example_posting_key_extractors[CREDIT_CARD_DESCRIPTION_KEY] = None
self.example_posting_key_extractors[POSTTAX_DESCRIPTION_KEY] = None
self.example_transaction_key_extractors[AMAZON_ACCOUNT_KEY] = None
self.pickler = AmazonPickler(pickle_dir)

self.earliest_date = earliest_date

self.invoice_filenames = [] # type: List[Tuple[str, str]]
for filename in os.listdir(self.directory):
@@ -478,14 +560,19 @@ def __init__(self,
self._cached_invoices = {
} # type: Dict[str, Tuple[Optional[Order], str]]

def _get_invoice(self, invoice_filename: str):
def _get_invoice(self, results: SourceResults, order_id: str, invoice_filename: str):
if invoice_filename in self._cached_invoices:
return self._cached_invoices.get(invoice_filename)
path = os.path.realpath(os.path.join(self.directory, invoice_filename))
self.log_status('amazon: processing %s' % (path, ))
invoice = parse_invoice(path) # type: Optional[Order]
self._cached_invoices[invoice_filename] = invoice, path
return invoice, path
invoice_path = os.path.realpath(os.path.join(self.directory, invoice_filename))

invoice = self.pickler.load(results, invoice_path)
if invoice is None:
self.log_status('amazon: processing %s: %s' % (order_id, invoice_path, ))
invoice = parse_invoice(invoice_path) # type: Optional[Order]
self.pickler.dump( results, invoice_path, invoice )

self._cached_invoices[invoice_filename] = invoice, invoice_path
return invoice, invoice_path

def prepare(self, journal: JournalEditor, results: SourceResults):
credit_card_accounts = get_credit_card_accounts(journal)
@@ -500,13 +587,18 @@ def prepare(self, journal: JournalEditor, results: SourceResults):
for order_id, invoice_filename in self.invoice_filenames:
if order_id in order_ids_seen: continue
try:
invoice, path = self._get_invoice(invoice_filename)
invoice, path = self._get_invoice(results, order_id, invoice_filename)
except:
results.add_error('Failed to parse invoice %s: %s' % (
invoice_filename, sys.exc_info()))
continue
if invoice is None:
continue

if self.earliest_date is not None and invoice.order_date < self.earliest_date:
self.log_status("Skipping order with date [%s] before [%s]" % ( str(invoice.order_date), self.earliest_date ) )
continue

transaction = make_amazon_transaction(
invoice=invoice,
posttax_adjustment_accounts=self.posttax_adjustment_accounts,
85 changes: 64 additions & 21 deletions beancount_import/source/amazon_invoice.py
@@ -80,7 +80,7 @@
'(?:.*) Discount',
'Gift[ -]Wrap',
]) + ') *:')
posttax_adjustment_fields_pattern = r'Gift Card Amount:|Rewards Points:|Recycle Fee \$X'
posttax_adjustment_fields_pattern = r'Gift Card Amount:|Rewards Points:|Tip [(]optional[)]:|Recycle Fee \$X'


def to_json(obj):
@@ -127,12 +127,13 @@ def predicate(node):
return results


def get_adjustments_in_table(table, pattern):
def get_adjustments_in_table(table, pattern, assumed_currency=None):
adjustments = []
for label, amount_str in get_field_in_table(
table, pattern, allow_multiple=True, return_label=True):
adjustments.append(
Adjustment(amount=parse_amount(amount_str), description=label))
Adjustment(amount=parse_amount(amount_str, assumed_currency),
description=label))
return adjustments


@@ -153,6 +154,7 @@ def parse_shipments(soup) -> List[Shipment]:
'Service completed',
'Preparing for Shipment',
'Not Yet Shipped',
'Shipping now',
}

def is_shipment_header_table(node):
@@ -198,15 +200,41 @@ def is_items_ordered_header(node):
price_node = tds[1]
price = price_node.text.strip()

pattern_without_condition = r'^\s*(?P<quantity>[0-9]+)\s+of:(?P<description>.*)\n\s*(?:Sold|Provided) by:? (?P<sold_by>[^\n]+)'
price = parse_amount(price)
if price is None:
price = Amount(D(0), 'USD')

# 1 of: 365 Everyday Value, Potato Yellow Bag Organic, 48 Ounce
# 2 (1.04 lb) of: Broccoli Crowns Conventional, 1 Each
# 2.07 lb of: Pork Sausage Link Italian Mild Step 1

pattern_quantity = r'^\s*(?:(?P<quantity>[0-9]+)|(?P<weight1>[0-9.]+\s+(?:lb|kg))|(?:(?P<quantityIgnore>[0-9.]+) [(](?P<weight2>[^)]+)[)]))\s+of:'
m = re.match(pattern_quantity, description_node.text, re.UNICODE|re.DOTALL)
quantity = 1
if m is not None:
# Amazon may list, e.g., 2 broccoli crowns at $1.69/lb, but in that case the
# price column shows the total price (whereas for other items it shows the
# per-unit price). Multiplying the quantity by that price would overstate the
# item total, so if both a quantity and a weight are present, ignore the
# quantity and treat it as 1. (Alternately, the weight and the per-unit price
# could be captured and multiplied out.)
quantity = m.group("quantity") # ignore quantity for weight items

if quantity is None:
#print("Unable to extract quantity, using 1: %s" % description_node.text)
quantity = D(1)
else:
quantity = D(quantity)

text = description_node.text.split("of:",1)[1]

pattern_without_condition = r'(?P<description>.*)\n\s*(?:Sold|Provided) by:? (?P<sold_by>[^\n]+)'
pattern_with_condition = pattern_without_condition + r'\n.*\n\s*Condition: (?P<condition>[^\n]+)'

m = re.match(pattern_with_condition, description_node.text,
re.UNICODE | re.DOTALL)
m = re.match(pattern_with_condition, text, re.UNICODE | re.DOTALL)
if m is None:
m = re.match(pattern_without_condition, description_node.text,
re.UNICODE | re.DOTALL)
assert m is not None
m = re.match(pattern_without_condition, text, re.UNICODE | re.DOTALL)
if m is None:
raise Exception("Could not extract item from row", text)

description = re.sub(r'\s+', ' ', m.group('description').strip())
sold_by = re.sub(r'\s+', ' ', m.group('sold_by').strip())
try:
@@ -218,11 +246,11 @@ def is_items_ordered_header(node):
sold_by = sold_by[:-len(suffix)]
items.append(
Item(
quantity=D(m.group('quantity')),
quantity=quantity,
description=description,
sold_by=sold_by,
condition=condition,
price=parse_amount(price),
price=price,
))

items_subtotal = parse_amount(
@@ -358,27 +386,38 @@ def parse_regular_order_invoice(path: str) -> Order:
output_fields = dict()
output_fields['pretax_adjustments'] = get_adjustments_in_table(
payment_table, pretax_adjustment_fields_pattern)
amount = reduce_amounts(
a.amount for a in output_fields['pretax_adjustments'])
payment_adjustments = collections.OrderedDict() # type: Dict[str, Amount]

# older invoices put pre-tax amounts on a per-shipment basis
# new invoices only put pre-tax amounts on the overall payments section
# detect which this is
pretax_amount = reduce_amounts(
a.amount for a in output_fields['pretax_adjustments'])
shipments_pretax_amount = None

if any(s.pretax_adjustments for s in shipments):
expected_amount = reduce_amounts(
a.amount
shipments_pretax_amount = reduce_amounts(a.amount
for shipment in shipments
for a in shipment.pretax_adjustments)
if expected_amount != amount:
for a in shipment.pretax_adjustments)

if shipments_pretax_amount != pretax_amount:
errors.append(
'expected total pretax adjustment to be %s, but parsed total is %s'
% (expected_amount, amount))
% (shipments_pretax_amount, pretax_amount))

payments_total_adjustments = []
shipments_total_adjustments = []

# parse first to get an idea of the working currency
grand_total = parse_amount(
get_field_in_table(payment_table, 'Grand Total:'))

def resolve_posttax_adjustments():
payment_adjustments.update(
reduce_adjustments(
get_adjustments_in_table(payment_table,
posttax_adjustment_fields_pattern)))
posttax_adjustment_fields_pattern,
assumed_currency=grand_total.currency)))
all_shipments_adjustments = collections.OrderedDict(
reduce_adjustments(
sum((x.posttax_adjustments for x in shipments), [])))
@@ -419,11 +458,15 @@ def resolve_posttax_adjustments():

payments_total_adjustment = reduce_amounts(payments_total_adjustments)
shipments_total_adjustment = reduce_amounts(shipments_total_adjustments)
grand_total = parse_amount(
get_field_in_table(payment_table, 'Grand Total:'))

expected_total = add_amount(shipments_total_adjustment,
reduce_amounts(x.total for x in shipments))

# if no shipments pre-tax section, then the expected total isn't accounting
# for the pre-tax adjustments yet since they are only in the grand total section
if shipments_pretax_amount is None:
expected_total = add_amount(expected_total, pretax_amount)

adjusted_grand_total = add_amount(payments_total_adjustment, grand_total)
if expected_total != adjusted_grand_total:
errors.append('expected grand total is %s, but parsed value is %s' %
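The three item-heading formats quoted in the comments inside `parse_shipments`
above ("1 of: ...", "2 (1.04 lb) of: ...", "2.07 lb of: ...") are what the new
quantity pattern is meant to distinguish. A small standalone sketch of that
pattern against the same sample headings (the annotations are a reading of the
named groups, not verified program output):

    import re

    pattern_quantity = (r'^\s*(?:(?P<quantity>[0-9]+)'
                        r'|(?P<weight1>[0-9.]+\s+(?:lb|kg))'
                        r'|(?:(?P<quantityIgnore>[0-9.]+) [(](?P<weight2>[^)]+)[)]))\s+of:')

    samples = [
        '1 of: 365 Everyday Value, Potato Yellow Bag Organic, 48 Ounce',  # quantity='1'
        '2 (1.04 lb) of: Broccoli Crowns Conventional, 1 Each',           # weight2='1.04 lb', quantity ignored
        '2.07 lb of: Pork Sausage Link Italian Mild Step 1',              # weight1='2.07 lb'
    ]
    for text in samples:
        m = re.match(pattern_quantity, text, re.UNICODE | re.DOTALL)
        print(m.group('quantity'), m.group('weight1'), m.group('weight2'))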
