fix: Force utf-8 encoding when reading and writing text files
This may fix character encoding problems when the system default encoding is not
UTF-8.
jbms committed Mar 24, 2019
1 parent 131c357 commit f2eb320
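
The change is mechanical: every open() call that reads or writes text now passes an explicit encoding (and, where the importers write text, a fixed newline) instead of relying on the platform default. A minimal standalone sketch of the difference, using a placeholder file name not taken from the diff:

import locale

# open() with no encoding argument uses the locale's preferred encoding,
# which can be cp1252 or another non-UTF-8 codec depending on the system.
print(locale.getpreferredencoding(False))

# Passing encoding='utf-8' (plus a fixed newline for files the importers write)
# makes reads and writes behave identically on every platform.
with open('example.json', 'w', encoding='utf-8', newline='\n') as f:  # placeholder path
    f.write('{"payee": "Café"}\n')

with open('example.json', 'r', encoding='utf-8', newline='\n') as f:
    data = f.read()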
Showing 10 changed files with 18 additions and 17 deletions.
4 changes: 2 additions & 2 deletions beancount_import/source/amazon_invoice.py
@@ -333,7 +333,7 @@ def parse_invoice(path: str) -> Order:

 def parse_regular_order_invoice(path: str) -> Order:
     errors = []
-    with open(path, 'r') as f:
+    with open(path, 'rb') as f:
         soup = bs4.BeautifulSoup(f.read(), 'lxml')
     shipments = parse_shipments(soup)
     payment_table_header = soup.find(
@@ -462,7 +462,7 @@ def get_text_lines(parent_node):

 def parse_digital_order_invoice(path: str) -> Order:
     errors = []
-    with open(path, 'r') as f:
+    with open(path, 'rb') as f:
         soup = bs4.BeautifulSoup(f.read(), 'lxml')

     digital_order_pattern = 'Digital Order: (.*)'
6 changes: 3 additions & 3 deletions beancount_import/source/amazon_invoice_sanitize.py
@@ -70,8 +70,8 @@ def remove_tag(soup: bs4.BeautifulSoup, tag: str):

 def sanitize_invoice(input_path: str, output_path: str,
                      credit_card_digits: str):
-    with open(input_path, 'r') as f:
-        soup = bs4.BeautifulSoup(f.read(), 'lxml')
+    with open(input_path, 'rb') as fb:
+        soup = bs4.BeautifulSoup(fb.read(), 'lxml')
     comments = soup.find_all(text=lambda text: isinstance(text, bs4.Comment))
     remove_tag(soup, 'script')
     remove_tag(soup, 'style')
@@ -93,7 +93,7 @@ def sanitize_invoice(input_path: str, output_path: str,
     output_name, _ = sanitize_order_ids(
         os.path.basename(input_path), order_id_replacements)
     output_path = os.path.join(output_path, output_name)
-    with open(output_path, 'w') as f:
+    with open(output_path, 'w', encoding='utf-8', newline='\n') as f:
         f.write(new_output)


2 changes: 1 addition & 1 deletion beancount_import/source/google_purchases.py
@@ -260,7 +260,7 @@ def prepare(self, journal, results: SourceResults):
             path = os.path.join(self.directory,
                                 prefix + receipt_id + json_suffix)
             self.log_status('google_purchases: processing %s' % (path, ))
-            with open(path, 'r') as f:
+            with open(path, 'r', encoding='utf-8', newline='\n') as f:
                 receipt = json.load(f)
             if receipt_id in takeout_receipt_ids:
                 import_result = make_takeout_import_result(
6 changes: 3 additions & 3 deletions beancount_import/source/healthequity.py
Expand Up @@ -205,7 +205,7 @@ def load_cash_transactions(filename: str, account: str,
expected_field_names = ['Date', 'Transaction', 'Amount', 'Cash Balance']
transactions = []
filename = os.path.abspath(filename)
with open(filename, 'r', newline='') as csvfile:
with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
reader = csv.DictReader(csvfile)
if reader.fieldnames != expected_field_names:
raise RuntimeError(
@@ -236,7 +236,7 @@ def load_fund_transactions(filename: str,
     ]
     transactions = []  # type: List[FundTransaction]
     filename = os.path.abspath(filename)
-    with open(filename, 'r', newline='') as csvfile:
+    with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
         reader = csv.DictReader(csvfile)
         if reader.fieldnames != expected_field_names:
             raise RuntimeError(
@@ -268,7 +268,7 @@ def load_balances(filename: str, date: datetime.date,
     ]
     balances = []  # type: List[ImportedBalance]
     filename = os.path.abspath(filename)
-    with open(filename, 'r', newline='') as csvfile:
+    with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
         reader = csv.DictReader(csvfile)
         if reader.fieldnames != expected_field_names:
             raise RuntimeError(
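The CSV importers (here, and mint.py and venmo.py below) already pass newline='', which the csv module expects; the commit only adds the explicit encoding. A minimal sketch of that pattern, again with a placeholder file name:

import csv

# newline='' lets the csv module handle line endings itself; encoding='utf-8'
# keeps the parse independent of the locale default.
with open('transactions.csv', 'r', encoding='utf-8', newline='') as f:  # placeholder path
    for row in csv.DictReader(f):
        print(row)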
4 changes: 2 additions & 2 deletions beancount_import/source/mint.py
@@ -184,7 +184,7 @@ def load_transactions(filename: str, currency: str = 'USD') -> List[MintEntry]:
     try:
         entries = []
         filename = os.path.abspath(filename)
-        with open(filename, 'r', newline='') as csvfile:
+        with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
             reader = csv.DictReader(csvfile)
             if reader.fieldnames != expected_field_names:
                 raise RuntimeError(
@@ -232,7 +232,7 @@ def load_balances(filename: str) -> List[RawBalance]:
     ]
     balances = []
     filename = os.path.abspath(filename)
-    with open(filename, 'r', newline='') as csvfile:
+    with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
         reader = csv.DictReader(csvfile)
         if reader.fieldnames != expected_field_names:
             raise RuntimeError(
3 changes: 2 additions & 1 deletion beancount_import/source/ofx.py
@@ -1123,8 +1123,9 @@ def __init__(self, seen_fitids, filename):
         self.filename = filename
         parsed_statements = self.parsed_statements = []

-        with open(filename, 'r') as f:
+        with open(filename, 'rb') as f:
             contents = f.read()
+        # A byte string passed to BeautifulSoup is assumed to be UTF-8
         soup = bs4.BeautifulSoup(contents, 'html.parser')

         # Get the description of securities used in this file.
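The BeautifulSoup-based parsers (amazon_invoice.py above and ofx.py here) take a different route: the file is read in binary mode and decoding is left to bs4, which runs its own encoding detection on byte input rather than using the locale default. A minimal sketch using the stdlib parser and a placeholder file name:

import bs4

# Read raw bytes; bs4 detects the encoding (BOM, declared charset, fallback)
# and records what it chose on the parsed tree.
with open('invoice.html', 'rb') as f:  # placeholder path
    soup = bs4.BeautifulSoup(f.read(), 'html.parser')
print(soup.original_encoding)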
2 changes: 1 addition & 1 deletion beancount_import/source/paypal.py
@@ -613,7 +613,7 @@ def prepare(self, journal: JournalEditor, results: SourceResults):
             path = os.path.join(self.directory,
                                 txn_id + transaction_json_suffix)
             self.log_status('paypal: processing %s' % (path, ))
-            with open(path, 'r') as f:
+            with open(path, 'r', encoding='utf-8', newline='\n') as f:
                 txn = json.load(f)
             jsonschema.validate(txn, transaction_schema)
             results.add_pending_entry(
4 changes: 2 additions & 2 deletions beancount_import/source/paypal_sanitize.py
@@ -14,13 +14,13 @@

 def sanitize(input_path: str, output_directory: str):
     txn_id = os.path.splitext(os.path.basename(input_path))[0]
-    with open(input_path, 'r') as f:
+    with open(input_path, 'r', encoding='utf-8', newline='\n') as f:
         content = f.read()
     base_36_chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
     new_txn_id = ''.join(base_36_chars[random.randint(0, 35)] for _ in txn_id)
     content = content.replace(txn_id, new_txn_id)
     output_path = os.path.join(output_directory, new_txn_id + '.json')
-    with open(output_path, 'w') as f:
+    with open(output_path, 'w', encoding='utf-8', newline='\n') as f:
         f.write(content)
     print('Wrote: %s' % output_path)

2 changes: 1 addition & 1 deletion beancount_import/source/venmo.py
@@ -197,7 +197,7 @@ def get_info(raw: Union[RawTransaction, RawBalance]):

 def load_csv(path: str, field_names: List[str]) -> List[Dict[str, Union[str,int]]]:
     path = os.path.abspath(path)
-    with open(path, 'r', newline='') as f:
+    with open(path, 'r', newline='', encoding='utf-8') as f:
         csv_reader = csv.DictReader(f)
         assert csv_reader.fieldnames == field_names
         return [add_line_and_filename(x, path, line_i + 1) for line_i, x in enumerate(csv_reader)]
2 changes: 1 addition & 1 deletion beancount_import/source/waveapps.py
@@ -190,7 +190,7 @@ def prepare(self, journal, results: SourceResults):
             path = os.path.join(self.receipt_directory,
                                 receipt_id + json_suffix)
             self.log_status('waveapps: processing %s' % (path, ))
-            with open(path, 'r') as f:
+            with open(path, 'r', newline='\n', encoding='utf-8') as f:
                 receipt = json.load(f)
             jsonschema.validate(receipt, schema)
             if receipt['status'] != 'Ready':
