Skip to content

Commit

Permalink
Fixes #243 - Support import_set for html input
Browse files Browse the repository at this point in the history
  • Loading branch information
claudep committed Jul 2, 2023
1 parent f3ef2e9 commit 2dec898
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 3 deletions.
4 changes: 4 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## Unreleased

### Improvements

- The html format now supports importing from HTML content (#243)

### Changes

- The html export format does not depend on MarkupPy any longer, therefore the
Expand Down
18 changes: 15 additions & 3 deletions docs/formats.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,21 @@ install Tablib with ``pip install "tablib[pandas]"`` to make the format availabl
html
====

The ``html`` format is currently export-only. The exports produce an HTML page
with the data in a ``<table>``. If headers have been set, they will be used as
table headers.
The exports produce an HTML page with the data in a ``<table>``. If headers have
been set, they will be used as table headers (``thead``).

When importing HTML, you can select which table to import by passing the value
of its ``id`` attribute as the ``table_id`` argument::

import tablib

tablib.import_set(your_html, format='html', table_id='some_table_id')

Otherwise, the first table found will be imported.

.. versionchanged:: 3.6.0

The ability to import HTML was added. The dependency on MarkupPy was dropped.

jira
====
Expand Down
64 changes: 64 additions & 0 deletions src/tablib/formats/_html.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
""" Tablib - HTML export support.
"""
from html.parser import HTMLParser
from xml.etree import ElementTree as ET


Expand Down Expand Up @@ -48,3 +49,66 @@ def export_book(cls, databook):
result += '\n'

return result

@classmethod
def import_set(cls, dset, in_stream, table_id=None, **kwargs):
    """Fill ``dset`` in place with the content of an HTML ``<table>``.

    The dataset is wiped before parsing; nothing is returned.

    :param dset: dataset to populate; its previous content is discarded.
    :param in_stream: object whose ``read()`` returns the HTML markup as text.
    :param table_id: value of the ``id`` attribute of the table to import.
        When omitted (or empty), the first ``<table>`` found is imported.
    :param kwargs: accepted for signature compatibility; not used here.
    :raises ValueError: if no matching ``<table>`` is found in the input.
    """
    dset.wipe()
    parser = TablibHTMLParser(dset, table_id=table_id)
    parser.feed(in_stream.read())
    # feed() may hold back trailing text while waiting for more input;
    # close() tells the parser the document is complete so it is flushed.
    parser.close()

    if parser.table_found:
        return
    if table_id:
        raise ValueError(f'No <table> found with id="{table_id}" in input HTML')
    raise ValueError('No <table> found in input HTML')


class TablibHTMLParser(HTMLParser):
    """Feed the cells of a single HTML ``<table>`` into a dataset.

    Only one table is imported: the first ``<table>`` encountered, or, when
    ``table_id`` is given, the first one whose ``id`` attribute equals it.

    * Cells closed while a ``<thead>`` is open are collected as headers and
      assigned to ``dataset.headers`` when ``</thead>`` is reached.
    * Any other ``<td>``/``<th>`` cell is added to the current row, which is
      passed to ``dataset.append()`` on ``</tr>`` (empty rows are skipped).
    * Markup inside a cell is flattened: only its text is kept.
    * A ``<table>`` nested inside the imported one does not contribute rows
      and does not end the import; its text is folded into the enclosing cell.

    ``table_found`` tells the caller whether a matching table was seen.
    """

    def __init__(self, dataset, *args, table_id=None, **kwargs):
        """Create a parser writing into ``dataset``.

        :param dataset: object exposing ``append(row)`` and a settable
            ``headers`` attribute.
        :param table_id: optional ``id`` of the table to import.
        Remaining arguments are forwarded to ``HTMLParser``.
        """
        super().__init__(*args, **kwargs)
        self.dset = dataset
        self.table_id = table_id
        self.table_found = False
        self.table_open = False
        self.thead_open = False
        self.cell_open = False
        self.headers = []
        self.current_row = []
        self.current_data = ''
        # Count of <table> elements currently open *inside* the imported one.
        self.nested_tables = 0

    def handle_starttag(self, tag, attrs):
        if not self.table_open:
            # Still looking for the table to import; once one has been
            # found, later tables are never considered.
            if (
                tag == 'table' and not self.table_found and
                (not self.table_id or dict(attrs).get('id') == self.table_id)
            ):
                self.table_open = True
                self.table_found = True
            return

        if tag == 'table':
            self.nested_tables += 1
        elif self.nested_tables:
            # Structure of a nested table must not open cells or a thead
            # in the imported table.
            return
        elif tag == 'thead':
            self.thead_open = True
        elif tag in ['td', 'th']:
            self.cell_open = True

    def handle_endtag(self, tag):
        if not self.table_open:
            return

        if self.nested_tables:
            # Inside a nested table: only track when it closes, so that its
            # </td>, </tr> and </table> do not affect the imported table.
            if tag == 'table':
                self.nested_tables -= 1
            return

        if tag == 'table':
            self.table_open = False
        elif tag == 'thead':
            self.thead_open = False
            self.dset.headers = self.headers
        elif tag == 'tr' and self.current_row:
            self.dset.append(self.current_row)
            self.current_row = []
        elif tag in ['td', 'th']:
            if self.thead_open:
                self.headers.append(self.current_data)
            else:
                self.current_row.append(self.current_data)
            self.cell_open = False
            self.current_data = ''

    def handle_data(self, data):
        # Text is only meaningful inside an open cell; a cell may receive
        # several chunks (e.g. around inline tags), hence the accumulation.
        if self.cell_open:
            self.current_data += data
48 changes: 48 additions & 0 deletions tests/test_tablib.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,54 @@ def test_html_databook_export(self):
f"<h3>Founders</h3>{self.founders_html}<h3>Founders</h3>{self.founders_html}"
)

def test_html_import(self):
    """Assigning ``founders_html`` to ``data.html`` yields its headers and rows."""
    expected_headers = ['first_name', 'last_name', 'gpa']
    expected_rows = [
        ('John', 'Adams', '90'),
        ('George', 'Washington', '67'),
        ('Thomas', 'Jefferson', '50'),
    ]

    data.html = self.founders_html

    self.assertEqual(expected_headers, data.headers)
    self.assertEqual(expected_rows, data[:])

def test_html_import_no_headers(self):
    """A table without ``<thead>`` imports rows only; inline markup is flattened."""
    # The fixture previously carried stray '"' characters after each </tr>;
    # they sat outside any cell and were only noise.
    data.html = """
        <table>
        <tr><td>John</td><td><i>Adams</i></td><td>90</td></tr>
        <tr><td>George</td><td><i>Wash</i>ington</td><td>67</td></tr>
        </table>
    """

    self.assertIsNone(data.headers)
    self.assertEqual([
        ('John', 'Adams', '90'),
        ('George', 'Washington', '67'),
    ], data[:])

def test_html_import_no_table(self):
    """Importing markup that contains no ``<table>`` raises ``ValueError``."""
    markup = "<html><body></body></html>"

    with self.assertRaises(ValueError) as caught:
        data.html = markup

    self.assertEqual('No <table> found in input HTML', str(caught.exception))

def test_html_import_table_id(self):
    """A table with a specific id can be targeted for import."""
    # Fixture cleaned up: a stray '"' after </tr> was removed and the
    # closing </body></html> tags are now in the right order.
    html_input = """
        <html><body>
        <table id="ignore">
        <thead><tr><th>IGNORE</th></tr></thead><tr><td>IGNORE</td></tr>
        </table>
        <table id="import">
        <thead><tr><th>first_name</th><th>last_name</th></tr></thead>
        <tr><td>John</td><td>Adams</td></tr>
        </table>
        </body></html>
    """
    dataset = tablib.import_set(html_input, format="html", table_id="import")
    self.assertEqual(['first_name', 'last_name'], dataset.headers)
    self.assertEqual([('John', 'Adams')], dataset[:])

    # An id that matches no table must be reported, naming the id.
    with self.assertRaises(ValueError) as exc:
        tablib.import_set(html_input, format="html", table_id="missing")
    self.assertEqual(
        'No <table> found with id="missing" in input HTML',
        str(exc.exception),
    )


class RSTTests(BaseTestCase):
def test_rst_force_grid(self):
Expand Down

0 comments on commit 2dec898

Please sign in to comment.