From 6b71a491b1c925004e4c6eda20b9842c16faed62 Mon Sep 17 00:00:00 2001
From: Nikita Marchant
Date: Sat, 17 Aug 2019 20:41:59 +0200
Subject: [PATCH] Add Tatsu to have a real PEG parser from the EBNF grammar
 (#194)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add 竜 TatSu as a dependency.
This enables us to have a real PEG parser and not a combination of
regexes and string splitting.

Fix parsing of quoted values as well as escaped semicolons.
This fixes #185 and fixes #193

Note: Adding TatSu might have made the parser significantly slower in
some cases.
---
 CHANGELOG.rst        | 15 +++++++++--
 MANIFEST.in          |  1 +
 README.rst           |  5 +---
 ics/contentline.ebnf | 31 +++++++++++++++++++++++
 ics/parse.py         | 59 +++++++++++++++++++-------------------
 requirements.txt     |  1 +
 tests/contentline.py | 13 +++++++++++-
 7 files changed, 85 insertions(+), 40 deletions(-)
 create mode 100644 ics/contentline.ebnf

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index cd242b51..13e351f5 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,13 +7,24 @@ Ics.py changelog
 0.6
 **************
 
+Major changes:
+ - Drop support for Python 3.5. Python 3.7 is now distributed in both Ubuntu LTS
+   and Debian stable, the PSF is providing only security fixes. It's time
+   to move on !
+ - Add `竜 TatSu `_ as a dependency.
+   This enables us to have a real PEG parser and not a combination of
+   regexes and string splitting.
+
+Minor features:
  - Add mypy
- - Drop support for Python 3.5. Python 3.7 is now distributed in both Ubuntu LTS and Debian stable,
-   the PSF is providing only security fixes. It's time to move on !
  - Add GEO (thanks @johnnoone !)
 
 Bug fixes:
  - Events no longer have the TRANSP property by default (Fixes #190)
+ - Fix parsing of quoted values as well as escaped semicolons (#185 and #193)
+
+Regressions:
+ - Adding TatSu might have made the parser significantly slower in some cases.
 
 
 **************
diff --git a/MANIFEST.in b/MANIFEST.in
index eab7c837..9f03cafd 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,6 +3,7 @@
 include CHANGELOG.rst
 include LICENSE
 include AUTHORS.rst
 include CONTRIBUTING.rst
+include ics/contentline.ebnf
 include meta.py
 include tests/fixtures/README
diff --git a/README.rst b/README.rst
index aa87e0d7..a0316efa 100644
--- a/README.rst
+++ b/README.rst
@@ -6,9 +6,6 @@ Ics.py : iCalendar for Humans
 .. image:: https://travis-ci.org/C4ptainCrunch/ics.py.png?branch=master
     :target: https://travis-ci.org/C4ptainCrunch/ics.py
 
-.. image:: https://coveralls.io/repos/C4ptainCrunch/ics.py/badge.png
-    :target: https://coveralls.io/r/C4ptainCrunch/ics.py
-    :alt: Coverage
 
 .. image:: https://img.shields.io/github/license/c4ptaincrunch/ics.py.svg
     :target: https://pypi.python.org/pypi/ics/
@@ -21,7 +18,7 @@ iCalendar is a widely-used and useful format but not user friendly. Ics.py is th
 It should be able to parse every calendar that respects the `rfc5545 `_ and maybe some more…
 It also outputs rfc compliant calendars.
 
-iCalendar (file extension `.ics`) is used by Sunbird, Google Calendar, Apple Calendar, Android…
+iCalendar (file extension `.ics`) is used by Google Calendar, Apple Calendar, Android and many more.
 
 Ics.py is available for Python>=3.6 and is Apache2 Licensed.
 
diff --git a/ics/contentline.ebnf b/ics/contentline.ebnf
new file mode 100644
index 00000000..95254441
--- /dev/null
+++ b/ics/contentline.ebnf
@@ -0,0 +1,31 @@
+@@grammar::contentline
+@@whitespace :: //
+
+start = contentline $ ;
+
+ALPHA = ?"[a-zA-Z]" ;
+DIGIT = ?"[0-9]" ;
+CRLF = "\r\n" ;
+WSP = " ";
+
+DQUOTE = '"' ;
+
+QSAFE_CHAR = WSP | ?"\x21" | ?"[\x23-\x7E]" | ?"[\u0080-\uffff]";
+SAFE_CHAR = WSP | ?"\x21" | ?"[\x23-\x2B]" | ?"[\x2D-\x39]" | ?"[\x3C-\x7E]" | ?"[\u0080-\uffff]" ;
+VALUE_CHAR = WSP | ?"[\x21-\x7E]" | ?"[\u0080-\uffff]";
+
+
+name = iana_token | x_name ;
+iana_token = {(ALPHA | DIGIT | "-")}+ ;
+x_name = "X-" [vendorid "-"] {(ALPHA | DIGIT | "-")}+ ;
+vendorid = (ALPHA | DIGIT) (ALPHA | DIGIT) {(ALPHA | DIGIT)}+ ;
+
+contentline = name:name {(";" params+:param )}* ":" value:value CRLF ;
+
+param = name:param_name "=" values+:param_value {("," values+:param_value)}* ;
+param_name = iana_token | x_name ;
+param_value = quoted_string | paramtext ;
+
+paramtext = {SAFE_CHAR}* ;
+value = {VALUE_CHAR}* ;
+quoted_string = DQUOTE @:{QSAFE_CHAR}* DQUOTE ;
diff --git a/ics/parse.py b/ics/parse.py
index 672c07a8..15672953 100644
--- a/ics/parse.py
+++ b/ics/parse.py
@@ -2,11 +2,18 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals, absolute_import
+from pathlib import Path
 
 import collections
+import tatsu
 
 CRLF = '\r\n'
 
+grammar_path = Path(__file__).parent.joinpath('contentline.ebnf')
+
+with open(grammar_path) as fd:
+    GRAMMAR = tatsu.compile(fd.read())
+
 
 class ParseError(Exception):
     pass
 
@@ -58,24 +65,18 @@ def __setitem__(self, item, *values):
 
     @classmethod
     def parse(cls, line):
-        if ':' not in line:
-            raise ParseError("No ':' in line '{}'".format(line))
-
-        # Separate key and value
-        splitted = line.split(':', 1)
-        key, value = splitted[0], splitted[1].strip()
-
-        # Separate name and params
-        splitted = key.split(';')
-        name, params_strings = splitted[0], splitted[1:]
+        try:
+            ast = GRAMMAR.parse(line + CRLF)
+        except tatsu.exceptions.FailedToken:
+            raise ParseError()
 
-        # Separate key and values for params
+        name = ''.join(ast['name'])
+        value = ''.join(ast['value'])
         params = {}
-        for paramstr in params_strings:
-            if '=' not in paramstr:
-                raise ParseError("No '=' in line '{}'".format(paramstr))
-            pname, pvals = paramstr.split('=', 1)
-            params[pname] = pvals.split(',')
+        for param_ast in ast.get('params', []):
+            param_name = ''.join(param_ast["name"])
+            param_values = [''.join(x) for x in param_ast["values_"]]
+            params[param_name] = param_values
         return cls(name, params, value)
 
     def clone(self):
@@ -170,20 +171,12 @@ def string_to_container(txt):
     return lines_to_container(txt.splitlines())
 
 
-if __name__ == "__main__":
-    from tests.fixture import cal1
-
-    def print_tree(elem, lvl=0):
-        if isinstance(elem, list) or isinstance(elem, Container):
-            if isinstance(elem, Container):
-                print("{}{}".format(' ' * lvl, elem.name))
-            for sub_elem in elem:
-                print_tree(sub_elem, lvl + 1)
-        elif isinstance(elem, ContentLine):
-            print("{}{}{}".format(' ' * lvl,
-                                  elem.name, elem.params, elem.value))
-        else:
-            print('Wuuut?')
-
-    cal = string_to_container(cal1)
-    print_tree(cal)
+def interpret_ast(ast):
+    name = ''.join(ast['name'])
+    value = ''.join(ast['value'])
+    params = {}
+    for param_ast in ast.get('params', []):
+        param_name = ''.join(param_ast["name"])
+        param_values = [''.join(x) for x in param_ast["values_"]]
+        params[param_name] = param_values
+    return ContentLine(name, params, value)
diff --git a/requirements.txt b/requirements.txt
index 8a1c10a6..cc4c8f6f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 python-dateutil
 arrow>=0.11,<0.12
 six>1.5
+tatsu>4.2
diff --git a/tests/contentline.py b/tests/contentline.py
index 6125e914..45ef6c8a 100644
--- a/tests/contentline.py
+++ b/tests/contentline.py
@@ -7,7 +7,6 @@ class TestContentLine(unittest.TestCase):
 
     dataset = {
         'HAHA:': ContentLine('haha'),
-        ':hoho': ContentLine('', {}, 'hoho'),
        'HAHA:hoho': ContentLine('haha', {}, 'hoho'),
         'HAHA:hoho:hihi': ContentLine('haha', {}, 'hoho:hihi'),
         'HAHA;hoho=1:hoho': ContentLine('haha', {'hoho': ['1']}, 'hoho'),
@@ -44,6 +43,18 @@ class TestContentLine(unittest.TestCase):
             {'hoho': ['p1', 'p2'], 'hihi': ['p3', 'p4', 'p5']},
             'blabla:blublu'
         ),
+        r'ATTENDEE;X-A="I&rsquo\;ll be in NYC":mailto:a@a.com':
+            ContentLine(
+                'ATTENDEE',
+                {'X-A': [r"I&rsquo\;ll be in NYC"]},
+                'mailto:a@a.com',
+            ),
+        'DTEND;TZID="UTC":20190107T000000':
+            ContentLine(
+                "DTEND",
+                {'TZID': ['UTC']},
+                "20190107T000000"
+            )
     }
 
     def test_errors(self):
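
A minimal usage sketch of the behaviour the new test cases above encode, assuming
the patched module stays importable as `ics.parse` and that ContentLine exposes the
.name, .params and .value attributes used in the diff. With the TatSu grammar, quoted
parameter values may contain ':' and escaped ';' without breaking the
name/params/value split:

    # Sketch only; the expected values mirror the entries added to tests/contentline.py.
    from ics.parse import ContentLine

    # A DQUOTE-delimited parameter value may contain ':' characters.
    dtend = ContentLine.parse('DTEND;TZID="UTC":20190107T000000')
    assert dtend.name == 'DTEND'
    assert dtend.params == {'TZID': ['UTC']}
    assert dtend.value == '20190107T000000'

    # A quoted parameter containing an escaped ';' stays a single value.
    attendee = ContentLine.parse(
        r'ATTENDEE;X-A="I&rsquo\;ll be in NYC":mailto:a@a.com')
    assert attendee.params == {'X-A': [r"I&rsquo\;ll be in NYC"]}
    assert attendee.value == 'mailto:a@a.com'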