Add Tatsu to have a real PEG parser from the EBNF grammar (#194)
Add 竜 TatSu as a dependency.
This enables us to have a real PEG parser and not a combination of
regexes and string splitting.

Fix parsing of quoted values as well as escaped semicolons.
This fixes #185 and fixes #193

Note: Adding TatSu might have made the parser significantly slower in some cases.
C4ptainCrunch committed Aug 17, 2019
1 parent 61453e4 commit 6b71a49
Showing 7 changed files with 85 additions and 40 deletions.
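
Before the per-file diffs, an illustrative sketch (not part of this commit) of why the old split-based parser fails on the inputs reported in #185 and #193: an escaped ';' inside a quoted parameter value is treated as a parameter separator.

# Illustrative only: mimics the split-based logic that this commit removes from ics/parse.py.
line = r'ATTENDEE;X-A="I&rsquo\;ll be in NYC":mailto:a@a.com'
key, value = line.split(':', 1)   # the key/value split happens to work for this line
params = key.split(';')           # but the escaped ';' cuts the quoted value in half
print(params)                     # ['ATTENDEE', 'X-A="I&rsquo\\', 'll be in NYC"']

The PEG grammar added below handles quoted values explicitly, so the full parameter value survives.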
15 changes: 13 additions & 2 deletions CHANGELOG.rst
@@ -7,13 +7,24 @@ Ics.py changelog
0.6
**************

Major changes:
- Drop support for Python 3.5. Python 3.7 is now distributed in both Ubuntu LTS
  and Debian stable, and the PSF provides only security fixes for 3.5. It's time
  to move on!
- Add `竜 TatSu <https://pypi.org/project/TatSu/>`_ as a dependency.
This enables us to have a real PEG parser and not a combination of
regexes and string splitting.

Minor features:
- Add mypy
- Drop support for Python 3.5. Python 3.7 is now distributed in both Ubuntu LTS and Debian stable,
  and the PSF provides only security fixes for 3.5. It's time to move on!
- Add GEO (thanks @johnnoone!)

Bug fixes:
- Events no longer have the TRANSP property by default (Fixes #190)
- Fix parsing of quoted values as well as escaped semicolons (#185 and #193)

Regressions:
- Adding TatSu might have made the parser significantly slower in some cases.


**************
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -3,6 +3,7 @@ include CHANGELOG.rst
include LICENSE
include AUTHORS.rst
include CONTRIBUTING.rst
include ics/contentline.ebnf
include meta.py
include tests/fixtures/README

5 changes: 1 addition & 4 deletions README.rst
@@ -6,9 +6,6 @@ Ics.py : iCalendar for Humans
.. image:: https://travis-ci.org/C4ptainCrunch/ics.py.png?branch=master
:target: https://travis-ci.org/C4ptainCrunch/ics.py

.. image:: https://coveralls.io/repos/C4ptainCrunch/ics.py/badge.png
:target: https://coveralls.io/r/C4ptainCrunch/ics.py
:alt: Coverage

.. image:: https://img.shields.io/github/license/c4ptaincrunch/ics.py.svg
:target: https://pypi.python.org/pypi/ics/
@@ -21,7 +18,7 @@ iCalendar is a widely-used and useful format but not user friendly. Ics.py is th

It should be able to parse every calendar that respects the `rfc5545 <http://tools.ietf.org/html/rfc5545>`_ and maybe some more… It also outputs rfc compliant calendars.

iCalendar (file extension `.ics`) is used by Sunbird, Google Calendar, Apple Calendar, Android
iCalendar (file extension `.ics`) is used by Google Calendar, Apple Calendar, Android and many more.


Ics.py is available for Python>=3.6 and is Apache2 Licensed.
31 changes: 31 additions & 0 deletions ics/contentline.ebnf
@@ -0,0 +1,31 @@
@@grammar::contentline
@@whitespace :: //
start = contentline $ ;
ALPHA = ?"[a-zA-Z]" ;
DIGIT = ? "[0-9]" ;
CRLF = "\r\n" ;
WSP = " ";
DQUOTE = '"' ;
QSAFE_CHAR = WSP | ?"\x21" | ?"[\x23-\x7E]" | ?"[\u0080-\uffff]";
SAFE_CHAR = WSP | ?"\x21" | ?"[\x23-\x2B]" | ?"[\x2D-\x39]" | ?"[\x3C-\x7E]" | ?"[\u0080-\uffff]" ;
VALUE_CHAR = WSP | ?"[\x21-\x7E]" | ?"[\u0080-\uffff]";
name = iana_token | x_name ;
iana_token = {(ALPHA | DIGIT | "-")}+ ;
x_name = "X-" [vendorid "-"] {(ALPHA | DIGIT | "-")}+ ;
vendorid = (ALPHA | DIGIT) (ALPHA | DIGIT) {(ALPHA | DIGIT)}+ ;
contentline = name:name {(";" params+:param )}* ":" value:value CRLF ;
param = name:param_name "=" values+:param_value {("," values+:param_value)}* ;
param_name = iana_token | x_name ;
param_value = quoted_string | paramtext ;
paramtext = {SAFE_CHAR}* ;
value = {VALUE_CHAR}* ;
quoted_string = DQUOTE @:{QSAFE_CHAR}* DQUOTE ;
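
A minimal usage sketch (not part of the diff; the file path and sample line are illustrative) showing how this grammar is compiled and queried with TatSu, mirroring what ics/parse.py now does:

import tatsu

with open('ics/contentline.ebnf') as fd:
    grammar = tatsu.compile(fd.read())

# Content lines must be CRLF-terminated, as the grammar requires.
ast = grammar.parse('DTEND;TZID="UTC":20190107T000000\r\n')
print(''.join(ast['name']))    # DTEND
print(''.join(ast['value']))   # 20190107T000000

param = ast['params'][0]
print(''.join(param['name']))                  # TZID
print([''.join(v) for v in param['values_']])  # ['UTC']  (the surrounding quotes are stripped)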
59 changes: 26 additions & 33 deletions ics/parse.py
@@ -2,11 +2,18 @@
# -*- coding: utf-8 -*-

from __future__ import unicode_literals, absolute_import
from pathlib import Path

import collections
import tatsu

CRLF = '\r\n'

grammar_path = Path(__file__).parent.joinpath('contentline.ebnf')

with open(grammar_path) as fd:
    GRAMMAR = tatsu.compile(fd.read())


class ParseError(Exception):
    pass
@@ -58,24 +65,18 @@ def __setitem__(self, item, *values):

    @classmethod
    def parse(cls, line):
        if ':' not in line:
            raise ParseError("No ':' in line '{}'".format(line))

        # Separate key and value
        splitted = line.split(':', 1)
        key, value = splitted[0], splitted[1].strip()

        # Separate name and params
        splitted = key.split(';')
        name, params_strings = splitted[0], splitted[1:]
        try:
            ast = GRAMMAR.parse(line + CRLF)
        except tatsu.exceptions.FailedToken:
            raise ParseError()

        # Separate key and values for params
        name = ''.join(ast['name'])
        value = ''.join(ast['value'])
        params = {}
        for paramstr in params_strings:
            if '=' not in paramstr:
                raise ParseError("No '=' in line '{}'".format(paramstr))
            pname, pvals = paramstr.split('=', 1)
            params[pname] = pvals.split(',')
        for param_ast in ast.get('params', []):
            param_name = ''.join(param_ast["name"])
            param_values = [''.join(x) for x in param_ast["values_"]]
            params[param_name] = param_values
        return cls(name, params, value)

def clone(self):
@@ -170,20 +171,12 @@ def string_to_container(txt):
    return lines_to_container(txt.splitlines())


if __name__ == "__main__":
    from tests.fixture import cal1

    def print_tree(elem, lvl=0):
        if isinstance(elem, list) or isinstance(elem, Container):
            if isinstance(elem, Container):
                print("{}{}".format(' ' * lvl, elem.name))
            for sub_elem in elem:
                print_tree(sub_elem, lvl + 1)
        elif isinstance(elem, ContentLine):
            print("{}{}{}".format(' ' * lvl,
                                  elem.name, elem.params, elem.value))
        else:
            print('Wuuut?')

    cal = string_to_container(cal1)
    print_tree(cal)
def interpret_ast(ast):
    name = ''.join(ast['name'])
    value = ''.join(ast['value'])
    params = {}
    for param_ast in ast.get('params', []):
        param_name = ''.join(param_ast["name"])
        param_values = [''.join(x) for x in param_ast["values_"]]
        params[param_name] = param_values
    return ContentLine(name, params, value)
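
For completeness, a hedged usage sketch of the reworked ContentLine.parse (assuming the module is importable as ics.parse, per the file above), using one of the fixtures added to tests/contentline.py below:

from ics.parse import ContentLine

cl = ContentLine.parse(r'ATTENDEE;X-A="I&rsquo\;ll be in NYC":mailto:a@a.com')
print(cl.name)            # ATTENDEE
print(cl.params['X-A'])   # ['I&rsquo\\;ll be in NYC']
print(cl.value)           # mailto:a@a.com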
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
python-dateutil
arrow>=0.11,<0.12
six>1.5
tatsu>4.2
13 changes: 12 additions & 1 deletion tests/contentline.py
@@ -7,7 +7,6 @@ class TestContentLine(unittest.TestCase):

    dataset = {
        'HAHA:': ContentLine('haha'),
        ':hoho': ContentLine('', {}, 'hoho'),
        'HAHA:hoho': ContentLine('haha', {}, 'hoho'),
        'HAHA:hoho:hihi': ContentLine('haha', {}, 'hoho:hihi'),
        'HAHA;hoho=1:hoho': ContentLine('haha', {'hoho': ['1']}, 'hoho'),
@@ -44,6 +43,18 @@ class TestContentLine(unittest.TestCase):
            {'hoho': ['p1', 'p2'], 'hihi': ['p3', 'p4', 'p5']},
            'blabla:blublu'
        ),
        r'ATTENDEE;X-A="I&rsquo\;ll be in NYC":mailto:a@a.com':
            ContentLine(
                'ATTENDEE',
                {'X-A': [r"I&rsquo\;ll be in NYC"]},
                'mailto:a@a.com',
            ),
        'DTEND;TZID="UTC":20190107T000000':
            ContentLine(
                "DTEND",
                {'TZID': ['UTC']},
                "20190107T000000"
            )
    }

    def test_errors(self):
