Fixes to the python grammar #1351

Draft
wants to merge 8 commits into base: master
Changes from all commits
1 change: 1 addition & 0 deletions examples/python-grammar-tests/Cpython-tests/.gitignore
@@ -0,0 +1 @@
*.py
19 changes: 19 additions & 0 deletions examples/python-grammar-tests/all_parse_succesfully.py
@@ -0,0 +1,19 @@
from __future__ import annotations

import logging
import os

from lark import Lark, UnexpectedInput, logger
from lark.indenter import PythonIndenter
from pathlib import Path
logger.setLevel(logging.DEBUG)
python_parser3 = Lark.open_from_package('lark', 'python.lark', ['grammars'],
        parser='lalr', postlex=PythonIndenter(), start=['file_input', 'single_input', 'eval_input'], debug=True)
# python_parser3.parse('def f(it, *varargs, **kwargs):\n return list(it)\n\n\n', start="single_input")

for file in (Path(__file__).parent / "Cpython-tests").glob("*.py"):
    try:
        tree = python_parser3.parse(file.read_text(encoding="utf-8"), start="file_input")
    except UnexpectedInput as e:
        print(f'File "{file}", line {e.line}')
        print(f"{e.__class__.__qualname__}: {str(e)}")
45 changes: 45 additions & 0 deletions examples/python-grammar-tests/get_std_tests.py
@@ -0,0 +1,45 @@
"""
Downloads the three test files from the Cpython repo for their parser.
These are then analyzed, preprocessed and then run by other scripts in this folder
"""
import urllib.request
import os

files = {
"Lib/test/test_grammar.py": ["test_with_statement"], # List of function names to comment out
"Lib/test/test_syntax.py": [],
"Lib/test/test_exceptions.py": [],
"Lib/test/test_patma.py": [],
"Lib/test/test_pep646_syntax.py": [],
}

url_template = "https://raw.githubusercontent.com/python/cpython/main/{}"
file_template = f"{os.path.dirname(__file__)}/CPython-tests/{{}}"

for filename in files:
    file = file_template.format(filename.rpartition("/")[2])
    print(file)
    urllib.request.urlretrieve(
        url_template.format(filename),
        file
    )
    if files[filename]:
        with open(file, "r+", encoding="utf-8") as f:
            out = []
            commenting_out = None
            f.seek(0)
            for line in f.readlines():
                if any(name in line for name in files[filename]):
                    commenting_out = line[:line.index("def")] + ' '
                    out.append(f"# {line}")
                    continue
                if commenting_out is not None and (
                        line.startswith(commenting_out) or
                        line.strip() == '' or
                        line.strip().startswith('#')):
                    out.append(f"# {line}")
                else:
                    commenting_out = None
                    out.append(line)
            f.seek(0)
            f.writelines(out)
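The commenting-out pass above is easiest to see on a small input. Here is a self-contained sketch of the same logic applied to an in-memory string (the helper name and sample text are made up for illustration, not part of the PR):

# Sketch of the filtering logic above, applied to a string instead of a downloaded file.
def comment_out(source: str, names: list[str]) -> str:
    out = []
    commenting_out = None
    for line in source.splitlines(keepends=True):
        if any(name in line for name in names):
            # Remember the indentation of the matched "def" line...
            commenting_out = line[:line.index("def")] + ' '
            out.append(f"# {line}")
            continue
        if commenting_out is not None and (
                line.startswith(commenting_out) or
                line.strip() == '' or
                line.strip().startswith('#')):
            # ...and comment out everything indented deeper, plus blank and comment lines.
            out.append(f"# {line}")
        else:
            commenting_out = None
            out.append(line)
    return "".join(out)

sample = (
    "def test_with_statement(self):\n"
    "    with open('x') as f:\n"
    "        pass\n"
    "\n"
    "def test_other(self):\n"
    "    pass\n"
)
print(comment_out(sample, ["test_with_statement"]))
# Only test_with_statement and its indented body (plus the blank line) come out commented.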
69 changes: 69 additions & 0 deletions examples/python-grammar-tests/semi_run_doctests.py
@@ -0,0 +1,69 @@
from __future__ import annotations
from __future__ import annotations

import logging
import os
import doctest

from lark import Lark, UnexpectedInput, logger
from lark.indenter import PythonIndenter
from pathlib import Path
logger.setLevel(logging.DEBUG)
python_parser3 = Lark.open_from_package('lark', 'python.lark', ['grammars'],
        parser='lalr', postlex=PythonIndenter(), start=['file_input', 'single_input', 'eval_input'], debug=True)

no_print = {
"fp": lambda *args, **kwargs: None,
"fn": lambda *args, **kwargs: None,

"tp": lambda *args, **kwargs: None,
"tn": lambda *args, **kwargs: None
}


print_all = {
"fp": print,
"fn": print,

"tp": print,
"tn": print
}

for file in (Path(__file__).parent / "Cpython-tests").glob("*.py"):
    text = file.read_text(encoding="utf-8")
    if "import doctest" in text:
        doc_parser = doctest.DocTestParser()
        docstring = next(t.value for t in python_parser3.lex(text) if "STRING" in t.type)
        examples = doc_parser.get_examples(eval(docstring), str(file))
        data = {"fp": 0, "tp": 0, "fn": 0, "tn": 0}
        if "test_pep646_syntax" in file.name:
            functions = print_all
        else:
            functions = no_print
        for example in examples:
            try:
                tree = python_parser3.parse(example.source + "\n", start="single_input")
                err = None
            except UnexpectedInput as e:
                tree = None
                err = e
            if example.exc_msg is not None:
                if err is None:
                    functions["fp"](f"Unexpected success with example:\n{example.source.rstrip()}")
                    functions["fp"]("Expected error message:", example.exc_msg.rstrip())
                    functions["fp"]()
                    data["fp"] += 1
                else:
                    functions["tn"]("Correctly errored on:\n", example.source.rstrip())
                    data["tn"] += 1
            else:
                if err is not None:
                    functions["fn"](f"Unexpected failure with example:\n{example.source.rstrip()}")
                    functions["fn"](f"Got error message: {err.__class__.__qualname__}: {str(err)}")
                    functions["fn"](repr(example.source))
                    functions["fn"]()
                    data["fn"] += 1
                else:
                    functions["tp"]("Correctly parsed:\n", example.source.rstrip())
                    data["tp"] += 1
        print(file, data)
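The fp/fn/tp/tn split above hinges on doctest's Example objects: exc_msg is non-None exactly when an example's expected output is a traceback. A minimal sketch of that API (the sample docstring is made up, not taken from the CPython tests):

# What doctest.DocTestParser().get_examples() returns for a tiny docstring.
import doctest

sample = '''
>>> 1 + 1
2
>>> int("x")
Traceback (most recent call last):
    ...
ValueError: invalid literal for int() with base 10: 'x'
'''

for ex in doctest.DocTestParser().get_examples(sample, "<sample>"):
    # ex.source is the statement to run; ex.exc_msg is the expected exception
    # message, or None when the example is expected to succeed.
    print(repr(ex.source), "->", repr(ex.exc_msg))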
70 changes: 35 additions & 35 deletions lark/grammars/python.lark
@@ -11,30 +11,31 @@
// NB: compound_stmt in single_input is followed by extra NEWLINE!
//

single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE
single_input: _NEWLINE* (simple_stmt | compound_stmt _NEWLINE*)
file_input: (_NEWLINE | stmt)*
eval_input: testlist _NEWLINE*

decorator: "@" dotted_name [ "(" [arguments] ")" ] _NEWLINE
decorator: "@" test _NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)

async_funcdef: "async" funcdef
funcdef: "def" name "(" [parameters] ")" ["->" test] ":" suite

parameters: paramvalue ("," paramvalue)* ["," SLASH ("," paramvalue)*] ["," [starparams | kwparams]]
| starparams
| kwparams

SLASH: "/" // Otherwise the it will completely disappear and it will be undisguisable in the result
starparams: (starparam | starguard) poststarparams
starparam: "*" typedparam
starguard: "*"
poststarparams: ("," paramvalue)* ["," kwparams]
kwparams: "**" typedparam ","?
parameters: slash_params ("," paramvalue)* ("," star_etc?)?
| paramvalue ("," paramvalue)* ("," star_etc?)?
| star_etc

?paramvalue: typedparam ("=" test)?
?typedparam: name (":" test)?
slash_params: paramvalue ("," paramvalue)* "," "/"
star_etc: kwds ","?
| "*" typedstarparam ("," paramvalue)* ("," kwds)? ","?
| "*" ("," paramvalue)+ ("," kwds)? ","?
kwds: "**" typedparam

paramvalue: typedparam ("=" test)?
typedparam: name (":" test)?
typedstarparam: name (":" (test | star_expr))?


lambdef: "lambda" [lambda_params] ":" test
@@ -95,16 +96,18 @@ for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite]
try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally]
| "try" ":" suite finally -> try_finally
finally: "finally" ":" suite
except_clauses: except_clause+
except_clauses: (except_clause+ | except_star_clause+)
except_star_clause: EXCEPT_STAR [test ["as" name]] ":" suite
except_clause: "except" [test ["as" name]] ":" suite
EXCEPT_STAR.1: "except*"
// NB compile.c makes sure that the default except clause is last


with_stmt: "with" with_items ":" suite
with_items: with_item ("," with_item)*
with_item: test ["as" name]
with_item: test ["as" (name|("(" _cs_list{name} ")"))]

match_stmt: "match" test ":" _NEWLINE _INDENT case+ _DEDENT
match_stmt: "match" testlist ":" _NEWLINE _INDENT case+ _DEDENT

case: "case" pattern ["if" test] ":" suite

@@ -125,11 +128,13 @@ case: "case" pattern ["if" test] ":" suite

literal_pattern: inner_literal_pattern

?inner_literal_pattern: "None" -> const_none
| "True" -> const_true
| "False" -> const_false
| STRING -> string
| number
!?inner_literal_pattern: "None" -> const_none
| "True" -> const_true
| "False" -> const_false
| STRING -> string
| number
| "-" number -> neg_number
| "-"? number ("+"|"-") number -> complex_number

attr_pattern: NAME ("." NAME)+ -> value

@@ -142,11 +147,9 @@ _sequence_pattern: (sequence_item_pattern ("," sequence_item_pattern)* ","?)?
| "*" NAME -> star_pattern

class_pattern: name_or_attr_pattern "(" [arguments_pattern ","?] ")"
arguments_pattern: pos_arg_pattern ["," keyws_arg_pattern]
| keyws_arg_pattern -> no_pos_arguments
arguments_pattern: as_pattern ("," as_pattern)* ("," keyw_arg_pattern ("," keyw_arg_pattern)*)?
| keyw_arg_pattern ("," keyw_arg_pattern)*

pos_arg_pattern: as_pattern ("," as_pattern)*
keyws_arg_pattern: keyw_arg_pattern ("," keyw_arg_pattern)*
keyw_arg_pattern: NAME "=" as_pattern


@@ -222,12 +225,12 @@ _tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",")

?subscriptlist: subscript
| subscript (("," subscript)+ [","] | ",") -> subscript_tuple
?subscript: test | ([test] ":" [test] [sliceop]) -> slice
?subscript: test | star_expr | ([test] ":" [test] [sliceop]) -> slice
sliceop: ":" [test]
?exprlist: (expr|star_expr)
| (expr|star_expr) (("," (expr|star_expr))+ [","]|",")
?testlist: test | testlist_tuple
testlist_tuple: test (("," test)+ [","] | ",")
testlist_tuple: (test|star_expr) (("," (test|star_expr))+ [","] | ",")
_dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","]

key_value: test ":" test
@@ -250,15 +253,12 @@ kwargs: "**" test ("," argvalue)*
?argvalue: test ("=" test)?


comprehension{comp_result}: comp_result comp_fors [comp_if]
comp_fors: comp_for+
comp_for: [ASYNC] "for" exprlist "in" or_test
comprehension{comp_result}: comp_result comp_forifs
comp_forifs: comp_forif+
comp_forif: [ASYNC] "for" exprlist "in" or_test comp_if*
ASYNC: "async"
?comp_if: "if" test_nocond

// not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: name

yield_expr: "yield" [testlist]
| "yield" "from" test -> yield_from

Expand All @@ -267,7 +267,7 @@ string: STRING | LONG_STRING

// Other terminals

_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+
_NEWLINE: (COMMENT? /\r?\n[\t ]*/ )+

%ignore /[\t \f]+/ // WS
%ignore /\\[\t \f]*\r?\n/ // LINE_CONT
@@ -281,7 +281,7 @@ _NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+
NAME: /[^\W\d]\w*/
COMMENT: /#[^\n]*/

STRING: /([ubf]?r?|r[ubf])("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i
STRING: /([ubf]?r?|r[ubf])("(?!"")([^\\\n"]|\\(.|\n))*"|'(?!'')([^\\\n']|\\(.|\n))*')/i
LONG_STRING: /([ubf]?r?|r[ubf])(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/is

_SPECIAL_DEC: "0".."9" ("_"? "0".."9" )*
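To make the grammar changes concrete, here is a sketch that feeds a few of the constructs they target through the bundled grammar, loaded the same way as in the test scripts above: except* groups (PEP 654), parenthesized with targets, starred annotations and subscripts (PEP 646), and tuple subjects plus complex literals in match. The snippets are illustrative and are expected to parse only with a lark build that includes these grammar changes.

# Illustrative snippets aimed at the revised python.lark; on an older grammar
# they are expected to raise UnexpectedInput instead of parsing.
from lark import Lark
from lark.indenter import PythonIndenter

parser = Lark.open_from_package('lark', 'python.lark', ['grammars'],
                                parser='lalr', postlex=PythonIndenter(),
                                start=['file_input'])

snippets = [
    "try:\n    f()\nexcept* ValueError as e:\n    pass\n",   # except_star_clause
    "with ctx() as (a, b):\n    pass\n",                     # parenthesized with_item targets
    "def f(*args: *Ts) -> None:\n    pass\n",                # typedstarparam (PEP 646)
    "x = m[0, *idx]\n",                                      # star_expr inside a subscript
    "match p, q:\n    case 1 + 2j:\n        pass\n",         # testlist subject, complex literal pattern
]
for src in snippets:
    parser.parse(src, start='file_input')
    print("parsed:", src.splitlines()[0])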
2 changes: 1 addition & 1 deletion lark/parsers/lalr_parser.py
@@ -101,7 +101,7 @@ def parse_from_state(self, state: ParserState, last_token: Optional[Token]=None)
                assert token is not None
                state.feed_token(token)

            end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
            end_token = Token.new_borrow_pos('$END', '', token) if token is not None else Token('$END', '', 0, 1, 1)
            return state.feed_token(end_token, True)
        except UnexpectedInput as e:
            try:
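The one-line change above swaps a truthiness check for an identity check. The likely motivation: Token subclasses str, so a real token whose value happens to be the empty string is falsy, and `if token` would then wrongly fall back to the default $END position (line 1, column 1) instead of borrowing the last token's position. A small check of the distinction (illustrative, not from the PR):

# Token subclasses str, so truthiness follows the token's text, not its presence.
from lark import Token

tok = Token('_DEDENT', '')   # a real token object whose string value is empty
assert not tok               # falsy: `if tok` behaves as if there were no token at all
assert tok is not None       # the identity check used in the fix still sees it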