comment_parser: Add Python parser.

jeanralphaviles · Sep 28, 2019 · 237b77d · 237b77d
1 parent 81fe6f6
commit 237b77d
Show file tree

Hide file tree

Showing 5 changed files with 80 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -150,6 +150,7 @@ twine upload dist/*
 | HTML        | text/html                |
 | Java        | text/x-java-source       |
 | Javascript  | application/javascript   |
+| Python      | text/x-python            |
 | Ruby        | text/x-ruby              |
 | Shell       | text/x-shellscript       |
 | XML         | text/xml                 |

diff --git a/comment_parser/comment_parser.py b/comment_parser/comment_parser.py
@@ -25,6 +25,7 @@
 from comment_parser.parsers import go_parser
 from comment_parser.parsers import html_parser
 from comment_parser.parsers import js_parser
+from comment_parser.parsers import python_parser
 from comment_parser.parsers import ruby_parser
 from comment_parser.parsers import shell_parser
 
@@ -36,6 +37,7 @@
     'text/x-go': go_parser,  # Go
     'text/x-java-source': c_parser,  # Java
     'text/x-javascript': js_parser,  # Javascript
+    'text/x-python': python_parser,  # Python
     'text/x-ruby': ruby_parser,  # Ruby
     'text/x-shellscript': shell_parser,  # Unix shell
     'text/xml': html_parser,  # XML

diff --git a/comment_parser/parsers/python_parser.py b/comment_parser/parsers/python_parser.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python
+"""This module provides methods for parsing comments from Python scripts."""
+
+import io
+import tokenize
+from comment_parser.parsers import common
+
+
+def extract_comments(code):
+  """Extracts a list of comments from the given Python script.
+
+  Comments are identified using the tokenize module. Does not include function,
+  class, or module docstrings. All comments are single line comments.
+
+  Args:
+    code: String containing code to extract comments from.
+  Returns:
+    Python list of common.Comment in the order that they appear in the code.
+  Raises:
+    tokenize.TokenError: If the code cannot be tokenized.
+  """
+  # Tokenize the str as-is; round-tripping through bytes would make tokenize
+  # re-decode it per a PEP 263 coding cookie, corrupting non-UTF-8 sources.
+  text = code[1:] if code.startswith('\ufeff') else code  # Strip BOM.
+  return [
+      # Each comment is stored without its leading '#' character.
+      common.Comment(tok.string[1:], tok.start[0], False)
+      for tok in tokenize.generate_tokens(io.StringIO(text).readline)
+      if tok.type == tokenize.COMMENT]
diff --git a/comment_parser/parsers/tests/python_parser_test.py b/comment_parser/parsers/tests/python_parser_test.py
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+"""Tests for comment_parser.parsers.python_parser.py"""
+
+import unittest
+from comment_parser.parsers import common
+from comment_parser.parsers import python_parser
+
+
+class PythonParserTest(unittest.TestCase):
+
+  def testComment(self):
+    """A lone comment line is returned without its leading '#'."""
+    source = '# comment'
+    expected = [common.Comment(source[1:], 1, multiline=False)]
+    self.assertEqual(python_parser.extract_comments(source), expected)
+
+  def testCommentInSingleQuotedString(self):
+    """A '#' inside a single-quoted string is not a comment."""
+    source = "'this is # not a comment'"
+    self.assertEqual(python_parser.extract_comments(source), [])
+
+  def testCommentInDoubleQuotedString(self):
+    """A '#' inside a double-quoted string is not a comment."""
+    source = '"this is # not a comment"'
+    self.assertEqual(python_parser.extract_comments(source), [])
+
+  def testNestedStringSingleOutside(self):
+    """Double quotes nested in a single-quoted string do not end it."""
+    source = "'this is \"# not a comment\"'"
+    self.assertEqual(python_parser.extract_comments(source), [])
+
+  def testNestedStringDoubleOutside(self):
+    """Single quotes nested in a double-quoted string do not end it."""
+    source = '"this is \'# not a comment\'"'
+    self.assertEqual(python_parser.extract_comments(source), [])
+
+  def testEscapedSingleQuote(self):
+    """An escaped single quote outside a string does not hide the comment."""
+    source = "\\'# this is a comment"
+    expected = [common.Comment(source[3:], 1, multiline=False)]
+    self.assertEqual(python_parser.extract_comments(source), expected)
+
+  def testEscapedDoubleQuote(self):
+    """An escaped double quote outside a string does not hide the comment."""
+    source = '\\"# this is a comment'
+    expected = [common.Comment(source[3:], 1, multiline=False)]
+    self.assertEqual(python_parser.extract_comments(source), expected)
diff --git a/setup.py b/setup.py
@@ -8,7 +8,7 @@ def readme():
 
 setup(
     name='comment_parser',
-    version='1.1.4',
+    version='1.1.5',
     description='Parse comments from various source files.',
     classifiers=[
         'Development Status :: 5 - Production/Stable',