Skip to content

Commit

Permalink
Initial attempts at a "commented out code" finder
Browse files Browse the repository at this point in the history
  • Loading branch information
carlio committed Aug 21, 2014
1 parent 775cdfd commit 8e56df9
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 18 deletions.
113 changes: 113 additions & 0 deletions dodgy/coc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Simple module to tokenize a python module to extract comments, then
analyse the comments line-by-line for a probability of being code.
If a certain threshold is reached, then a warning about the comment
being commented-out-code is created.
"""
import re
import tokenize
import keyword
from tokenize import TokenError
from collections import defaultdict
from cStringIO import StringIO


# Score multipliers per token type; any type not listed here defaults to
# a weight of 1 (see _calculate_token_scores).
_TOKEN_WEIGHTS = {
    # A comment inside a comment is a very strong indicator
    # that this was code commented out automatically!
    tokenize.COMMENT: 2
}

# Extra multipliers applied to Python keywords.  Statement-introducing
# keywords ('def', 'elif', 'exec', 'lambda') strongly suggest code, while
# keywords that are also common English words ('and', 'is') are discounted.
_KEYWORD_WEIGHTS = {
    'and': 0.5,
    'def': 2,
    'elif': 2,
    'exec': 2,
    'is': 0.5,
    'lambda': 2,
}

# Warning key and message text emitted for suspected commented-out code.
_MESSAGE_KEY = 'coc'
_MESSAGE_TEXT = 'Commented out code'
# Minimum average token score for a comment run to be reported.
_THRESHOLD = 0.75

# PEP263, see http://legacy.python.org/dev/peps/pep-0263/
# Matches encoding declarations like '# -*- coding: utf-8 -*-', which must
# never be flagged as commented-out code.
_ENCODING_REGEXP = re.compile(r'coding[:=]\s*([-\w.]+)')


def _make_tokens(string):
readline = StringIO(string).readline
return tokenize.generate_tokens(readline)


def _calculate_token_scores(text):
    """Score *text* for how code-like it is.

    Returns a (total_weight, token_count) pair, or (None, None) when the
    text should not be scored at all: either it is a PEP 263 encoding
    declaration, or it cannot be tokenized.
    """
    # First, let's ignore any "special" comments
    if _ENCODING_REGEXP.search(text):
        return None, None

    # We won't be able to tokenize every line properly, and if we can't,
    # that likely means that the line is not real code.
    try:
        tokens = list(_make_tokens(text))
    except TokenError:
        return None, None

    # token.NAME covers both identifiers and keywords.  Ordinary prose is
    # mostly identifiers, so plain names contribute nothing; keywords and
    # every non-NAME token count as evidence that the text is code.
    total = 0.0
    for tok in tokens:
        tok_type, tok_string = tok[0], tok[1]
        if tok_type == tokenize.NAME:
            if keyword.iskeyword(tok_string):
                total += _TOKEN_WEIGHTS.get(tok_type, 1) * \
                    _KEYWORD_WEIGHTS.get(tok_string, 1)
            # non-keyword names score zero
        else:
            total += _TOKEN_WEIGHTS.get(tok_type, 1)

    return total, len(tokens)


def calculate_line_scores(text):
    """Tokenize *text* and return a {line_number: score} dict for comments.

    The score for each line is the average token weight of the comment's
    contents; higher values mean the comment is more likely to be
    commented-out code.  Comments that cannot be scored (encoding
    declarations, untokenizable text) are omitted from the result.
    """
    lines = defaultdict(lambda: [0.0, 0])
    # first parse the contents and get the comments
    for token in _make_tokens(text):
        if token[0] != tokenize.COMMENT:
            continue
        # Strip the leading '#' and surrounding whitespace so the rest of
        # the comment can be tokenized as if it were plain code.  (Use a
        # raw string for the pattern and a fresh name rather than
        # clobbering the 'text' parameter mid-iteration.)
        comment_text = re.sub(r'^\s*#\s*', '', token[1])
        score, token_count = _calculate_token_scores(comment_text)
        if score is None:
            continue
        start_line = token[2][0]
        lines[start_line][0] += score
        lines[start_line][1] += token_count

    # Average the accumulated weight per line.  token_count is at least 1
    # for any successfully tokenized text (tokenize always emits an
    # ENDMARKER), so no division by zero here.
    return {lno: w[0] / w[1] for lno, w in lines.items()}


def check_file(filepath, contents):
    """Check a Python file's comments for commented-out code.

    Returns a list of (line_number, message_key, message_text) tuples,
    one per comment run whose average score reaches _THRESHOLD.  Files
    without a '.py' extension are skipped entirely.
    """
    if not filepath.endswith('.py'):
        return []
    scores = calculate_line_scores(contents)

    # Combine adjacent lines - multiple lines containing probable code
    # increase the likelihood we have a correct match.  Each entry maps
    # the first line of a run to the list of scores in that run.
    combined = {}
    for lno in sorted(scores):
        if lno - 1 in combined:
            combined[lno - 1].append(scores[lno])
        else:
            combined[lno] = [scores[lno]]

    # Compare the *average* score of each run against the threshold;
    # the scores are floats, so this division is exact enough for the
    # 0..~2 range the weights produce.
    return [(lno, _MESSAGE_KEY, _MESSAGE_TEXT)
            for lno, weights in combined.items()
            if sum(weights) / len(weights) >= _THRESHOLD]
10 changes: 2 additions & 8 deletions dodgy/checks.py → dodgy/passwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,9 @@ def check_line(line, check_list):
return messages


def check_file(filepath):
with open(filepath) as to_check:
return check_file_contents(to_check.read())


def check_file_contents(file_contents):
def check_file(_, contents):
messages = []

for line_number0, line in enumerate(file_contents.split('\n')):
for line_number0, line in enumerate(contents.split('\n')):
for check_list in (STRING_VALS, LINE_VALS, VAR_NAMES):
messages += [(line_number0+1, key, msg) for key, msg in check_line(line, check_list)]

Expand Down
26 changes: 18 additions & 8 deletions dodgy/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import mimetypes
import json
from dodgy.checks import check_file
from dodgy import passwords, coc


IGNORE_PATHS = [re.compile(patt % {'sep': os.path.sep}) for patt in (
Expand All @@ -28,6 +28,8 @@ def run_checks(directory, ignore_paths=None):
ignore_paths = [re.compile(patt) for patt in ignore_paths]
ignore_paths += IGNORE_PATHS

checks = [passwords, coc]

filepaths = list_files(directory)
for filepath in filepaths:
relpath = os.path.relpath(filepath, directory)
Expand All @@ -39,13 +41,21 @@ def run_checks(directory, ignore_paths=None):
if mimetype[0] is None or not mimetype[0].startswith('text/'):
continue

for msg_parts in check_file(filepath):
warnings.append({
'path': relpath,
'line': msg_parts[0],
'code': msg_parts[1],
'message': msg_parts[2]
})
# Read the file once; every check receives the same contents.  Using a
# context manager guarantees the handle is closed even if a check raises.
with open(filepath) as file_handle:
    contents = file_handle.read()

for check in checks:
    for msg_parts in check.check_file(filepath, contents):
        warnings.append({
            'path': relpath,
            'line': msg_parts[0],
            'code': msg_parts[1],
            'message': msg_parts[2],
        })

# Sort warnings deterministically: by path, then by line number.
# (list.sort(cmp=...) requires a three-way negative/zero/positive
# comparator; a comparator returning a boolean, as the previous _sort
# did, produces an incorrect ordering.  A key function is both correct
# and faster.)
warnings.sort(key=lambda warning: (warning['path'], warning['line']))

return warnings

Expand Down
5 changes: 3 additions & 2 deletions tests/test_checks.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import os
from unittest import TestCase
from dodgy.checks import check_file
from dodgy.passwords import check_file


class TestChecks(TestCase):

def _run_checks(self, file_name):
    """Run the password checks over *file_name* from the testdata dir.

    The file is read through a context manager so the handle is closed
    deterministically (the previous open(...).read() leaked it).
    """
    filepath = os.path.join(os.path.dirname(__file__), 'testdata', file_name)
    with open(filepath) as test_file:
        contents = test_file.read()
    return check_file(filepath, contents)

def _check_messages(self, messages, expected_keys):
if expected_keys == (None,):
Expand Down
28 changes: 28 additions & 0 deletions tests/test_coc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from unittest import TestCase
from dodgy.coc import calculate_line_scores
import os
import re

class TestCOC(TestCase):
    """Exercises calculate_line_scores against annotated fixture files.

    Each file under testdata/coc begins with a marker comment of the form
    '# --CODE: 6,7' listing the line numbers expected to be flagged as
    commented-out code.
    """

    def test_coc(self):
        testdir = os.path.join(os.path.dirname(__file__), 'testdata', 'coc')
        testdir = os.path.abspath(testdir)
        for filename in os.listdir(testdir):
            filepath = os.path.join(testdir, filename)
            # Context manager closes the fixture file (open().readlines()
            # leaked the handle).
            with open(filepath) as test_file:
                contents = test_file.readlines()
            expected = re.match(r'^# --CODE:(.*)$', contents[0]).group(1)
            expected = set(map(int, expected.split(',')))
            results = calculate_line_scores(''.join(contents))

            found = set(results.keys())
            # We ignore line 1, since it's the list of lines to expect.
            # discard() (unlike remove()) is a no-op if line 1 was never
            # flagged, so a clean result doesn't raise KeyError.
            found.discard(1)

            not_found = expected - found
            not_expected = found - expected

            if len(not_found) > 0:
                self.fail("The following lines should have been marked as code in file %s: %s" % (filename, not_found))
            if len(not_expected) > 0:
                self.fail("The following lines should not have been marked in file %s: %s" % (filename, not_expected))
8 changes: 8 additions & 0 deletions tests/testdata/coc/module1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# --CODE: 6

def do_some_stuff(x):
x += 5
y = 2 * x
# y = 3 * x
print y
return y

0 comments on commit 8e56df9

Please sign in to comment.