Skip to content

Commit

Permalink
Initial attempts at a "commented out code" finder
Browse files Browse the repository at this point in the history
  • Loading branch information
carlio committed Aug 21, 2014
1 parent 775cdfd commit 8e56df9
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 18 deletions.
113 changes: 113 additions & 0 deletions dodgy/coc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Simple module to tokenize a python module to extract comments, then
analyse the comments line-by-line for a probability of being code.
If a certain threshold is reached, then a warning about the comment
being commented-out-code is created.
"""
import re
import tokenize
import keyword
from tokenize import TokenError
from collections import defaultdict
from cStringIO import StringIO


# Score multipliers per token type; any type not listed here defaults to
# a weight of 1 (see _calculate_token_scores).
_TOKEN_WEIGHTS = {
    # A comment inside a comment is a very strong indicator
    # that this was code commented out automatically!
    tokenize.COMMENT: 2
}

# Extra multipliers applied to Python keywords.  Statement-introducing
# keywords ('def', 'elif', 'exec', 'lambda') strongly suggest code, while
# keywords that are also common English words ('and', 'is') are discounted.
_KEYWORD_WEIGHTS = {
    'and': 0.5,
    'def': 2,
    'elif': 2,
    'exec': 2,
    'is': 0.5,
    'lambda': 2,
}

# Warning key and message text emitted for suspected commented-out code.
_MESSAGE_KEY = 'coc'
_MESSAGE_TEXT = 'Commented out code'
# Minimum average token score for a comment run to be reported.
_THRESHOLD = 0.75

# PEP263, see http://legacy.python.org/dev/peps/pep-0263/
# Matches encoding declarations like '# -*- coding: utf-8 -*-', which must
# never be flagged as commented-out code.
_ENCODING_REGEXP = re.compile(r'coding[:=]\s*([-\w.]+)')


def _make_tokens(string):
readline = StringIO(string).readline
return tokenize.generate_tokens(readline)


def _calculate_token_scores(text):
    """Score *text* for how code-like it is.

    Returns a (total_weight, token_count) pair, or (None, None) when the
    text should not be scored at all: either it is a PEP 263 encoding
    declaration, or it cannot be tokenized.
    """
    # First, let's ignore any "special" comments
    if _ENCODING_REGEXP.search(text):
        return None, None

    # We won't be able to tokenize every line properly, and if we can't,
    # that likely means that the line is not real code.
    try:
        tokens = list(_make_tokens(text))
    except TokenError:
        return None, None

    # token.NAME covers both identifiers and keywords.  Ordinary prose is
    # mostly identifiers, so plain names contribute nothing; keywords and
    # every non-NAME token count as evidence that the text is code.
    total = 0.0
    for tok in tokens:
        tok_type, tok_string = tok[0], tok[1]
        if tok_type == tokenize.NAME:
            if keyword.iskeyword(tok_string):
                total += _TOKEN_WEIGHTS.get(tok_type, 1) * \
                    _KEYWORD_WEIGHTS.get(tok_string, 1)
            # non-keyword names score zero
        else:
            total += _TOKEN_WEIGHTS.get(tok_type, 1)

    return total, len(tokens)


def calculate_line_scores(text):
    """Tokenize *text* and return a {line_number: score} dict for comments.

    The score for each line is the average token weight of the comment's
    contents; higher values mean the comment is more likely to be
    commented-out code.  Comments that cannot be scored (encoding
    declarations, untokenizable text) are omitted from the result.
    """
    lines = defaultdict(lambda: [0.0, 0])
    # first parse the contents and get the comments
    for token in _make_tokens(text):
        if token[0] != tokenize.COMMENT:
            continue
        # Strip the leading '#' and surrounding whitespace so the rest of
        # the comment can be tokenized as if it were plain code.  (Use a
        # raw string for the pattern and a fresh name rather than
        # clobbering the 'text' parameter mid-iteration.)
        comment_text = re.sub(r'^\s*#\s*', '', token[1])
        score, token_count = _calculate_token_scores(comment_text)
        if score is None:
            continue
        start_line = token[2][0]
        lines[start_line][0] += score
        lines[start_line][1] += token_count

    # Average the accumulated weight per line.  token_count is at least 1
    # for any successfully tokenized text (tokenize always emits an
    # ENDMARKER), so no division by zero here.
    return {lno: w[0] / w[1] for lno, w in lines.items()}


def check_file(filepath, contents):
    """Check a Python file's comments for commented-out code.

    Returns a list of (line_number, message_key, message_text) tuples,
    one per comment run whose average score reaches _THRESHOLD.  Files
    without a '.py' extension are skipped entirely.
    """
    if not filepath.endswith('.py'):
        return []
    scores = calculate_line_scores(contents)

    # Combine adjacent lines - multiple lines containing probable code
    # increase the likelihood we have a correct match.  Each entry maps
    # the first line of a run to the list of scores in that run.
    combined = {}
    for lno in sorted(scores):
        if lno - 1 in combined:
            combined[lno - 1].append(scores[lno])
        else:
            combined[lno] = [scores[lno]]

    # Compare the *average* score of each run against the threshold;
    # the scores are floats, so this division is exact enough for the
    # 0..~2 range the weights produce.
    return [(lno, _MESSAGE_KEY, _MESSAGE_TEXT)
            for lno, weights in combined.items()
            if sum(weights) / len(weights) >= _THRESHOLD]
10 changes: 2 additions & 8 deletions dodgy/checks.py → dodgy/passwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,9 @@ def check_line(line, check_list):
return messages


def check_file(filepath):
with open(filepath) as to_check:
return check_file_contents(to_check.read())


def check_file_contents(file_contents):
def check_file(_, contents):
messages = []

for line_number0, line in enumerate(file_contents.split('\n')):
for line_number0, line in enumerate(contents.split('\n')):
for check_list in (STRING_VALS, LINE_VALS, VAR_NAMES):
messages += [(line_number0+1, key, msg) for key, msg in check_line(line, check_list)]

Expand Down
26 changes: 18 additions & 8 deletions dodgy/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import mimetypes
import json
from dodgy.checks import check_file
from dodgy import passwords, coc


IGNORE_PATHS = [re.compile(patt % {'sep': os.path.sep}) for patt in (
Expand All @@ -28,6 +28,8 @@ def run_checks(directory, ignore_paths=None):
ignore_paths = [re.compile(patt) for patt in ignore_paths]
ignore_paths += IGNORE_PATHS

checks = [passwords, coc]

filepaths = list_files(directory)
for filepath in filepaths:
relpath = os.path.relpath(filepath, directory)
Expand All @@ -39,13 +41,21 @@ def run_checks(directory, ignore_paths=None):
if mimetype[0] is None or not mimetype[0].startswith('text/'):
continue

for msg_parts in check_file(filepath):
warnings.append({
'path': relpath,
'line': msg_parts[0],
'code': msg_parts[1],
'message': msg_parts[2]
})
# Read the file once; every check receives the same contents.  Using a
# context manager guarantees the handle is closed even if a check raises.
with open(filepath) as file_handle:
    contents = file_handle.read()

for check in checks:
    for msg_parts in check.check_file(filepath, contents):
        warnings.append({
            'path': relpath,
            'line': msg_parts[0],
            'code': msg_parts[1],
            'message': msg_parts[2],
        })

# Sort warnings deterministically: by path, then by line number.
# (list.sort(cmp=...) requires a three-way negative/zero/positive
# comparator; a comparator returning a boolean, as the previous _sort
# did, produces an incorrect ordering.  A key function is both correct
# and faster.)
warnings.sort(key=lambda warning: (warning['path'], warning['line']))

return warnings

Expand Down
5 changes: 3 additions & 2 deletions tests/test_checks.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import os
from unittest import TestCase
from dodgy.checks import check_file
from dodgy.passwords import check_file


class TestChecks(TestCase):

def _run_checks(self, file_name):
    """Run the password checks over *file_name* from the testdata dir.

    The file is read through a context manager so the handle is closed
    deterministically (the previous open(...).read() leaked it).
    """
    filepath = os.path.join(os.path.dirname(__file__), 'testdata', file_name)
    with open(filepath) as test_file:
        contents = test_file.read()
    return check_file(filepath, contents)

def _check_messages(self, messages, expected_keys):
if expected_keys == (None,):
Expand Down
28 changes: 28 additions & 0 deletions tests/test_coc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from unittest import TestCase
from dodgy.coc import calculate_line_scores
import os
import re

class TestCOC(TestCase):
    """Exercises calculate_line_scores against annotated fixture files.

    Each file under testdata/coc begins with a marker comment of the form
    '# --CODE: 6,7' listing the line numbers expected to be flagged as
    commented-out code.
    """

    def test_coc(self):
        testdir = os.path.join(os.path.dirname(__file__), 'testdata', 'coc')
        testdir = os.path.abspath(testdir)
        for filename in os.listdir(testdir):
            filepath = os.path.join(testdir, filename)
            # Context manager closes the fixture file (open().readlines()
            # leaked the handle).
            with open(filepath) as test_file:
                contents = test_file.readlines()
            expected = re.match(r'^# --CODE:(.*)$', contents[0]).group(1)
            expected = set(map(int, expected.split(',')))
            results = calculate_line_scores(''.join(contents))

            found = set(results.keys())
            # We ignore line 1, since it's the list of lines to expect.
            # discard() (unlike remove()) is a no-op if line 1 was never
            # flagged, so a clean result doesn't raise KeyError.
            found.discard(1)

            not_found = expected - found
            not_expected = found - expected

            if len(not_found) > 0:
                self.fail("The following lines should have been marked as code in file %s: %s" % (filename, not_found))
            if len(not_expected) > 0:
                self.fail("The following lines should not have been marked in file %s: %s" % (filename, not_expected))
8 changes: 8 additions & 0 deletions tests/testdata/coc/module1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# --CODE: 6

def do_some_stuff(x):
x += 5
y = 2 * x
# y = 3 * x
print y
return y

0 comments on commit 8e56df9

Please sign in to comment.