Use ASAN to ensure there are no memory leaks when running the entire html5lib test suite

Since the normal tests go through Python, it was not possible to check for leaks, as Python leaks on exit. So create a small C program to run the checks.
kovidgoyal · Jul 28, 2017 · 551729e · 551729e
1 parent b28e922
commit 551729e
Show file tree

Hide file tree

Showing 5 changed files with 77 additions and 27 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -42,3 +42,4 @@ install:
 
 script:
     - python $BUILDER test
+    - if [[ $BUILDER == "build.py" ]]; then python $BUILDER leak; fi
diff --git a/build.py b/build.py
@@ -285,9 +285,9 @@ def main():
             TEST_EXE, TEST_EXE, '-c', 'from html5_parser import *; ' + args.rest[0], *args.rest[1:])
     elif args.action == 'leak':
         build(args, build_leak_check=True)
-        p = subprocess.Popen([MEMLEAK_EXE], stdin=subprocess.PIPE)
-        p.communicate(('<p class="one">two<span>three</span>four' * 10).encode('utf-8'))
-        raise SystemExit(p.wait())
+        os.environ['MEMLEAK_EXE'] = os.path.abspath(MEMLEAK_EXE)
+        os.environ['ASAN_OPTIONS'] = 'leak_check_at_exit=0'
+        os.execlp(TEST_EXE, TEST_EXE, 'run_tests.py')
 
 
 if __name__ == '__main__':

diff --git a/mem-leak-check.c b/mem-leak-check.c
@@ -8,6 +8,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <stdio.h>
+#include <libxml/xmlmemory.h>
 #include "src/as-libxml.h"
 
 static inline libxml_doc*
@@ -36,13 +37,15 @@ int main(int UNUSED argc, char UNUSED **argv) {
     Options opts = {0};
     opts.gumbo_opts = kGumboDefaultOptions;
     opts.stack_size = 16 * 1024;
-    opts.gumbo_opts = kGumboDefaultOptions;
     opts.gumbo_opts.max_errors = 0;  
-    opts.namespace_elements = 1;
     opts.keep_doctype = 1;
+    xmlInitParser();
+    ssize_t sz = read(STDIN_FILENO, buf, (sizeof(buf) / sizeof(buf[0])) - 1);
+    parse_with_options(buf, (size_t)sz, &opts);
+    opts.namespace_elements = 1;
     opts.sanitize_names = 1;
     opts.gumbo_opts.use_xhtml_rules = 1;
-    ssize_t sz = read(STDIN_FILENO, buf, (sizeof(buf) / sizeof(buf[0])) - 1);
     parse_with_options(buf, (size_t)sz, &opts);
+    xmlCleanupParser();
     return 0;
 }
diff --git a/run_tests.py b/run_tests.py
@@ -75,8 +75,21 @@ def find_tests():
     return unittest.TestSuite(suites)
 
 
+def run_memleak_tests():
+    tests = find_tests()
+
+    tests = filter_tests_by_name(tests, 'asan_memleak')
+    r = unittest.TextTestRunner
+    result = r(verbosity=4).run(tests)
+
+    if not result.wasSuccessful():
+        raise SystemExit(1)
+
+
 def main():
     sys.path.insert(0, base)
+    if 'MEMLEAK_EXE' in os.environ:
+        return run_memleak_tests()
     parser = argparse.ArgumentParser(
         description='''\
 Run the specified tests, or all tests if none are specified. Tests

diff --git a/test/html5lib_adapter.py b/test/html5lib_adapter.py
@@ -7,6 +7,7 @@
 import codecs
 import os
 import re
+import subprocess
 import unittest
 
 from html5_parser import check_bom, check_for_meta_charset, parse
@@ -55,6 +56,7 @@ def is_section_heading(self, line):
             return False
 
     def normalize(self, data):
+
         def n(x):
             if x.endswith('\n'):
                 x = x[:-1]
@@ -133,16 +135,21 @@ def serialize_node(node, level=1):
 
 class BaseTest(TestCase):
 
+    @classmethod
+    def data_for_test(cls, test, expected='document'):
+        return test.get('document-fragment'), test.get('data'), test.get(expected), test.get(
+            'errors', '').split('\n')
+
     @classmethod
     def add_single(cls, test_name, num, test, expected):
+        inner_html, html, expected, errors = cls.data_for_test(test, expected)
 
         def test_func(
-            self,
-            inner_html=test.get('document-fragment'),
-            html=test.get('data'),
-            expected=test.get(expected),
-            errors=test.get('errors', '').split('\n')
-        ):
+                self,
+                inner_html=inner_html,
+                html=html,
+                expected=expected,
+                errors=errors):
             return self.implementation(inner_html, html, expected, errors, test_name)
 
         test_func.__name__ = str('test_%s_%d' % (test_name, num))
@@ -151,28 +158,34 @@ def test_func(
 
 class ConstructionTests(BaseTest):
 
-    def implementation(self, inner_html, html, expected, errors, test_name):
-        html = inner_html or html
+    @classmethod
+    def check_test(cls, inner_html, html, expected, errors, test_name):
         if test_name == 'isindex' or html == '<!doctype html><isindex type="hidden">':
-            raise unittest.SkipTest('gumbo and html5lib differ on <isindex> parsing'
-                                    ' and I cannot be bothered to figure out who is right')
+            return (
+                'gumbo and html5lib differ on <isindex> parsing'
+                ' and I cannot be bothered to figure out who is right')
         if test_name == 'menuitem-element':
-            raise unittest.SkipTest('gumbo and html5lib differ on <menuitem> parsing'
-                                    ' and I cannot be bothered to figure out who is right')
+            return (
+                'gumbo and html5lib differ on <menuitem> parsing'
+                ' and I cannot be bothered to figure out who is right')
         noscript = re.search(r'^\| +<noscript>$', expected, flags=re.MULTILINE)
         if noscript is not None:
-            raise unittest.SkipTest('<noscript> is always parsed with scripting off by gumbo')
+            return '<noscript> is always parsed with scripting off by gumbo'
         if '<thisisasillytestelementnametomakesurecrazytagnamesareparsedcorrectly>' in expected:
-            raise unittest.SkipTest('gumbo unlike html5lib, does not lowercase unknown tag names')
+            return 'gumbo unlike html5lib, does not lowercase unknown tag names'
         for line in errors:
             if 'expected-doctype-name-but' in line or 'unknown-doctype' in line:
-                raise unittest.SkipTest('gumbo auto-corrects malformed doctypes')
-
+                return 'gumbo auto-corrects malformed doctypes'
         if inner_html:
-            raise unittest.SkipTest('TODO: Implement fragment parsing')
-        else:
-            root = parse(html, namespace_elements=True, sanitize_names=False)
+            return 'TODO: Implement fragment parsing'
+
+    def implementation(self, inner_html, html, expected, errors, test_name):
+        html = inner_html or html
+        bad = self.check_test(inner_html, html, expected, errors, test_name)
+        if bad is not None:
+            raise unittest.SkipTest(bad)
 
+        root = parse(html, namespace_elements=True, sanitize_names=False)
         output = serialize_construction_output(root)
 
         # html5lib doesn't yet support the template tag, but it appears in the
@@ -196,8 +209,8 @@ def implementation(self, inner_html, html, expected, errors, test_name):
             raise unittest.SkipTest('buggy html5lib test')
         raw = html.encode('utf-8')
         output = check_bom(raw) or check_for_meta_charset(raw) or 'windows-1252'
-        error_msg = '\n'.join(map(type(''), [
-            '\n\nInput:', html, '\nExpected:', expected, '\nReceived:', output]))
+        error_msg = '\n'.join(
+            map(type(''), ['\n\nInput:', html, '\nExpected:', expected, '\nReceived:', output]))
         self.ae(expected.lower(), output, error_msg + '\n')
 
 
@@ -217,6 +230,26 @@ def load_suite(group, case_class, expected='document', data_class=TestData):
     return unittest.defaultTestLoader.loadTestsFromTestCase(case_class)
 
 
+class MemLeak(BaseTest):
+
+    @unittest.skipUnless('MEMLEAK_EXE' in os.environ, 'memleak check exe not available')
+    def test_asan_memleak(self):
+        MEMLEAK_EXE = os.environ['MEMLEAK_EXE']
+        env = os.environ.copy()
+        env.pop('ASAN_OPTIONS', None)
+        for path in html5lib_test_files('tree-construction'):
+            test_name = os.path.basename(path).rpartition('.')[0]
+            for i, test in enumerate(TestData(path)):
+                inner_html, html, expected, errors = ConstructionTests.data_for_test(test)
+                bad = ConstructionTests.check_test(inner_html, html, expected, errors, test_name)
+                if bad is not None:
+                    continue
+                p = subprocess.Popen([MEMLEAK_EXE], stdin=subprocess.PIPE, env=env)
+                p.communicate(html.encode('utf-8'))
+                self.ae(p.wait(), 0, 'The test {}-{} failed'.format(test_name, i))
+
+
 def find_tests():
     yield load_suite('tree-construction', ConstructionTests)
     yield load_suite('encoding', EncodingTests, expected='encoding')
+    yield unittest.defaultTestLoader.loadTestsFromTestCase(MemLeak)