cov: segment tree for querying individual points

Get a list of matching code coverage elements for a given point.
haxscramper · Apr 22, 2024 · d981f4e · d981f4e
1 parent 46955d3
commit d981f4e
Show file tree

Hide file tree

Showing 7 changed files with 152 additions and 42 deletions.
diff --git a/scripts/cxx_codegen/profdata_merger/profdata_merger.cpp b/scripts/cxx_codegen/profdata_merger/profdata_merger.cpp
@@ -767,10 +767,10 @@ struct queries {
                 "CovSegment",
                 {
                     "Id",           // 1
-                    "StartLine",    // 2
-                    "StartCol",     // 3
-                    "EndLine",      // 4
-                    "EndCol",       // 5
+                    "LineStart",    // 2
+                    "ColStart",     // 3
+                    "LineEnd",      // 4
+                    "ColEnd",       // 5
                     "StartCount",   // 6
                     "EndCount",     // 7
                     "HasCount",     // 8

diff --git a/scripts/cxx_codegen/profdata_merger/profdata_merger.sql b/scripts/cxx_codegen/profdata_merger/profdata_merger.sql
@@ -92,10 +92,10 @@ CREATE TABLE "CovFunctionInstantiation" (
 
 CREATE TABLE "CovSegment" (
 	"Id" INTEGER NOT NULL, 
-	"StartLine" INTEGER NOT NULL, 
-	"StartCol" INTEGER NOT NULL, 
-	"EndLine" INTEGER NOT NULL, 
-	"EndCol" INTEGER NOT NULL, 
+	"LineStart" INTEGER NOT NULL, 
+	"ColStart" INTEGER NOT NULL, 
+	"LineEnd" INTEGER NOT NULL, 
+	"ColEnd" INTEGER NOT NULL, 
 	"StartCount" INTEGER NOT NULL, 
 	"EndCount" INTEGER NOT NULL, 
 	"HasCount" BOOLEAN NOT NULL, 

diff --git a/scripts/py_repository/py_repository/gen_coverage_cxx.py b/scripts/py_repository/py_repository/gen_coverage_cxx.py
@@ -1,16 +1,17 @@
 #!/usr/bin/env python
-from beartype.typing import Optional, Any, List, Tuple
+from beartype.typing import Optional, Any, List, Tuple, Iterable
 from pydantic import Field, BaseModel
 
-from sqlalchemy import create_engine, Column
+from sqlalchemy import create_engine, Column, select, Select
 from sqlalchemy import Enum as SqlEnum
 from sqlalchemy.schema import CreateTable
-from sqlalchemy.orm import declarative_base
+from sqlalchemy.orm import declarative_base, Session
 from py_scriptutils.sqlalchemy_utils import IdColumn, ForeignId, IntColumn, StrColumn, BoolColumn
 from py_scriptutils.repo_files import get_haxorg_repo_root_path
 from sqlalchemy.types import JSON
 import enum
 from beartype import beartype
+from pathlib import Path
 
 CoverageSchema = declarative_base()
 
@@ -100,10 +101,10 @@ class CovSegmentFlat(CoverageSchema):
 class CovSegment(CoverageSchema):
     __tablename__ = "CovSegment"
     Id = IdColumn()
-    StartLine = IntColumn()
-    StartCol = IntColumn()
-    EndLine = IntColumn()
-    EndCol = IntColumn()
+    LineStart = IntColumn()
+    ColStart = IntColumn()
+    LineEnd = IntColumn()
+    ColEnd = IntColumn()
     StartCount = IntColumn()
     EndCount = IntColumn()
     HasCount = BoolColumn()
@@ -113,6 +114,10 @@ class CovSegment(CoverageSchema):
     NestedIn = ForeignId("CovSegment.Id", nullable=True)
     IsLeaf = BoolColumn()
 
+    def intersects(self, line: int, col: int) -> bool:
+        """
+        Check whether the `(line, col)` point lies inside this segment.
+
+        The segment is treated as the span from `(LineStart, ColStart)` to
+        `(LineEnd, ColEnd)`, both ends inclusive. Positions are compared
+        lexicographically, so the column bounds only constrain the first
+        and the last line of a multi-line segment -- any column on the
+        lines in between is considered inside.
+        """
+        start = (self.LineStart, self.ColStart)
+        end = (self.LineEnd, self.ColEnd)
+        return start <= (line, col) <= end
+
 
 class CovInstantiationGroup(CoverageSchema):
     __tablename__ = "CovInstantiationGroup"
@@ -171,6 +176,75 @@ def extract_text(lines: List[str], start: Tuple[int, int], end: Tuple[int, int])
         return "\n".join(extracted_lines)
 
 
+@beartype
+class CoverageSegmentTree:
+    """
+    Balanced binary tree over coverage segments used for point queries.
+
+    Segments are sorted by `(LineStart, ColStart)`. Every leaf holds exactly
+    one segment, every inner node covers a contiguous index range of the
+    sorted list and remembers the range of source lines spanned by the
+    segments in that range, which lets `query` skip whole subtrees.
+    """
+
+    def __init__(self, segments: Iterable[CovSegment]):
+        """Sort `segments` and build the tree; `root` stays `None` if empty."""
+        self.root = None
+        self.segments = sorted(segments, key=lambda x: (x.LineStart, x.ColStart))
+        if self.segments:
+            self.root = self.build_tree(0, len(self.segments) - 1)
+
+    @beartype
+    class Node:
+        """Tree node covering `segments[start:end + 1]` of the sorted list."""
+
+        def __init__(self, start: int, end: int, segments: Iterable[CovSegment]):
+            # `start` and `end` are inclusive indices into the sorted segment
+            # list of the owning tree -- they are NOT source line numbers.
+            self.start = start
+            self.end = end
+            self.segments = list(segments)
+            # Source line bounds of the segments in this node. The defaults
+            # describe an empty range, so a node without segments is always
+            # pruned by `query`.
+            self.line_min = min((it.LineStart for it in self.segments), default=0)
+            self.line_max = max((it.LineEnd for it in self.segments), default=-1)
+            self.left: Optional['CoverageSegmentTree.Node'] = None
+            self.right: Optional['CoverageSegmentTree.Node'] = None
+
+    def build_tree(self, start: int, end: int) -> Optional[Node]:
+        """
+        Recursively build the subtree for the inclusive index range
+        `[start, end]` of the sorted segment list.
+
+        Returns `None` for an empty range (`start > end`).
+        """
+        if start > end:
+            return None
+        if start == end:
+            return self.Node(start, end, [self.segments[start]])
+
+        mid = (start + end) // 2
+        node = self.Node(start, end, self.segments[start:end + 1])
+        node.left = self.build_tree(start, mid)
+        node.right = self.build_tree(mid + 1, end)
+        return node
+
+    def query(self,
+              line: int,
+              col: int,
+              node: Optional[Node] = None) -> Iterable[CovSegment]:
+        """
+        Return every segment that intersects the `(line, col)` point.
+
+        The search starts from `node`, or from the tree root when `node` is
+        `None`. Each matching segment is reported once, in the sorted
+        `(LineStart, ColStart)` order. An empty tree yields an empty list.
+        """
+        if node is None:
+            node = self.root
+        if node is None:
+            return []
+
+        # No segment below this node touches the requested line.
+        if line < node.line_min or node.line_max < line:
+            return []
+
+        # Only leaves are tested against the point: inner nodes repeat the
+        # segments of their children and would produce duplicate matches.
+        if node.left is None and node.right is None:
+            return [seg for seg in node.segments if seg.intersects(line, col)]
+
+        result = []
+        for child in (node.left, node.right):
+            # Passing `None` would restart the search from the root.
+            if child is not None:
+                result.extend(self.query(line, col, child))
+
+        return result
+
+
+@beartype
+def get_coverage_of(session: Session, path: Path) -> Select[Tuple[CovSegment]]:
+    """
+    Build a query selecting the coverage segments recorded for `path`.
+
+    The file is looked up in `CovFile` by the string form of `path`. The
+    returned statement is not executed here.
+
+    Raises `ValueError` unless exactly one `CovFile` row matches the path.
+    """
+    file_query = select(CovFile).where(CovFile.Path == str(path))
+    matches = session.execute(file_query).fetchall()
+    match_count = len(matches)
+
+    if match_count != 1:
+        raise ValueError(
+            f"{match_count} files matched for given path '{path}', expected exactly one match"
+        )
+
+    # Each result row wraps the selected `CovFile` entity in its first slot.
+    matched_file = matches[0][0]
+    return select(CovSegment).where(CovSegment.File == matched_file.Id)
+
+
 if __name__ == "__main__":
     sql_url = "sqlite:///:memory:"
     db_engine = create_engine(sql_url)

diff --git a/scripts/py_scriptutils/py_scriptutils/json_utils.py b/scripts/py_scriptutils/py_scriptutils/json_utils.py
@@ -147,7 +147,8 @@ def get_path(value: Json) -> Json:
 def assert_subset(main: Json, subset: Json, message: Optional[str] = None):
     diff = get_subset_diff(main_set=main, expected_subset=subset)
 
-    compare = "\n".join([
+    compare = "Could not find expected subset of values in the main set\n\n"
+    compare += "\n".join([
         "[{}]{}".format(
             idx,
             describe_diff(

diff --git a/scripts/py_scriptutils/py_scriptutils/sqlalchemy_utils.py b/scripts/py_scriptutils/py_scriptutils/sqlalchemy_utils.py
@@ -87,10 +87,13 @@ def format_rich_table(engine: Engine,
 
 @beartype
 def format_rich_query(
-    engine: Engine,
+    engine: Union[Engine, Session],
     query: Executable,
     column_labels: List[str] = [],
 ) -> Table:
+
+    if isinstance(engine, Session):
+        engine = engine.get_bind()
 
     rich_table = Table(show_header=True, header_style="bold blue")
     with engine.connect() as connection:

diff --git a/...overage_corpus/test_file_segmentation.cpp → ...erage_corpus/test_file_segmentation_1.cpp b/...overage_corpus/test_file_segmentation.cpp → ...erage_corpus/test_file_segmentation_1.cpp
diff --git a/tests/python/repo/test_code_coverage.py b/tests/python/repo/test_code_coverage.py
@@ -292,8 +292,8 @@ def test_file_coverage_filter():
 
         assert len(df) == 2
         assert_frame(df, [
-            dict(StartLine=1, EndLine=1, Path="file1.cpp"),
-            dict(StartLine=5, EndLine=8, Path="main.cpp"),
+            dict(LineStart=1, LineEnd=1, Path="file1.cpp"),
+            dict(LineStart=5, LineEnd=8, Path="main.cpp"),
         ])
 
 
@@ -302,7 +302,47 @@ def cleanup_test_code(code: str) -> str:
     return re.sub(r"\s+", " ", code.replace("\n", " "))
 
 
-def test_file_segmentation():
+@beartype
+def add_cov_segment_text(df: pd.DataFrame, lines: List[str]):
+    """
+    Add a `Text` column holding the source text covered by each segment row.
+
+    The text is cut out of `lines` using the row's `LineStart`/`ColStart`
+    and `LineEnd`/`ColEnd` values and then passed through
+    `cleanup_test_code`. The data frame is modified in place.
+    """
+
+    def segment_text(row) -> str:
+        raw_text = cov.extract_text(
+            lines,
+            start=(row["LineStart"], row["ColStart"]),
+            end=(row["LineEnd"], row["ColEnd"]),
+        )
+        return cleanup_test_code(raw_text)
+
+    df["Text"] = df.apply(segment_text, axis=1)
+
+
+def test_file_segmentation_1():
+    """
+    Collect coverage for `test_file_segmentation_1.cpp` and check that point
+    queries and the segment table agree on the segment starting on line 1.
+    """
+    with TemporaryDirectory() as tmp:
+        out_dir = Path(tmp)
+        # NOTE(review): the temporary directory is immediately replaced by
+        # a fixed location -- presumably to keep the artifacts around for
+        # debugging; confirm whether this override is still wanted.
+        out_dir = Path("/tmp/test_base_run_coverage")
+        source = corpus_base.joinpath("test_file_segmentation_1.cpp").read_text()
+        run = ProfileRunParams(dir=out_dir, main="main.cpp", files={"main.cpp": source})
+        run.run()
+
+        session = open_sqlite_session(run.get_sqlite(), cov.CoverageSchema)
+        main_query = cov.get_coverage_of(session, run.get_code("main.cpp"))
+        source_lines = source.split("\n")
+
+        # Every result row carries the selected segment in its first slot.
+        tree = cov.CoverageSegmentTree(row[0] for row in session.execute(main_query))
+        frame = pd.read_sql(main_query, session.get_bind())
+        add_cov_segment_text(frame, source_lines)
+
+        # Coverage segments only overlay executable blocks and do not
+        # account for extraneous elements such as function headers etc.
+        assert tree.query(line=1, col=15)
+        assert not tree.query(line=1, col=14)
+        first_line_rows = frame[frame["LineStart"] == 1]
+        assert_frame(first_line_rows, [
+            dict(IsLeaf=True, Text="{}", ColStart=15, ColEnd=17),
+        ])
+
+
+def test_file_segmentation_2():
     with TemporaryDirectory() as tmp:
         dir = Path(tmp)
         dir = Path("/tmp/test_base_run_coverage")
@@ -317,15 +357,7 @@ def test_file_segmentation():
         lines = code.split("\n")
 
         df = pd.read_sql(select(cov.CovSegment), session.get_bind())
-        df["Text"] = df.apply(
-            lambda row: cleanup_test_code(
-                cov.extract_text(
-                    lines,
-                    start=(row["StartLine"], row["StartCol"]),
-                    end=(row["EndLine"], row["EndCol"]),
-                )),
-            axis=1,
-        )
+        add_cov_segment_text(df, lines)
 
         table = dataframe_to_rich_table(df)
         table.show_lines = True
@@ -339,42 +371,42 @@ def test_file_segmentation():
 
         assert_frame(df, [
             dict(
-                StartLine=1,
-                EndLine=1,
+                LineStart=1,
+                LineEnd=1,
                 SegmentIndex=0,
                 Text="{}",
                 IsLeaf=True,
             ),
             dict(
-                StartLine=3,
-                EndLine=5,
+                LineStart=3,
+                LineEnd=5,
                 SegmentIndex=1,
                 Id=2,
                 Text="{ if (true || false) { action(); } }",
                 IsLeaf=False,
             ),
             dict(
-                StartLine=4,
-                EndLine=4,
+                LineStart=4,
+                LineEnd=4,
                 SegmentIndex=2,
                 Text="true",
-                StartCol=9,
-                EndCol=13,
+                ColStart=9,
+                ColEnd=13,
                 NestedIn=2,
                 IsLeaf=True,
             ),
             dict(
-                StartLine=4,
-                EndLine=4,
+                LineStart=4,
+                LineEnd=4,
                 SegmentIndex=3,
                 Text="false",
-                StartCol=17,
-                EndCol=22,
+                ColStart=17,
+                ColEnd=22,
                 NestedIn=2,
                 IsLeaf=True,
             ),
             dict(
-                StartLine=4,
+                LineStart=4,
                 SegmentIndex=4,
                 Text="{ action(); }",
                 NestedIn=2,