Merge pull request #8492 from rtibbles/content_metadata_optimizations

Content metadata optimizations
learningequality · Oct 29, 2021 · 50f8b24 · 50f8b24
2 parents 48075ed + e2088c4
commit 50f8b24
Show file tree

Hide file tree

Showing 16 changed files with 380 additions and 163 deletions.
diff --git a/kolibri/core/content/api.py b/kolibri/core/content/api.py
@@ -175,12 +175,12 @@ class ContentNodeFilter(IdFilter):
     parent__isnull = BooleanFilter(field_name="parent", lookup_expr="isnull")
     include_coach_content = BooleanFilter(method="filter_include_coach_content")
     contains_quiz = CharFilter(method="filter_contains_quiz")
-    grade_levels = CharFilter(method="filter_contains_and")
-    resource_types = CharFilter(method="filter_contains_and")
-    learning_activities = CharFilter(method="filter_contains_and")
-    accessibility_labels = CharFilter(method="filter_contains_and")
-    categories = CharFilter(method="filter_contains_and")
-    learner_needs = CharFilter(method="filter_contains_and")
+    grade_levels = CharFilter(method="bitmask_contains_and")
+    resource_types = CharFilter(method="bitmask_contains_and")
+    learning_activities = CharFilter(method="bitmask_contains_and")
+    accessibility_labels = CharFilter(method="bitmask_contains_and")
+    categories = CharFilter(method="bitmask_contains_and")
+    learner_needs = CharFilter(method="bitmask_contains_and")
     keywords = CharFilter(method="filter_keywords")
     channels = UUIDInFilter(name="channel_id")
     languages = CharInFilter(name="lang_id")
@@ -269,11 +269,8 @@ def filter_keywords(self, queryset, name, value):
 
         return queryset.filter(query)
 
-    def filter_contains_and(self, queryset, name, value):
-        values = value.split(",")
-        return queryset.filter(
-            intersection([Q(**{name + "__contains": v}) for v in values])
-        )
+    def bitmask_contains_and(self, queryset, name, value):
+        return queryset.has_all_labels(name, value.split(","))
 
 
 class OptionalPageNumberPagination(ValuesViewsetPageNumberPagination):
@@ -342,6 +339,7 @@ class BaseContentNodeMixin(object):
         "accessibility_labels",
         "categories",
         "duration",
+        "ancestors",
     )
 
     field_map = {
@@ -422,77 +420,24 @@ def get_related_data_maps(self, items, queryset):
 
         return assessmentmetadata_map, files_map, languages_map, tags_map
 
-    def process_items(self, items, queryset, ancestor_lookup_method=None):
-        output = []
-        assessmentmetadata, files_map, languages_map, tags = self.get_related_data_maps(
-            items, queryset
-        )
-        for item in items:
-            item["assessmentmetadata"] = assessmentmetadata.get(item["id"])
-            item["tags"] = tags.get(item["id"], [])
-            item["files"] = files_map.get(item["id"], [])
-            lang_id = item.pop("lang_id")
-            item["lang"] = languages_map.get(lang_id)
-            if ancestor_lookup_method:
-                item["ancestors"] = ancestor_lookup_method(item)
-            item["is_leaf"] = item.get("kind") != content_kinds.TOPIC
-            output.append(item)
-        return output
-
     def consolidate(self, items, queryset):
+        output = []
         if items:
-
-            # We need to batch our queries for ancestors as the size of the expression tree
-            # depends on the number of nodes that we are querying for.
-            # On Windows, the SQL parameter limit is 999, and an ancestors call can produce
-            # 3 parameters per node in the queryset, so this should max out the parameters at 750.
-            ANCESTOR_BATCH_SIZE = 250
-
-            if len(items) > ANCESTOR_BATCH_SIZE:
-
-                ancestors_map = {}
-
-                for i in range(0, len(items), ANCESTOR_BATCH_SIZE):
-
-                    for anc in (
-                        models.ContentNode.objects.filter(
-                            id__in=[
-                                item["id"]
-                                for item in items[i : i + ANCESTOR_BATCH_SIZE]
-                            ]
-                        )
-                        .get_ancestors()
-                        .values("id", "title", "lft", "rght", "tree_id")
-                    ):
-                        ancestors_map[anc["id"]] = anc
-
-                ancestors = sorted(ancestors_map.values(), key=lambda x: x["lft"])
-            else:
-                ancestors = list(
-                    queryset.get_ancestors()
-                    .values("id", "title", "lft", "rght", "tree_id")
-                    .order_by("lft")
-                )
-
-            def ancestor_lookup(item):
-                lft = item.get("lft")
-                rght = item.get("rght")
-                tree_id = item.get("tree_id")
-                return list(
-                    map(
-                        lambda x: {"id": x["id"], "title": x["title"]},
-                        filter(
-                            lambda x: x["lft"] < lft
-                            and x["rght"] > rght
-                            and x["tree_id"] == tree_id,
-                            ancestors,
-                        ),
-                    )
-                )
-
-            return self.process_items(items, queryset, ancestor_lookup)
-
-        return []
+            (
+                assessmentmetadata,
+                files_map,
+                languages_map,
+                tags,
+            ) = self.get_related_data_maps(items, queryset)
+            for item in items:
+                item["assessmentmetadata"] = assessmentmetadata.get(item["id"])
+                item["tags"] = tags.get(item["id"], [])
+                item["files"] = files_map.get(item["id"], [])
+                lang_id = item.pop("lang_id")
+                item["lang"] = languages_map.get(lang_id)
+                item["is_leaf"] = item.get("kind") != content_kinds.TOPIC
+                output.append(item)
+        return output
 
 
 class OptionalContentNodePagination(ValuesViewsetCursorPagination):
@@ -872,11 +817,6 @@ class ContentNodeTreeViewset(BaseContentNodeMixin, BaseValuesViewset):
     # should not raise it.
     page_size = 25
 
-    def consolidate(self, items, queryset):
-        if items:
-            return self.process_items(items, queryset)
-        return []
-
     def validate_and_return_params(self, request):
         depth = request.query_params.get("depth", 2)
         lft__gt = request.query_params.get("lft__gt")
@@ -974,10 +914,6 @@ def retrieve(self, request, **kwargs):
         # The serialized parent representation is the first node in the lft order
         parent = nodes[0]
 
-        # Do a query to get the parent's ancestors - for all other nodes, we can manually build
-        # the ancestors from the tree topology that we already know!
-        parent["ancestors"] = list(parent_model.get_ancestors().values("id", "title"))
-
         # Use this to keep track of direct children of the parent node
         # this will allow us to do lookups for the grandchildren, in order
         # to insert them into the "children" property
@@ -1013,10 +949,6 @@ def retrieve(self, request, **kwargs):
                 # If the parent of the descendant does not already have its `children` property
                 # initialized, do so here.
                 desc_parent["children"] = {"results": [], "more": None}
-            # The ancestors field for the descendant will be its parents ancestors, plus its parent!
-            desc["ancestors"] = desc_parent["ancestors"] + [
-                {"id": desc_parent["id"], "title": desc_parent["title"]}
-            ]
             # Add this descendant to the results for the children pagination object
             desc_parent["children"]["results"].append(desc)
             # Only bother updating the URL for more if we have hit the page size limit

diff --git a/kolibri/core/content/contentschema/versions/content_schema_current.py b/kolibri/core/content/contentschema/versions/content_schema_current.py
@@ -46,16 +46,16 @@ class ContentContentnode(Base):
     __tablename__ = "content_contentnode"
     __table_args__ = (
         Index(
-            "content_contentnode_level_channel_id_available_29f0bb18_idx",
+            "content_contentnode_level_channel_id_kind_fd732cc4_idx",
             "level",
             "channel_id",
-            "available",
+            "kind",
         ),
         Index(
-            "content_contentnode_level_channel_id_kind_fd732cc4_idx",
+            "content_contentnode_level_channel_id_available_29f0bb18_idx",
             "level",
             "channel_id",
-            "kind",
+            "available",
         ),
     )
 
@@ -67,6 +67,7 @@ class ContentContentnode(Base):
     sort_order = Column(Float)
     license_owner = Column(String(200), nullable=False)
     author = Column(String(200), nullable=False)
+    kind = Column(String(200), nullable=False)
     available = Column(Boolean, nullable=False)
     lft = Column(Integer, nullable=False, index=True)
     rght = Column(Integer, nullable=False, index=True)
@@ -86,7 +87,12 @@ class ContentContentnode(Base):
     learner_needs = Column(Text)
     learning_activities = Column(Text)
     resource_types = Column(Text)
-    kind = Column(String(200), nullable=False)
+    accessibility_labels_bitmask_0 = Column(BigInteger)
+    categories_bitmask_0 = Column(BigInteger)
+    grade_levels_bitmask_0 = Column(BigInteger)
+    learner_needs_bitmask_0 = Column(BigInteger)
+    learning_activities_bitmask_0 = Column(BigInteger)
+    ancestors = Column(Text)
     parent_id = Column(ForeignKey("content_contentnode.id"), index=True)
 
     lang = relationship("ContentLanguage")

diff --git a/kolibri/core/content/migrations/0029_metadata_bitmasks.py b/kolibri/core/content/migrations/0029_metadata_bitmasks.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11.29 on 2021-10-11 20:40
+from __future__ import unicode_literals
+
+from django.db import migrations
+from django.db import models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("content", "0028_contentnode_metadata_labels"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="contentnode",
+            name="accessibility_labels_bitmask_0",
+            field=models.BigIntegerField(blank=True, default=0, null=True),
+        ),
+        migrations.AddField(
+            model_name="contentnode",
+            name="categories_bitmask_0",
+            field=models.BigIntegerField(blank=True, default=0, null=True),
+        ),
+        migrations.AddField(
+            model_name="contentnode",
+            name="grade_levels_bitmask_0",
+            field=models.BigIntegerField(blank=True, default=0, null=True),
+        ),
+        migrations.AddField(
+            model_name="contentnode",
+            name="learner_needs_bitmask_0",
+            field=models.BigIntegerField(blank=True, default=0, null=True),
+        ),
+        migrations.AddField(
+            model_name="contentnode",
+            name="learning_activities_bitmask_0",
+            field=models.BigIntegerField(blank=True, default=0, null=True),
+        ),
+    ]
diff --git a/kolibri/core/content/migrations/0030_contentnode_ancestors.py b/kolibri/core/content/migrations/0030_contentnode_ancestors.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11.29 on 2021-10-11 20:51
+from __future__ import unicode_literals
+
+from django.db import migrations
+
+import kolibri.core.fields
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("content", "0029_metadata_bitmasks"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="contentnode",
+            name="ancestors",
+            field=kolibri.core.fields.JSONField(blank=True, default=[], null=True),
+        ),
+    ]
diff --git a/kolibri/core/content/models.py b/kolibri/core/content/models.py
@@ -27,6 +27,7 @@
 
 from django.db import connection
 from django.db import models
+from django.db.models import F
 from django.db.models import Min
 from django.db.models import Q
 from django.db.models import QuerySet
@@ -39,7 +40,10 @@
 from .utils import paths
 from kolibri.core.content import base_models
 from kolibri.core.content.errors import InvalidStorageFilenameError
+from kolibri.core.content.utils.search import bitmask_fieldnames
+from kolibri.core.content.utils.search import metadata_bitmasks
 from kolibri.core.device.models import ContentCacheKey
+from kolibri.core.fields import JSONField
 from kolibri.core.mixins import FilterByUUIDQuerysetMixin
 
 PRESET_LOOKUP = dict(format_presets.choices)
@@ -82,6 +86,25 @@ def filter_by_content_ids(self, content_ids, validate=True):
     def exclude_by_content_ids(self, content_ids, validate=True):
         return self._by_uuids(content_ids, validate, "content_id", False)
 
+    def has_all_labels(self, field_name, labels):
+        bitmasks = metadata_bitmasks[field_name]
+        bits = {}
+        for label in labels:
+            if label in bitmasks:
+                bitmask_fieldname = bitmasks[label]["bitmask_field_name"]
+                if bitmask_fieldname not in bits:
+                    bits[bitmask_fieldname] = 0
+                bits[bitmask_fieldname] += bitmasks[label]["bits"]
+
+        filters = {}
+        annotations = {}
+        for bitmask_fieldname, bits in bits.items():
+            annotation_fieldname = "{}_{}".format(bitmask_fieldname, "masked")
+            filters[annotation_fieldname + "__gt"] = 0
+            annotations[annotation_fieldname] = F(bitmask_fieldname).bitand(bits)
+
+        return self.annotate(**annotations).filter(**filters)
+
 
 class ContentNodeManager(
     models.Manager.from_queryset(ContentNodeQueryset), TreeManager
@@ -162,6 +185,10 @@ class ContentNode(base_models.ContentNode):
     # then it is 1 or 0 depending on availability
     on_device_resources = models.IntegerField(default=0, null=True, blank=True)
 
+    # Use this to annotate ancestor information directly onto the ContentNode, as it can be a
+    # costly lookup
+    ancestors = JSONField(default=[], null=True, blank=True)
+
     objects = ContentNodeManager()
 
     class Meta:
@@ -186,6 +213,11 @@ def get_descendant_content_ids(self):
         )
 
 
+for field_name in bitmask_fieldnames:
+    field = models.BigIntegerField(default=0, null=True, blank=True)
+    field.contribute_to_class(ContentNode, field_name)
+
+
 @python_2_unicode_compatible
 class Language(base_models.Language):
     def __str__(self):

diff --git a/kolibri/core/content/test/test_annotation.py b/kolibri/core/content/test/test_annotation.py
@@ -14,12 +14,14 @@
 from kolibri.core.content.models import File
 from kolibri.core.content.models import Language
 from kolibri.core.content.models import LocalFile
+from kolibri.core.content.test.test_channel_upgrade import ChannelBuilder
 from kolibri.core.content.utils.annotation import calculate_included_languages
 from kolibri.core.content.utils.annotation import calculate_published_size
 from kolibri.core.content.utils.annotation import calculate_total_resource_count
 from kolibri.core.content.utils.annotation import mark_local_files_as_available
 from kolibri.core.content.utils.annotation import mark_local_files_as_unavailable
 from kolibri.core.content.utils.annotation import recurse_annotation_up_tree
+from kolibri.core.content.utils.annotation import set_channel_ancestors
 from kolibri.core.content.utils.annotation import set_channel_metadata_fields
 from kolibri.core.content.utils.annotation import (
     set_leaf_node_availability_from_local_file_availability,
@@ -906,3 +908,14 @@ def test_public(self):
 
         self.channel.refresh_from_db()
         self.assertTrue(self.channel.public)
+
+
+@patch("kolibri.core.content.utils.sqlalchemybridge.get_engine", new=get_engine)
+class AncestorAnnotationTestCase(TransactionTestCase):
+    def test_ancestors(self):
+        builder = ChannelBuilder()
+        builder.insert_into_default_db()
+        channel_id = builder.channel["id"]
+        set_channel_ancestors(channel_id)
+        for n in ContentNode.objects.filter(channel_id=channel_id):
+            self.assertEqual(n.ancestors, list(n.get_ancestors().values("id", "title")))
diff --git a/kolibri/core/content/test/test_channel_import.py b/kolibri/core/content/test/test_channel_import.py
@@ -294,8 +294,9 @@ def test_destination_tree_ids(self, apps_mock, tree_id_mock, BridgeMock):
 
 
 class MaliciousDatabaseTestCase(TestCase):
+    @patch("kolibri.core.content.utils.channel_import.set_channel_ancestors")
     @patch("kolibri.core.content.utils.channel_import.initialize_import_manager")
-    def test_non_existent_root_node(self, initialize_manager_mock):
+    def test_non_existent_root_node(self, initialize_manager_mock, ancestor_mock):
         import_mock = MagicMock()
         initialize_manager_mock.return_value = import_mock
         channel_id = "6199dde695db4ee4ab392222d5af1e5c"