Skip to content

Commit

Permalink
Merge pull request #8492 from rtibbles/content_metadata_optimizations
Browse files Browse the repository at this point in the history
Content metadata optimizations
  • Loading branch information
rtibbles committed Oct 29, 2021
2 parents 48075ed + e2088c4 commit 50f8b24
Show file tree
Hide file tree
Showing 16 changed files with 380 additions and 163 deletions.
118 changes: 25 additions & 93 deletions kolibri/core/content/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,12 @@ class ContentNodeFilter(IdFilter):
parent__isnull = BooleanFilter(field_name="parent", lookup_expr="isnull")
include_coach_content = BooleanFilter(method="filter_include_coach_content")
contains_quiz = CharFilter(method="filter_contains_quiz")
grade_levels = CharFilter(method="filter_contains_and")
resource_types = CharFilter(method="filter_contains_and")
learning_activities = CharFilter(method="filter_contains_and")
accessibility_labels = CharFilter(method="filter_contains_and")
categories = CharFilter(method="filter_contains_and")
learner_needs = CharFilter(method="filter_contains_and")
grade_levels = CharFilter(method="bitmask_contains_and")
resource_types = CharFilter(method="bitmask_contains_and")
learning_activities = CharFilter(method="bitmask_contains_and")
accessibility_labels = CharFilter(method="bitmask_contains_and")
categories = CharFilter(method="bitmask_contains_and")
learner_needs = CharFilter(method="bitmask_contains_and")
keywords = CharFilter(method="filter_keywords")
channels = UUIDInFilter(name="channel_id")
languages = CharInFilter(name="lang_id")
Expand Down Expand Up @@ -269,11 +269,8 @@ def filter_keywords(self, queryset, name, value):

return queryset.filter(query)

def filter_contains_and(self, queryset, name, value):
values = value.split(",")
return queryset.filter(
intersection([Q(**{name + "__contains": v}) for v in values])
)
def bitmask_contains_and(self, queryset, name, value):
return queryset.has_all_labels(name, value.split(","))


class OptionalPageNumberPagination(ValuesViewsetPageNumberPagination):
Expand Down Expand Up @@ -342,6 +339,7 @@ class BaseContentNodeMixin(object):
"accessibility_labels",
"categories",
"duration",
"ancestors",
)

field_map = {
Expand Down Expand Up @@ -422,77 +420,24 @@ def get_related_data_maps(self, items, queryset):

return assessmentmetadata_map, files_map, languages_map, tags_map

def process_items(self, items, queryset, ancestor_lookup_method=None):
output = []
assessmentmetadata, files_map, languages_map, tags = self.get_related_data_maps(
items, queryset
)
for item in items:
item["assessmentmetadata"] = assessmentmetadata.get(item["id"])
item["tags"] = tags.get(item["id"], [])
item["files"] = files_map.get(item["id"], [])
lang_id = item.pop("lang_id")
item["lang"] = languages_map.get(lang_id)
if ancestor_lookup_method:
item["ancestors"] = ancestor_lookup_method(item)
item["is_leaf"] = item.get("kind") != content_kinds.TOPIC
output.append(item)
return output

def consolidate(self, items, queryset):
output = []
if items:

# We need to batch our queries for ancestors as the size of the expression tree
# depends on the number of nodes that we are querying for.
# On Windows, the SQL parameter limit is 999, and an ancestors call can produce
# 3 parameters per node in the queryset, so this should max out the parameters at 750.
ANCESTOR_BATCH_SIZE = 250

if len(items) > ANCESTOR_BATCH_SIZE:

ancestors_map = {}

for i in range(0, len(items), ANCESTOR_BATCH_SIZE):

for anc in (
models.ContentNode.objects.filter(
id__in=[
item["id"]
for item in items[i : i + ANCESTOR_BATCH_SIZE]
]
)
.get_ancestors()
.values("id", "title", "lft", "rght", "tree_id")
):
ancestors_map[anc["id"]] = anc

ancestors = sorted(ancestors_map.values(), key=lambda x: x["lft"])
else:
ancestors = list(
queryset.get_ancestors()
.values("id", "title", "lft", "rght", "tree_id")
.order_by("lft")
)

def ancestor_lookup(item):
lft = item.get("lft")
rght = item.get("rght")
tree_id = item.get("tree_id")
return list(
map(
lambda x: {"id": x["id"], "title": x["title"]},
filter(
lambda x: x["lft"] < lft
and x["rght"] > rght
and x["tree_id"] == tree_id,
ancestors,
),
)
)

return self.process_items(items, queryset, ancestor_lookup)

return []
(
assessmentmetadata,
files_map,
languages_map,
tags,
) = self.get_related_data_maps(items, queryset)
for item in items:
item["assessmentmetadata"] = assessmentmetadata.get(item["id"])
item["tags"] = tags.get(item["id"], [])
item["files"] = files_map.get(item["id"], [])
lang_id = item.pop("lang_id")
item["lang"] = languages_map.get(lang_id)
item["is_leaf"] = item.get("kind") != content_kinds.TOPIC
output.append(item)
return output


class OptionalContentNodePagination(ValuesViewsetCursorPagination):
Expand Down Expand Up @@ -872,11 +817,6 @@ class ContentNodeTreeViewset(BaseContentNodeMixin, BaseValuesViewset):
# should not raise it.
page_size = 25

def consolidate(self, items, queryset):
if items:
return self.process_items(items, queryset)
return []

def validate_and_return_params(self, request):
depth = request.query_params.get("depth", 2)
lft__gt = request.query_params.get("lft__gt")
Expand Down Expand Up @@ -974,10 +914,6 @@ def retrieve(self, request, **kwargs):
# The serialized parent representation is the first node in the lft order
parent = nodes[0]

# Do a query to get the parent's ancestors - for all other nodes, we can manually build
# the ancestors from the tree topology that we already know!
parent["ancestors"] = list(parent_model.get_ancestors().values("id", "title"))

# Use this to keep track of direct children of the parent node
# this will allow us to do lookups for the grandchildren, in order
# to insert them into the "children" property
Expand Down Expand Up @@ -1013,10 +949,6 @@ def retrieve(self, request, **kwargs):
# If the parent of the descendant does not already have its `children` property
# initialized, do so here.
desc_parent["children"] = {"results": [], "more": None}
# The ancestors field for the descendant will be its parents ancestors, plus its parent!
desc["ancestors"] = desc_parent["ancestors"] + [
{"id": desc_parent["id"], "title": desc_parent["title"]}
]
# Add this descendant to the results for the children pagination object
desc_parent["children"]["results"].append(desc)
# Only bother updating the URL for more if we have hit the page size limit
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,16 @@ class ContentContentnode(Base):
__tablename__ = "content_contentnode"
__table_args__ = (
Index(
"content_contentnode_level_channel_id_available_29f0bb18_idx",
"content_contentnode_level_channel_id_kind_fd732cc4_idx",
"level",
"channel_id",
"available",
"kind",
),
Index(
"content_contentnode_level_channel_id_kind_fd732cc4_idx",
"content_contentnode_level_channel_id_available_29f0bb18_idx",
"level",
"channel_id",
"kind",
"available",
),
)

Expand All @@ -67,6 +67,7 @@ class ContentContentnode(Base):
sort_order = Column(Float)
license_owner = Column(String(200), nullable=False)
author = Column(String(200), nullable=False)
kind = Column(String(200), nullable=False)
available = Column(Boolean, nullable=False)
lft = Column(Integer, nullable=False, index=True)
rght = Column(Integer, nullable=False, index=True)
Expand All @@ -86,7 +87,12 @@ class ContentContentnode(Base):
learner_needs = Column(Text)
learning_activities = Column(Text)
resource_types = Column(Text)
kind = Column(String(200), nullable=False)
accessibility_labels_bitmask_0 = Column(BigInteger)
categories_bitmask_0 = Column(BigInteger)
grade_levels_bitmask_0 = Column(BigInteger)
learner_needs_bitmask_0 = Column(BigInteger)
learning_activities_bitmask_0 = Column(BigInteger)
ancestors = Column(Text)
parent_id = Column(ForeignKey("content_contentnode.id"), index=True)

lang = relationship("ContentLanguage")
Expand Down
41 changes: 41 additions & 0 deletions kolibri/core/content/migrations/0029_metadata_bitmasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.29 on 2021-10-11 20:40
from __future__ import unicode_literals

from django.db import migrations
from django.db import models


class Migration(migrations.Migration):

dependencies = [
("content", "0028_contentnode_metadata_labels"),
]

operations = [
migrations.AddField(
model_name="contentnode",
name="accessibility_labels_bitmask_0",
field=models.BigIntegerField(blank=True, default=0, null=True),
),
migrations.AddField(
model_name="contentnode",
name="categories_bitmask_0",
field=models.BigIntegerField(blank=True, default=0, null=True),
),
migrations.AddField(
model_name="contentnode",
name="grade_levels_bitmask_0",
field=models.BigIntegerField(blank=True, default=0, null=True),
),
migrations.AddField(
model_name="contentnode",
name="learner_needs_bitmask_0",
field=models.BigIntegerField(blank=True, default=0, null=True),
),
migrations.AddField(
model_name="contentnode",
name="learning_activities_bitmask_0",
field=models.BigIntegerField(blank=True, default=0, null=True),
),
]
22 changes: 22 additions & 0 deletions kolibri/core/content/migrations/0030_contentnode_ancestors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.29 on 2021-10-11 20:51
from __future__ import unicode_literals

from django.db import migrations

import kolibri.core.fields


class Migration(migrations.Migration):

dependencies = [
("content", "0029_metadata_bitmasks"),
]

operations = [
migrations.AddField(
model_name="contentnode",
name="ancestors",
field=kolibri.core.fields.JSONField(blank=True, default=[], null=True),
),
]
32 changes: 32 additions & 0 deletions kolibri/core/content/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

from django.db import connection
from django.db import models
from django.db.models import F
from django.db.models import Min
from django.db.models import Q
from django.db.models import QuerySet
Expand All @@ -39,7 +40,10 @@
from .utils import paths
from kolibri.core.content import base_models
from kolibri.core.content.errors import InvalidStorageFilenameError
from kolibri.core.content.utils.search import bitmask_fieldnames
from kolibri.core.content.utils.search import metadata_bitmasks
from kolibri.core.device.models import ContentCacheKey
from kolibri.core.fields import JSONField
from kolibri.core.mixins import FilterByUUIDQuerysetMixin

PRESET_LOOKUP = dict(format_presets.choices)
Expand Down Expand Up @@ -82,6 +86,25 @@ def filter_by_content_ids(self, content_ids, validate=True):
def exclude_by_content_ids(self, content_ids, validate=True):
return self._by_uuids(content_ids, validate, "content_id", False)

def has_all_labels(self, field_name, labels):
bitmasks = metadata_bitmasks[field_name]
bits = {}
for label in labels:
if label in bitmasks:
bitmask_fieldname = bitmasks[label]["bitmask_field_name"]
if bitmask_fieldname not in bits:
bits[bitmask_fieldname] = 0
bits[bitmask_fieldname] += bitmasks[label]["bits"]

filters = {}
annotations = {}
for bitmask_fieldname, bits in bits.items():
annotation_fieldname = "{}_{}".format(bitmask_fieldname, "masked")
filters[annotation_fieldname + "__gt"] = 0
annotations[annotation_fieldname] = F(bitmask_fieldname).bitand(bits)

return self.annotate(**annotations).filter(**filters)


class ContentNodeManager(
models.Manager.from_queryset(ContentNodeQueryset), TreeManager
Expand Down Expand Up @@ -162,6 +185,10 @@ class ContentNode(base_models.ContentNode):
# then it is 1 or 0 depending on availability
on_device_resources = models.IntegerField(default=0, null=True, blank=True)

# Use this to annotate ancestor information directly onto the ContentNode, as it can be a
# costly lookup
ancestors = JSONField(default=[], null=True, blank=True)

objects = ContentNodeManager()

class Meta:
Expand All @@ -186,6 +213,11 @@ def get_descendant_content_ids(self):
)


for field_name in bitmask_fieldnames:
field = models.BigIntegerField(default=0, null=True, blank=True)
field.contribute_to_class(ContentNode, field_name)


@python_2_unicode_compatible
class Language(base_models.Language):
def __str__(self):
Expand Down
13 changes: 13 additions & 0 deletions kolibri/core/content/test/test_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@
from kolibri.core.content.models import File
from kolibri.core.content.models import Language
from kolibri.core.content.models import LocalFile
from kolibri.core.content.test.test_channel_upgrade import ChannelBuilder
from kolibri.core.content.utils.annotation import calculate_included_languages
from kolibri.core.content.utils.annotation import calculate_published_size
from kolibri.core.content.utils.annotation import calculate_total_resource_count
from kolibri.core.content.utils.annotation import mark_local_files_as_available
from kolibri.core.content.utils.annotation import mark_local_files_as_unavailable
from kolibri.core.content.utils.annotation import recurse_annotation_up_tree
from kolibri.core.content.utils.annotation import set_channel_ancestors
from kolibri.core.content.utils.annotation import set_channel_metadata_fields
from kolibri.core.content.utils.annotation import (
set_leaf_node_availability_from_local_file_availability,
Expand Down Expand Up @@ -906,3 +908,14 @@ def test_public(self):

self.channel.refresh_from_db()
self.assertTrue(self.channel.public)


@patch("kolibri.core.content.utils.sqlalchemybridge.get_engine", new=get_engine)
class AncestorAnnotationTestCase(TransactionTestCase):
def test_ancestors(self):
builder = ChannelBuilder()
builder.insert_into_default_db()
channel_id = builder.channel["id"]
set_channel_ancestors(channel_id)
for n in ContentNode.objects.filter(channel_id=channel_id):
self.assertEqual(n.ancestors, list(n.get_ancestors().values("id", "title")))
3 changes: 2 additions & 1 deletion kolibri/core/content/test/test_channel_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,8 +294,9 @@ def test_destination_tree_ids(self, apps_mock, tree_id_mock, BridgeMock):


class MaliciousDatabaseTestCase(TestCase):
@patch("kolibri.core.content.utils.channel_import.set_channel_ancestors")
@patch("kolibri.core.content.utils.channel_import.initialize_import_manager")
def test_non_existent_root_node(self, initialize_manager_mock):
def test_non_existent_root_node(self, initialize_manager_mock, ancestor_mock):
import_mock = MagicMock()
initialize_manager_mock.return_value = import_mock
channel_id = "6199dde695db4ee4ab392222d5af1e5c"
Expand Down

0 comments on commit 50f8b24

Please sign in to comment.