From e0a111ec8cc7ff4698c1a69e2be441488ab7c62d Mon Sep 17 00:00:00 2001 From: jamesrichards Date: Fri, 2 Aug 2024 15:31:05 +0100 Subject: [PATCH 1/5] Removed chunk and associated models. Removed get chunks file endpoint. Removed chunk related actions from storage_handler --- core-api/core_api/routes/file.py | 39 +---- core-api/tests/routes/test_file.py | 21 +-- redbox-core/redbox/models/__init__.py | 4 - redbox-core/redbox/models/file.py | 106 +----------- redbox-core/redbox/storage/elasticsearch.py | 132 ++------------ redbox-core/redbox/storage/storage_handler.py | 8 +- redbox-core/tests/conftest.py | 72 +++----- redbox-core/tests/models/__init__.py | 0 redbox-core/tests/models/test_file.py | 55 ------ .../tests/storage/test_elasticsearch.py | 162 ++++++------------ 10 files changed, 107 insertions(+), 492 deletions(-) delete mode 100644 redbox-core/tests/models/__init__.py delete mode 100644 redbox-core/tests/models/test_file.py diff --git a/core-api/core_api/routes/file.py b/core-api/core_api/routes/file.py index 0263e592c..408b23af9 100644 --- a/core-api/core_api/routes/file.py +++ b/core-api/core_api/routes/file.py @@ -11,7 +11,7 @@ from core_api.auth import get_user_uuid from core_api.publisher_handler import FilePublisher -from redbox.models import APIError404, Chunk, File, FileStatus, ProcessingStatusEnum, Settings +from redbox.models import APIError404, File, FileStatus, ProcessingStatusEnum, Settings from redbox.storage import ElasticsearchStorageHandler # === Functions === @@ -202,8 +202,7 @@ def delete_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uui storage_handler.delete_item(file) - chunks = storage_handler.get_file_chunks(file.uuid, user_uuid) - storage_handler.delete_items(chunks) + storage_handler.delete_user_items("chunk", user_uuid) return file @@ -239,7 +238,7 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_ storage_handler.update_item(file) # Remove old chunks - storage_handler.delete_file_chunks(file.uuid, user_uuid) + storage_handler.delete_user_items("chunk", user_uuid) # Add new chunks log.info("publishing %s", file.uuid) @@ -247,38 +246,6 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_ return file - -@file_app.get( - "/{file_uuid}/chunks", - tags=["file"], - responses={404: {"model": APIError404, "description": "The file was not found"}}, -) -def get_file_chunks(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> list[Chunk]: - """Gets a list of chunks for a file in the database - - Args: - file_uuid (UUID): The UUID of the file to delete - user_uuid (UUID): The UUID of the user - - Returns: - Chunks (list, Chunk): The chunks belonging to the requested file - - Raises: - 404: If the file isn't found, or the creator and requester don't match - """ - try: - file = storage_handler.read_item(file_uuid, model_type="File") - except NotFoundError: - return file_not_found_response(file_uuid=file_uuid) - - if file.creator_user_uuid != user_uuid: - return file_not_found_response(file_uuid=file_uuid) - - log.info("getting chunks for file %s", file_uuid) - - return storage_handler.get_file_chunks(file_uuid, user_uuid) - - @file_app.get( "/{file_uuid}/status", tags=["file"], diff --git a/core-api/tests/routes/test_file.py b/core-api/tests/routes/test_file.py index 07f3c6896..5507f27bf 100644 --- a/core-api/tests/routes/test_file.py +++ b/core-api/tests/routes/test_file.py @@ -86,7 +86,7 @@ def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, 
chunk # check assets exist assert s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.key) assert elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file") - assert elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid) + assert elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) response = app_client.delete(f"/file/{chunked_file.uuid}", headers=headers) assert response.status_code == HTTPStatus.OK @@ -98,7 +98,7 @@ def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, chunk with pytest.raises(NotFoundError): elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file") - assert not elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid) + assert not elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) def test_delete_missing_file(app_client, headers): @@ -117,27 +117,16 @@ def test_reingest_file(app_client, chunked_file, elasticsearch_storage_handler, When I PUT it to /file/uuid/ I Expect the old chunks to be removed """ - previous_chunks = elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid) + previous_chunks = elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) response = app_client.put(f"/file/{chunked_file.uuid}", headers=headers) assert response.status_code == HTTPStatus.OK elasticsearch_storage_handler.refresh() assert ( - elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid) + elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) != previous_chunks - ) - - -def test_get_file_chunks(app_client, chunked_file, headers): - """ - Given a previously chunked file - When I GET it from /file/uuid/chunks - I Expect to receive the chunks - """ - response = app_client.get(f"/file/{chunked_file.uuid}/chunks", headers=headers) - assert response.status_code == HTTPStatus.OK - assert len(response.json()) == 12 + ), f"Pre and post chunks matched and both had {len(previous_chunks)} chunks" def test_get_missing_file_chunks(app_client, headers): diff --git a/redbox-core/redbox/models/__init__.py b/redbox-core/redbox/models/__init__.py index 79b7c0b96..107afa539 100644 --- a/redbox-core/redbox/models/__init__.py +++ b/redbox-core/redbox/models/__init__.py @@ -11,8 +11,6 @@ APIErrorResponse, ) from redbox.models.file import ( - Chunk, - ChunkStatus, File, FileStatus, ProcessingStatusEnum, @@ -29,8 +27,6 @@ "ChatRequest", "ChatResponse", "ChatRoute", - "Chunk", - "ChunkStatus", "EmbedQueueItem", "EmbeddingModelInfo", "EmbeddingResponse", diff --git a/redbox-core/redbox/models/file.py b/redbox-core/redbox/models/file.py index e7ca30dc9..4c2851dda 100644 --- a/redbox-core/redbox/models/file.py +++ b/redbox-core/redbox/models/file.py @@ -1,12 +1,11 @@ from __future__ import annotations -import hashlib from enum import Enum, StrEnum from uuid import UUID import datetime import tiktoken -from pydantic import BaseModel, Field, computed_field +from pydantic import BaseModel, Field from redbox.models.base import PersistableModel @@ -41,108 +40,7 @@ def __le__(self, other: Link): def __hash__(self): return hash(self.text) ^ hash(self.url) ^ hash(self.start_index) - - -class Metadata(BaseModel): - """this is a pydantic model for the unstructured Metadata class - uncomment fields below and update merge as required""" - - 
parent_doc_uuid: UUID | None = Field( - description="this field is not actually part of unstructured Metadata but is required by langchain", - default=None, - ) - - # attached_to_filename: Optional[str] = None - # category_depth: Optional[int] = None - # coordinates: Optional[CoordinatesMetadata] = None - # data_source: Optional[DataSourceMetadata] = None - # detection_class_prob: Optional[float] = None - # emphasized_text_contents: Optional[list[str]] = None - # emphasized_text_tags: Optional[list[str]] = None - # file_directory: Optional[str] = None - # filename: Optional[str | pathlib.Path] = None - # filetype: Optional[str] = None - # header_footer_type: Optional[str] = None - # image_path: Optional[str] = None - # is_continuation: Optional[bool] = None - languages: list[str] | None = None - # last_modified: Optional[str] = None - link_texts: list[str] | None = None - link_urls: list[str] | None = None - links: list[Link] | None = None - # orig_elements: Optional[list[Element]] = None - # page_name: Optional[str] = None - page_number: int | list[int] | None = None - # parent_id: Optional[UUID] = None - # regex_metadata: Optional[dict[str, list[RegexMetadata]]] = None - # section: Optional[str] = None - # sent_from: Optional[list[str]] = None - # sent_to: Optional[list[str]] = None - # signature: Optional[str] = None - # subject: Optional[str] = None - # text_as_html: Optional[str] = None - # url: Optional[str] = None - - @classmethod - def merge(cls, left: Metadata | None, right: Metadata | None) -> Metadata | None: - if not left: - return right - if not right: - return left - - def listify(obj, field_name: str) -> list: - field_value = getattr(obj, field_name, None) - if isinstance(field_value, list): - return field_value - if field_value is None: - return [] - return [field_value] - - def sorted_list_or_none(obj: list): - return sorted(set(obj)) or None - - data = { - field_name: sorted_list_or_none(listify(left, field_name) + listify(right, field_name)) - for field_name in cls.model_fields - } - - if parent_doc_uuids := data.get("parent_doc_uuid"): - parent_doc_uuids_without_none = [uuid for uuid in parent_doc_uuids if uuid] - if len(parent_doc_uuids) > 1: - message = "chunks do not have the same parent_doc_uuid" - raise ValueError(message) - data["parent_doc_uuid"] = parent_doc_uuids_without_none[0] - return cls(**data) - - -class Chunk(PersistableModel): - """Chunk of a File""" - - parent_file_uuid: UUID = Field(description="id of the original file which this text came from") - index: int = Field(description="relative position of this chunk in the original file") - text: str = Field(description="chunk of the original text") - metadata: Metadata | dict | None = Field( - description="subset of the unstructured Element.Metadata object", default=None - ) - embedding: list[float] | None = Field(description="the vector representation of the text", default=None) - - @computed_field # type: ignore[misc] # Remove if https://github.com/python/mypy/issues/1362 is fixed. - @property # Needed for type checking - see https://docs.pydantic.dev/2.0/usage/computed_fields/ - def text_hash(self) -> str: - return hashlib.md5(self.text.encode(encoding="UTF-8", errors="strict"), usedforsecurity=False).hexdigest() - - @computed_field # type: ignore[misc] # Remove if https://github.com/python/mypy/issues/1362 is fixed. 
- @property # Needed for type checking - see https://docs.pydantic.dev/2.0/usage/computed_fields/ - def token_count(self) -> int: - return len(encoding.encode(self.text)) - - -class ChunkStatus(BaseModel): - """Status of a chunk of a file.""" - - chunk_uuid: UUID - embedded: bool - + class FileStatus(BaseModel): """Status of a file.""" diff --git a/redbox-core/redbox/storage/elasticsearch.py b/redbox-core/redbox/storage/elasticsearch.py index 0708cc296..4da0e2a3e 100644 --- a/redbox-core/redbox/storage/elasticsearch.py +++ b/redbox-core/redbox/storage/elasticsearch.py @@ -8,45 +8,26 @@ from elasticsearch.helpers import scan from pydantic import ValidationError -from redbox.models import Chunk, FileStatus, Settings +from redbox.models import Settings from redbox.models.base import PersistableModel from redbox.storage.storage_handler import BaseStorageHandler logging.basicConfig(level=logging.INFO) log = logging.getLogger() -env = Settings() - -def build_chunk_query(parent_file_uuid: UUID, user_uuid: UUID) -> dict: - query = { +def get_query_match_all_for_user(user_uuid: UUID): + return { "query": { "bool": { - "must": [ - { - "bool": { - "should": [ - {"term": {"parent_file_uuid.keyword": str(parent_file_uuid)}}, - {"term": {"metadata.parent_file_uuid.keyword": str(parent_file_uuid)}}, - ] - } - }, - { - "bool": { - "should": [ - {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, - {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, - ] - } - }, + "should": [ + {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, + {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, ] } } } - return query - - class ElasticsearchStorageHandler(BaseStorageHandler): """Storage Handler for Elasticsearch""" @@ -79,6 +60,8 @@ def write_item(self, item: PersistableModel) -> ObjectApiResponse: def write_items(self, items: Sequence[PersistableModel]) -> Sequence[ObjectApiResponse]: return list(map(self.write_item, items)) + + def read_item(self, item_uuid: UUID, model_type: str): target_index = f"{self.root_index}-{model_type.lower()}" result = self.es_client.get(index=target_index, id=str(item_uuid)) @@ -121,6 +104,13 @@ def delete_items(self, items: list[PersistableModel]) -> ObjectApiResponse | Non index=target_index, body={"query": {"terms": {"_id": [str(item.uuid) for item in items]}}}, ) + + def delete_user_items(self, model_type: str, user_uuid: UUID) -> ObjectApiResponse | None: + target_index = f"{self.root_index}-{model_type.lower()}" + return self.es_client.delete_by_query( + index=target_index, + body=get_query_match_all_for_user(user_uuid), + ) def read_all_items(self, model_type: str, user_uuid: UUID) -> list[PersistableModel]: target_index = f"{self.root_index}-{model_type.lower()}" @@ -128,16 +118,7 @@ def read_all_items(self, model_type: str, user_uuid: UUID) -> list[PersistableMo result = scan( client=self.es_client, index=target_index, - query={ - "query": { - "bool": { - "should": [ - {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, - {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, - ] - } - } - }, + query=get_query_match_all_for_user(user_uuid), _source=True, ) @@ -167,88 +148,11 @@ def list_all_items(self, model_type: str, user_uuid: UUID) -> list[UUID]: results = scan( client=self.es_client, index=target_index, - query={ - "query": { - "bool": { - "should": [ - {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, - {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, - ] - } - } - }, + 
query=get_query_match_all_for_user(user_uuid), _source=False, ) except NotFoundError: log.info("Index %s not found. Returning empty list.", target_index) return [] - return [UUID(item["_id"]) for item in results] - - def get_file_chunks(self, parent_file_uuid: UUID, user_uuid: UUID) -> list[Chunk]: - """get chunks for a given file""" - target_index = f"{self.root_index}-chunk" - - return [ - hit_to_chunk(item) - for item in scan( - client=self.es_client, - index=target_index, - query=build_chunk_query(parent_file_uuid, user_uuid), - ) - ] - - def delete_file_chunks(self, parent_file_uuid: UUID, user_uuid: UUID): - """delete chunks for a given file""" - target_index = f"{self.root_index}-chunk" - - self.es_client.delete_by_query( - index=target_index, - body=build_chunk_query(parent_file_uuid, user_uuid), - ) - - def get_file_status(self, file_uuid: UUID, user_uuid: UUID) -> FileStatus: - """Get the status of a file and associated Chunks - - Args: - file_uuid (UUID): The UUID of the file to get the status of - user_uuid (UUID): the UUID of the user - - Returns: - FileStatus: The status of the file - """ - - # Test 1: Get the file - try: - file = self.read_item(file_uuid, "File") - except NotFoundError as e: - log.exception("file/%s not found", file_uuid) - message = f"File {file_uuid} not found" - raise ValueError(message) from e - if file.creator_user_uuid != user_uuid: - log.error("file/%s.%s not owned by %s", file_uuid, file.creator_user_uuid, user_uuid) - message = f"File {file_uuid} not found" - raise ValueError(message) - - return FileStatus( - file_uuid=file_uuid, - processing_status=file.ingest_status, - ) - - -def hit_to_chunk(hit: dict[str, Any]) -> Chunk: - if hit["_source"].get("uuid"): - # Legacy direct chunk storage - return Chunk(**hit["_source"]) - else: - # Document storage - return Chunk( - uuid=hit["_id"], - text=hit["_source"]["text"], - index=hit["_source"]["metadata"]["index"], - embedding=hit["_source"].get(env.embedding_document_field_name), - created_datetime=hit["_source"]["metadata"]["created_datetime"], - creator_user_uuid=hit["_source"]["metadata"]["creator_user_uuid"], - parent_file_uuid=hit["_source"]["metadata"]["parent_file_uuid"], - metadata=hit["_source"]["metadata"], - ) + return [UUID(item["_id"]) for item in results] \ No newline at end of file diff --git a/redbox-core/redbox/storage/storage_handler.py b/redbox-core/redbox/storage/storage_handler.py index f794bd94f..d14e3bafe 100644 --- a/redbox-core/redbox/storage/storage_handler.py +++ b/redbox-core/redbox/storage/storage_handler.py @@ -2,7 +2,7 @@ from typing import ClassVar from uuid import UUID -from redbox.models import Chunk, File +from redbox.models import File from redbox.models.base import PersistableModel @@ -12,7 +12,7 @@ class BaseStorageHandler(ABC): """ # dict comprehension for lowercase class name to class - model_type_map: ClassVar = {v.__name__.lower(): v for v in [Chunk, File]} + model_type_map: ClassVar = {v.__name__.lower(): v for v in [File]} def get_model_by_model_type(self, model_type): return self.model_type_map[model_type.lower()] @@ -60,7 +60,3 @@ def list_all_items(self, model_type: str, user_uuid: UUID): @abstractmethod def read_all_items(self, model_type: str, user_uuid: UUID): """Read all objects of a given type from a data store""" - - @abstractmethod - def get_file_chunks(self, parent_file_uuid: UUID, user_uuid: UUID) -> list[Chunk]: - """get chunks for a given file""" diff --git a/redbox-core/tests/conftest.py b/redbox-core/tests/conftest.py index 7e98a2032..f496cd1c5 
100644 --- a/redbox-core/tests/conftest.py +++ b/redbox-core/tests/conftest.py @@ -4,7 +4,7 @@ import pytest from elasticsearch import Elasticsearch -from redbox.models import Chunk, File, Settings +from redbox.models import File, Settings from redbox.storage.elasticsearch import ElasticsearchStorageHandler from collections.abc import Generator @@ -39,51 +39,39 @@ def claire(): @pytest.fixture() -def file_belonging_to_alice(file_pdf_path, alice, env) -> File: - return File( +def file_belonging_to_alice(file_pdf_path, alice, env, elasticsearch_storage_handler: ElasticsearchStorageHandler) -> File: + f = File( key=file_pdf_path.name, bucket=env.bucket_name, creator_user_uuid=alice, ) + elasticsearch_storage_handler.write_item(f) + elasticsearch_storage_handler.refresh() + return f @pytest.fixture() -def file_belonging_to_bob(file_pdf_path, bob, env) -> File: - return File( +def file_belonging_to_bob(file_pdf_path, bob, env, elasticsearch_storage_handler: ElasticsearchStorageHandler) -> File: + f = File( key=file_pdf_path.name, bucket=env.bucket_name, creator_user_uuid=bob, ) + elasticsearch_storage_handler.write_item(f) + elasticsearch_storage_handler.refresh() + return f @pytest.fixture() -def chunk_belonging_to_alice(file_belonging_to_alice) -> Chunk: - return Chunk( - creator_user_uuid=file_belonging_to_alice.creator_user_uuid, - parent_file_uuid=file_belonging_to_alice.uuid, - index=1, - text="hello, i am Alice!", - ) - - -@pytest.fixture() -def chunk_belonging_to_bob(file_belonging_to_bob) -> Chunk: - return Chunk( - creator_user_uuid=file_belonging_to_bob.creator_user_uuid, - parent_file_uuid=file_belonging_to_bob.uuid, - index=1, - text="hello, i am Bob!", - ) - - -@pytest.fixture() -def chunk_belonging_to_claire(claire) -> Chunk: - return Chunk( +def file_belonging_to_claire(file_pdf_path, claire, env, elasticsearch_storage_handler: ElasticsearchStorageHandler) -> File: + f = File( + key=file_pdf_path.name, + bucket=env.bucket_name, creator_user_uuid=claire, - parent_file_uuid=uuid4(), - index=1, - text="hello, i am Claire!", ) + elasticsearch_storage_handler.write_item(f) + elasticsearch_storage_handler.refresh() + return f @pytest.fixture @@ -91,20 +79,6 @@ def file_pdf_path() -> Path: return Path(__file__).parents[2] / "tests" / "data" / "pdf" / "Cabinet Office - Wikipedia.pdf" -@pytest.fixture() -def stored_chunk_belonging_to_alice(elasticsearch_storage_handler, chunk_belonging_to_alice) -> Chunk: - elasticsearch_storage_handler.write_item(item=chunk_belonging_to_alice) - elasticsearch_storage_handler.refresh() - return chunk_belonging_to_alice - - -@pytest.fixture() -def stored_chunk_belonging_to_bob(elasticsearch_storage_handler, chunk_belonging_to_bob) -> Chunk: - elasticsearch_storage_handler.write_item(item=chunk_belonging_to_bob) - elasticsearch_storage_handler.refresh() - return chunk_belonging_to_bob - - @pytest.fixture() def elasticsearch_client(env) -> Elasticsearch: return env.elasticsearch_client() @@ -120,13 +94,21 @@ def es_index(env) -> str: return f"{env.elastic_root_index}-chunk" +@pytest.fixture(scope="session") +def es_index_file(env) -> str: + return f"{env.elastic_root_index}-file" + + @pytest.fixture(autouse=True, scope="session") -def create_index(env, es_index): +def create_index(env, es_index, es_index_file): es = env.elasticsearch_client() if not es.indices.exists(index=es_index): es.indices.create(index=es_index) + if not es.indices.exists(index=es_index_file): + es.indices.create(index=es_index_file) yield es.indices.delete(index=es_index) + 
es.indices.delete(index=es_index_file) @pytest.fixture() diff --git a/redbox-core/tests/models/__init__.py b/redbox-core/tests/models/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/redbox-core/tests/models/test_file.py b/redbox-core/tests/models/test_file.py deleted file mode 100644 index bcd3842f5..000000000 --- a/redbox-core/tests/models/test_file.py +++ /dev/null @@ -1,55 +0,0 @@ -import uuid - -import pytest - -from redbox.models.file import Link, Metadata - -UUID_1 = uuid.UUID("7c66416e-eff7-441f-aafc-6f06e62fb4ec") -UUID_2 = uuid.UUID("b7aadcce-806c-4dc6-8b95-d2477d717560") - - -def test_merge_pass(): - left = Metadata( - languages=["en"], - page_number=1, - links=[Link(text="text", start_index=1, url="http://url")], - parent_doc_uuid=UUID_1, - ) - right = Metadata(languages=["en", "fr"], page_number=[2, 3]) - - expected = Metadata( - languages=["en", "fr"], - links=[Link(text="text", url="http://url", start_index=1)], - page_number=[1, 2, 3], - parent_doc_uuid=UUID_1, - ) - actual = Metadata.merge(left, right) - - assert actual == expected - - -def test_merge_pass_same_parent_doc_uuid(): - left = Metadata(parent_doc_uuid=UUID_1) - right = Metadata(parent_doc_uuid=UUID_1) - - expected = Metadata(parent_doc_uuid=UUID_1) - actual = Metadata.merge(left, right) - - assert actual == expected - - -def test_merge_pass_one_empty_parent_doc_uuid(): - left = Metadata(parent_doc_uuid=UUID_1) - right = Metadata() - - expected = Metadata(parent_doc_uuid=UUID_1) - actual = Metadata.merge(left, right) - - assert actual == expected - - -def test_merge_pass_two_parent_doc_uuid(): - left = Metadata(parent_doc_uuid=UUID_1) - right = Metadata(parent_doc_uuid=UUID_2) - with pytest.raises(ValueError, match="chunks do not have the same parent_doc_uuid"): - Metadata.merge(left, right) diff --git a/redbox-core/tests/storage/test_elasticsearch.py b/redbox-core/tests/storage/test_elasticsearch.py index 86f219c73..00d6e297f 100644 --- a/redbox-core/tests/storage/test_elasticsearch.py +++ b/redbox-core/tests/storage/test_elasticsearch.py @@ -3,7 +3,7 @@ import pytest from elasticsearch import NotFoundError -from redbox.models import Chunk +from redbox.models import File from redbox.storage.elasticsearch import ElasticsearchStorageHandler @@ -21,35 +21,23 @@ def test_elasticsearch_client_connection(elasticsearch_client, elasticsearch_sto assert isinstance(elasticsearch_storage_handler.model_type_map, dict) -def test_elasticsearch_write_read_item(elasticsearch_storage_handler, chunk_belonging_to_alice): +def test_elasticsearch_write_read_item(elasticsearch_storage_handler, file_belonging_to_alice): """ - Given that `Chunk` is a valid model + Given that `File` is a valid model When I - Then I expect a valid Chunk to be returned on read" + Then I expect a valid File to be returned on read" """ - # Write the chunk - elasticsearch_storage_handler.write_item(item=chunk_belonging_to_alice) + # Write the file + elasticsearch_storage_handler.write_item(item=file_belonging_to_alice) - # Read the chunk - chunk_read = elasticsearch_storage_handler.read_item(chunk_belonging_to_alice.uuid, "Chunk") + # Read the File + item_read = elasticsearch_storage_handler.read_item(file_belonging_to_alice.uuid, "File") - assert chunk_read.uuid == chunk_belonging_to_alice.uuid - - -def test_elastic_read_item(elasticsearch_storage_handler, stored_chunk_belonging_to_alice): - read_chunk = elasticsearch_storage_handler.read_item(stored_chunk_belonging_to_alice.uuid, "Chunk") - assert read_chunk.uuid == 
stored_chunk_belonging_to_alice.uuid - assert read_chunk.parent_file_uuid == stored_chunk_belonging_to_alice.parent_file_uuid - assert read_chunk.index == stored_chunk_belonging_to_alice.index - assert read_chunk.text == stored_chunk_belonging_to_alice.text - assert read_chunk.metadata == stored_chunk_belonging_to_alice.metadata - assert read_chunk.creator_user_uuid == stored_chunk_belonging_to_alice.creator_user_uuid - assert read_chunk.token_count == stored_chunk_belonging_to_alice.token_count + assert item_read.uuid == file_belonging_to_alice.uuid def test_elastic_delete_item_fail( elasticsearch_storage_handler: ElasticsearchStorageHandler, - chunk_belonging_to_bob, ): """ Given that I have an non-existent item uuid @@ -57,7 +45,7 @@ def test_elastic_delete_item_fail( Then I expect to see a NotFoundError error raised """ with pytest.raises(NotFoundError): - elasticsearch_storage_handler.delete_item(chunk_belonging_to_bob) + elasticsearch_storage_handler.delete_item(File(uuid=uuid4(), creator_user_uuid=uuid4(), key="", bucket="")) def test_elastic_read_item_fail( @@ -69,7 +57,7 @@ def test_elastic_read_item_fail( Then I expect to see a NotFoundError error raised """ with pytest.raises(NotFoundError): - elasticsearch_storage_handler.read_item(uuid4(), "Chunk") + elasticsearch_storage_handler.read_item(uuid4(), "File") def test_elastic_write_read_delete_items(elasticsearch_storage_handler): @@ -79,138 +67,88 @@ def test_elastic_write_read_delete_items(elasticsearch_storage_handler): Then I expect to see them written to the database """ creator_user_uuid = uuid4() - chunks = [ - Chunk( + files = [ + File( creator_user_uuid=creator_user_uuid, - parent_file_uuid=uuid4(), - index=i, - text="test_text", + key=f"somefile-{i}.txt", + bucket="a-bucket" ) for i in range(10) ] - elasticsearch_storage_handler.write_items(chunks) + elasticsearch_storage_handler.write_items(files) - read_chunks = elasticsearch_storage_handler.read_items([chunk.uuid for chunk in chunks], "Chunk") + read_files = elasticsearch_storage_handler.read_items([file.uuid for file in files], "File") - assert read_chunks == chunks + assert read_files == files - # Delete the chunks - elasticsearch_storage_handler.delete_items(chunks) + # Delete the files + elasticsearch_storage_handler.delete_items(files) - # Check that the chunks are deleted - items_left = elasticsearch_storage_handler.list_all_items("Chunk", creator_user_uuid) + # Check that the files are deleted + items_left = elasticsearch_storage_handler.list_all_items("File", creator_user_uuid) - assert all(chunk.uuid not in items_left for chunk in chunks) + assert all(file.uuid not in items_left for file in files) def test_list_all_items( elasticsearch_storage_handler: ElasticsearchStorageHandler, - stored_chunk_belonging_to_alice: Chunk, - stored_chunk_belonging_to_bob: Chunk, - chunk_belonging_to_claire: Chunk, + file_belonging_to_alice: File, + file_belonging_to_bob: File, alice: UUID, ): """ Given that I have - * a saved chunk belonging to Alice - * a saved chunk belonging to Bob - * an unsaved chunk belonging to Claire + * a saved file belonging to Alice + * a saved file belonging to Bob When I call list_all_items as alice Then I expect to see the uuids of the saved objects that belong to alice returned """ - uuids = elasticsearch_storage_handler.list_all_items("Chunk", alice) - assert len(uuids) == 1 + uuids = elasticsearch_storage_handler.list_all_items("File", alice) + assert len(uuids) == 1, f"Unexpected number of files {len(uuids)}" def test_read_all_items( 
elasticsearch_storage_handler: ElasticsearchStorageHandler,
-    stored_chunk_belonging_to_alice: Chunk,
-    stored_chunk_belonging_to_bob: Chunk,
-    chunk_belonging_to_claire: Chunk,
+    file_belonging_to_alice: File,
+    file_belonging_to_bob: File,
     alice: UUID,
 ):
     """
     Given that I have
-    * a saved chunk belonging to Alice
-    * a saved chunk belonging to Bob
-    * an unsaved chunk belonging to Claire
+    * a saved file belonging to Alice
+    * a saved file belonging to Bob

     When I call read_all_items as alice
-    Then I expect to see the one Chunk belonging to alice
+    Then I expect to see the one File belonging to alice
     """
-    chunks = elasticsearch_storage_handler.read_all_items("Chunk", alice)
-    assert len(chunks) == 1
-    assert chunks[0].creator_user_uuid == alice

+    files = elasticsearch_storage_handler.read_all_items("File", alice)
+    assert len(files) == 1
+    assert files[0].creator_user_uuid == alice


-def test_elastic_delete_item(elasticsearch_storage_handler, stored_chunk_belonging_to_alice):
+def test_elastic_delete_item(elasticsearch_storage_handler, file_belonging_to_alice):
     """
     Given that I have a saved object
     When I call delete_item on it
     Then I expect to not be able to read the item
     """
-    elasticsearch_storage_handler.delete_item(stored_chunk_belonging_to_alice)
+    elasticsearch_storage_handler.delete_item(file_belonging_to_alice)

     with pytest.raises(NotFoundError):
-        elasticsearch_storage_handler.read_item(stored_chunk_belonging_to_alice.uuid, "Chunk")
-
-
-def test_get_file_chunks(
-    elasticsearch_storage_handler: ElasticsearchStorageHandler,
-    stored_chunk_belonging_to_alice: Chunk,
-):
-    """
-    Given that a chunk belonging to a file belonging alice have been saved
-    When I call get_file_chunks with the right file id and alice's id
-    I Expect the single chunk to be retrieved
-    """
-    assert stored_chunk_belonging_to_alice.creator_user_uuid
-
-    chunks = elasticsearch_storage_handler.get_file_chunks(
-        stored_chunk_belonging_to_alice.parent_file_uuid,
-        stored_chunk_belonging_to_alice.creator_user_uuid,
-    )
-
-    assert len(chunks) == 1
-
-
-def test_get_file_chunks_fail(
-    elasticsearch_storage_handler: ElasticsearchStorageHandler,
-    stored_chunk_belonging_to_alice: Chunk,
-):
-    """
-    Given that a chunk belonging to a file belonging alice have been saved
-    When I call get_file_chunks with the right file id and another id
-    I Expect the no chunks to be retrieved
-    """
-    other_chunks = elasticsearch_storage_handler.get_file_chunks(
-        stored_chunk_belonging_to_alice.parent_file_uuid,
-        uuid4(),
-    )
-    assert not other_chunks
+        elasticsearch_storage_handler.read_item(file_belonging_to_alice.uuid, "File")


-def test_delete_file_chunks(
-    elasticsearch_storage_handler: ElasticsearchStorageHandler,
-    stored_chunk_belonging_to_alice: Chunk,
-):
+def test_elastic_delete_user_item(elasticsearch_storage_handler, file_belonging_to_alice, alice):
     """
-    Given that a chunk belonging to a file belonging alice have been saved
-    When I call delete_file_chunks with the right file id and alice's id
-    I Expect the chunks to be removed
+    Given that I have a saved file belonging to Alice
+    When I call delete_user_items for Alice
+    Then I expect none of her files to remain readable
     """
-    assert stored_chunk_belonging_to_alice.creator_user_uuid
-
-    elasticsearch_storage_handler.delete_file_chunks(
-        stored_chunk_belonging_to_alice.parent_file_uuid,
-        stored_chunk_belonging_to_alice.creator_user_uuid,
-    )
+    files = elasticsearch_storage_handler.read_all_items("File", alice)
+    assert len(files) == 1
+    assert files[0].creator_user_uuid == alice
+
elasticsearch_storage_handler.delete_user_items("file", alice) elasticsearch_storage_handler.refresh() - - chunks = elasticsearch_storage_handler.get_file_chunks( - stored_chunk_belonging_to_alice.parent_file_uuid, - stored_chunk_belonging_to_alice.creator_user_uuid, - ) - - assert len(chunks) == 0 + files = elasticsearch_storage_handler.read_all_items("File", alice) + assert len(files) == 0 \ No newline at end of file From 7d5e42f88e683f424f7ab8f55da35d5dc4f0bec6 Mon Sep 17 00:00:00 2001 From: jamesrichards Date: Fri, 2 Aug 2024 15:33:40 +0100 Subject: [PATCH 2/5] Ruff --- core-api/core_api/routes/file.py | 1 + core-api/tests/routes/test_file.py | 3 +-- redbox-core/redbox/chains/graph.py | 6 +----- redbox-core/redbox/models/file.py | 2 +- redbox-core/redbox/storage/elasticsearch.py | 9 +++------ redbox-core/tests/conftest.py | 8 ++++++-- redbox-core/tests/storage/test_elasticsearch.py | 11 ++--------- 7 files changed, 15 insertions(+), 25 deletions(-) diff --git a/core-api/core_api/routes/file.py b/core-api/core_api/routes/file.py index 408b23af9..105db3fa1 100644 --- a/core-api/core_api/routes/file.py +++ b/core-api/core_api/routes/file.py @@ -246,6 +246,7 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_ return file + @file_app.get( "/{file_uuid}/status", tags=["file"], diff --git a/core-api/tests/routes/test_file.py b/core-api/tests/routes/test_file.py index 5507f27bf..aa52a9b0f 100644 --- a/core-api/tests/routes/test_file.py +++ b/core-api/tests/routes/test_file.py @@ -124,8 +124,7 @@ def test_reingest_file(app_client, chunked_file, elasticsearch_storage_handler, elasticsearch_storage_handler.refresh() assert ( - elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) - != previous_chunks + elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) != previous_chunks ), f"Pre and post chunks matched and both had {len(previous_chunks)} chunks" diff --git a/redbox-core/redbox/chains/graph.py b/redbox-core/redbox/chains/graph.py index 6521e41a2..3166344b0 100644 --- a/redbox-core/redbox/chains/graph.py +++ b/redbox-core/redbox/chains/graph.py @@ -76,9 +76,7 @@ def chat_prompt_from_messages(state: ChainState): log.debug("Setting chat prompt") system_prompt_message = [("system", system_prompt)] - prompts_budget = len(tokeniser.encode(system_prompt)) + len( - tokeniser.encode(question_prompt) - ) + prompts_budget = len(tokeniser.encode(system_prompt)) + len(tokeniser.encode(question_prompt)) chat_history_budget = state["query"].ai_settings.context_window_size - llm_max_tokens - prompts_budget if chat_history_budget <= 0: @@ -101,8 +99,6 @@ def chat_prompt_from_messages(state: ChainState): return chat_prompt_from_messages - - @chain def set_prompt_args(state: ChainState): log.debug("Setting prompt args") diff --git a/redbox-core/redbox/models/file.py b/redbox-core/redbox/models/file.py index 4c2851dda..1950bab34 100644 --- a/redbox-core/redbox/models/file.py +++ b/redbox-core/redbox/models/file.py @@ -40,7 +40,7 @@ def __le__(self, other: Link): def __hash__(self): return hash(self.text) ^ hash(self.url) ^ hash(self.start_index) - + class FileStatus(BaseModel): """Status of a file.""" diff --git a/redbox-core/redbox/storage/elasticsearch.py b/redbox-core/redbox/storage/elasticsearch.py index 4da0e2a3e..a1d6a73c8 100644 --- a/redbox-core/redbox/storage/elasticsearch.py +++ b/redbox-core/redbox/storage/elasticsearch.py @@ -1,6 +1,5 @@ import logging from collections.abc import Sequence -from typing 
import Any from uuid import UUID from elastic_transport import ObjectApiResponse @@ -8,7 +7,6 @@ from elasticsearch.helpers import scan from pydantic import ValidationError -from redbox.models import Settings from redbox.models.base import PersistableModel from redbox.storage.storage_handler import BaseStorageHandler @@ -28,6 +26,7 @@ def get_query_match_all_for_user(user_uuid: UUID): } } + class ElasticsearchStorageHandler(BaseStorageHandler): """Storage Handler for Elasticsearch""" @@ -60,8 +59,6 @@ def write_item(self, item: PersistableModel) -> ObjectApiResponse: def write_items(self, items: Sequence[PersistableModel]) -> Sequence[ObjectApiResponse]: return list(map(self.write_item, items)) - - def read_item(self, item_uuid: UUID, model_type: str): target_index = f"{self.root_index}-{model_type.lower()}" result = self.es_client.get(index=target_index, id=str(item_uuid)) @@ -104,7 +101,7 @@ def delete_items(self, items: list[PersistableModel]) -> ObjectApiResponse | Non index=target_index, body={"query": {"terms": {"_id": [str(item.uuid) for item in items]}}}, ) - + def delete_user_items(self, model_type: str, user_uuid: UUID) -> ObjectApiResponse | None: target_index = f"{self.root_index}-{model_type.lower()}" return self.es_client.delete_by_query( @@ -155,4 +152,4 @@ def list_all_items(self, model_type: str, user_uuid: UUID) -> list[UUID]: except NotFoundError: log.info("Index %s not found. Returning empty list.", target_index) return [] - return [UUID(item["_id"]) for item in results] \ No newline at end of file + return [UUID(item["_id"]) for item in results] diff --git a/redbox-core/tests/conftest.py b/redbox-core/tests/conftest.py index f496cd1c5..a89d3f278 100644 --- a/redbox-core/tests/conftest.py +++ b/redbox-core/tests/conftest.py @@ -39,7 +39,9 @@ def claire(): @pytest.fixture() -def file_belonging_to_alice(file_pdf_path, alice, env, elasticsearch_storage_handler: ElasticsearchStorageHandler) -> File: +def file_belonging_to_alice( + file_pdf_path, alice, env, elasticsearch_storage_handler: ElasticsearchStorageHandler +) -> File: f = File( key=file_pdf_path.name, bucket=env.bucket_name, @@ -63,7 +65,9 @@ def file_belonging_to_bob(file_pdf_path, bob, env, elasticsearch_storage_handler @pytest.fixture() -def file_belonging_to_claire(file_pdf_path, claire, env, elasticsearch_storage_handler: ElasticsearchStorageHandler) -> File: +def file_belonging_to_claire( + file_pdf_path, claire, env, elasticsearch_storage_handler: ElasticsearchStorageHandler +) -> File: f = File( key=file_pdf_path.name, bucket=env.bucket_name, diff --git a/redbox-core/tests/storage/test_elasticsearch.py b/redbox-core/tests/storage/test_elasticsearch.py index 00d6e297f..5345194cc 100644 --- a/redbox-core/tests/storage/test_elasticsearch.py +++ b/redbox-core/tests/storage/test_elasticsearch.py @@ -67,14 +67,7 @@ def test_elastic_write_read_delete_items(elasticsearch_storage_handler): Then I expect to see them written to the database """ creator_user_uuid = uuid4() - files = [ - File( - creator_user_uuid=creator_user_uuid, - key=f"somefile-{i}.txt", - bucket="a-bucket" - ) - for i in range(10) - ] + files = [File(creator_user_uuid=creator_user_uuid, key=f"somefile-{i}.txt", bucket="a-bucket") for i in range(10)] elasticsearch_storage_handler.write_items(files) @@ -151,4 +144,4 @@ def test_elastic_delete_user_item(elasticsearch_storage_handler, file_belonging_ elasticsearch_storage_handler.delete_user_items("file", alice) elasticsearch_storage_handler.refresh() files = 
elasticsearch_storage_handler.read_all_items("File", alice) - assert len(files) == 0 \ No newline at end of file + assert len(files) == 0 From 620299eb32678a110efb6b2573082fdaf34e0da7 Mon Sep 17 00:00:00 2001 From: jamesrichards Date: Fri, 2 Aug 2024 16:32:36 +0100 Subject: [PATCH 3/5] Fixed issue with reingest not restricting chunk deletion to single file. Added additional tests for this scenario --- core-api/core_api/routes/file.py | 7 +- core-api/tests/conftest.py | 95 +++++++++++++++------ core-api/tests/routes/test_file.py | 42 +++++---- redbox-core/redbox/storage/elasticsearch.py | 46 ++++++---- 4 files changed, 130 insertions(+), 60 deletions(-) diff --git a/core-api/core_api/routes/file.py b/core-api/core_api/routes/file.py index 105db3fa1..83074b9b4 100644 --- a/core-api/core_api/routes/file.py +++ b/core-api/core_api/routes/file.py @@ -212,7 +212,10 @@ def delete_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uui tags=["file"], responses={404: {"model": APIError404, "description": "The file was not found"}}, ) -async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> File: +async def reingest_file( + file_uuid: UUID, + user_uuid: Annotated[UUID, Depends(get_user_uuid)] +) -> File: """Deletes exisiting file chunks and regenerates embeddings Args: @@ -238,7 +241,7 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_ storage_handler.update_item(file) # Remove old chunks - storage_handler.delete_user_items("chunk", user_uuid) + storage_handler.delete_user_items("chunk", user_uuid, filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)]) # Add new chunks log.info("publishing %s", file.uuid) diff --git a/core-api/tests/conftest.py b/core-api/tests/conftest.py index 1f4e55f09..96d53b452 100644 --- a/core-api/tests/conftest.py +++ b/core-api/tests/conftest.py @@ -1,6 +1,7 @@ from pathlib import Path from uuid import UUID, uuid4 from datetime import UTC, datetime +import uuid import pytest from botocore.exceptions import ClientError @@ -25,21 +26,6 @@ def env(): return Settings() -@pytest.fixture(scope="session") -def s3_client(env): - _client = env.s3_client() - try: - _client.create_bucket( - Bucket=env.bucket_name, - CreateBucketConfiguration={"LocationConstraint": env.aws_region}, - ) - except ClientError as e: - if e.response["Error"]["Code"] != "BucketAlreadyOwnedByYou": - raise - - return _client - - @pytest.fixture(scope="session") def es_client(env) -> Elasticsearch: return env.elasticsearch_client() @@ -91,18 +77,8 @@ def elasticsearch_storage_handler(es_client, env): @pytest.fixture() -def file(s3_client, file_pdf_path: Path, alice, env) -> File: +def file(file_pdf_path: Path, alice, env) -> File: file_name = file_pdf_path.name - file_type = file_pdf_path.suffix - - with file_pdf_path.open("rb") as f: - s3_client.put_object( - Bucket=env.bucket_name, - Body=f.read(), - Key=file_name, - Tagging=f"file_type={file_type}", - ) - return File(key=file_name, bucket=env.bucket_name, creator_user_uuid=alice) @@ -120,6 +96,26 @@ def stored_file_1(elasticsearch_storage_handler, file) -> File: return file +@pytest.fixture() +def stored_user_files(elasticsearch_storage_handler) -> list[File]: + user = uuid4() + files = [ + File( + creator_user_uuid=user, + key="testfile1.txt", + bucket="local" + ), + File( + creator_user_uuid=user, + key="testfile2.txt", + bucket="local" + ) + ] + for file in files: + elasticsearch_storage_handler.write_item(file) + 
elasticsearch_storage_handler.refresh() + return files + @pytest.fixture(scope="session") def embedding_model_dim() -> int: return 3072 # 3-large default size @@ -201,6 +197,47 @@ def stored_large_file_chunks(stored_file_1) -> list[Document]: return normal_chunks + large_chunks +@pytest.fixture() +def stored_user_chunks(stored_user_files) -> list[list[Document]]: + chunks_by_file = [] + for file in stored_user_files: + normal_chunks = [ + Document( + page_content="hello", + metadata=ChunkMetadata( + parent_file_uuid=str(file.uuid), + index=i, + file_name="test_file", + creator_user_uuid=file.creator_user_uuid, + page_number=4, + created_datetime=datetime.now(UTC), + token_count=4, + chunk_resolution=ChunkResolution.normal, + ).model_dump(), + ) + for i in range(25) + ] + + large_chunks = [ + Document( + page_content="hello" * 10, + metadata=ChunkMetadata( + parent_file_uuid=str(file.uuid), + index=i, + file_name="test_file", + creator_user_uuid=file.creator_user_uuid, + page_number=4, + created_datetime=datetime.now(UTC), + token_count=20, + chunk_resolution=ChunkResolution.largest, + ).model_dump(), + ) + for i in range(5) + ] + chunks_by_file.append(normal_chunks+large_chunks) + return chunks_by_file + + @pytest.fixture() def chunked_file(elasticsearch_store: ElasticsearchStore, stored_file_chunks, stored_file_1) -> File: elasticsearch_store.add_documents(stored_file_chunks) @@ -212,6 +249,12 @@ def large_chunked_file(elasticsearch_store, stored_large_file_chunks, stored_fil elasticsearch_store.add_documents(stored_large_file_chunks) return stored_file_1 +@pytest.fixture() +def chunked_user_files(elasticsearch_store, stored_user_chunks) -> File: + for chunks in stored_user_chunks: + elasticsearch_store.add_documents(chunks) + return stored_user_chunks + @pytest.fixture() def file_pdf_path() -> Path: diff --git a/core-api/tests/routes/test_file.py b/core-api/tests/routes/test_file.py index aa52a9b0f..3ed7c9ea1 100644 --- a/core-api/tests/routes/test_file.py +++ b/core-api/tests/routes/test_file.py @@ -1,16 +1,17 @@ import json from http import HTTPStatus from pathlib import Path +from jose import jwt import pytest from elasticsearch import NotFoundError from faststream.redis import TestRedisBroker from core_api.routes.file import env, router - +from redbox.storage.elasticsearch import ElasticsearchStorageHandler @pytest.mark.asyncio() -async def test_post_file_upload(s3_client, app_client, file_pdf_path: Path, headers): +async def test_post_file_upload(app_client, file_pdf_path: Path, headers): """ Given a new file When I POST it to /file @@ -20,12 +21,6 @@ async def test_post_file_upload(s3_client, app_client, file_pdf_path: Path, head file_key = file_pdf_path.name with file_pdf_path.open("rb") as f: - s3_client.upload_fileobj( - Bucket=env.bucket_name, - Fileobj=f, - Key=file_key, - ExtraArgs={"Tagging": "file_type=pdf"}, - ) async with TestRedisBroker(router.broker): response = app_client.post( @@ -77,14 +72,13 @@ def test_get_missing_file(app_client, headers): assert response.status_code == HTTPStatus.NOT_FOUND -def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, chunked_file, headers): +def test_delete_file(app_client, elasticsearch_storage_handler, chunked_file, headers): """ Given a previously saved file When I DELETE it to /file I Expect to see it removed from s3 and elastic-search, including the chunks """ # check assets exist - assert s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.key) assert 
elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file") assert elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) @@ -111,21 +105,37 @@ def test_delete_missing_file(app_client, headers): assert response.status_code == HTTPStatus.NOT_FOUND -def test_reingest_file(app_client, chunked_file, elasticsearch_storage_handler, headers): +def test_reingest_file(app_client, chunked_user_files, stored_user_files, elasticsearch_storage_handler): """ Given a previously chunked file When I PUT it to /file/uuid/ I Expect the old chunks to be removed """ - previous_chunks = elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) + test_file = stored_user_files[0] - response = app_client.put(f"/file/{chunked_file.uuid}", headers=headers) - assert response.status_code == HTTPStatus.OK + bearer_token = jwt.encode({"user_uuid": str(test_file.creator_user_uuid)}, key="nvjkernd") + headers_for_user = {"Authorization": f"Bearer {bearer_token}"} + + previous_chunks_by_file = [ + elasticsearch_storage_handler.list_all_items( + "chunk", + file.creator_user_uuid, + filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)] + ) + for file in stored_user_files + ] + + response = app_client.put(f"/file/{test_file.uuid}", headers=headers_for_user) + assert response.status_code == HTTPStatus.OK, f"Error response: [{response.status_code}] {response.text}" elasticsearch_storage_handler.refresh() assert ( - elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) != previous_chunks - ), f"Pre and post chunks matched and both had {len(previous_chunks)} chunks" + elasticsearch_storage_handler.list_all_items("chunk", test_file.creator_user_uuid) != previous_chunks_by_file[0] + ), f"Pre and post chunks matched and both had {len(previous_chunks_by_file[0])} chunks" + + for file, previous_chunks in zip(stored_user_files[1:], previous_chunks_by_file[1:]): + post_chunks = elasticsearch_storage_handler.list_all_items("chunk", file.creator_user_uuid) + assert post_chunks == previous_chunks, f"Additional files had their chunks changed! 
Pre: {len(previous_chunks)} Post: {len(post_chunks)}" def test_get_missing_file_chunks(app_client, headers): diff --git a/redbox-core/redbox/storage/elasticsearch.py b/redbox-core/redbox/storage/elasticsearch.py index a1d6a73c8..387c7d0b2 100644 --- a/redbox-core/redbox/storage/elasticsearch.py +++ b/redbox-core/redbox/storage/elasticsearch.py @@ -14,17 +14,8 @@ log = logging.getLogger() -def get_query_match_all_for_user(user_uuid: UUID): - return { - "query": { - "bool": { - "should": [ - {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, - {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, - ] - } - } - } + + class ElasticsearchStorageHandler(BaseStorageHandler): @@ -102,11 +93,11 @@ def delete_items(self, items: list[PersistableModel]) -> ObjectApiResponse | Non body={"query": {"terms": {"_id": [str(item.uuid) for item in items]}}}, ) - def delete_user_items(self, model_type: str, user_uuid: UUID) -> ObjectApiResponse | None: + def delete_user_items(self, model_type: str, user_uuid: UUID, filters: list[dict] = None) -> ObjectApiResponse | None: target_index = f"{self.root_index}-{model_type.lower()}" return self.es_client.delete_by_query( index=target_index, - body=get_query_match_all_for_user(user_uuid), + body=ElasticsearchStorageHandler.get_query_match_all_for_user(user_uuid, filters), ) def read_all_items(self, model_type: str, user_uuid: UUID) -> list[PersistableModel]: @@ -115,7 +106,7 @@ def read_all_items(self, model_type: str, user_uuid: UUID) -> list[PersistableMo result = scan( client=self.es_client, index=target_index, - query=get_query_match_all_for_user(user_uuid), + query=ElasticsearchStorageHandler.get_query_match_all_for_user(user_uuid), _source=True, ) @@ -138,14 +129,14 @@ def read_all_items(self, model_type: str, user_uuid: UUID) -> list[PersistableMo log.exception("Validation exception for %s", item, exc_info=e) return items - def list_all_items(self, model_type: str, user_uuid: UUID) -> list[UUID]: + def list_all_items(self, model_type: str, user_uuid: UUID, filters: list[dict] = None) -> list[UUID]: target_index = f"{self.root_index}-{model_type.lower()}" try: # Only return _id results = scan( client=self.es_client, index=target_index, - query=get_query_match_all_for_user(user_uuid), + query=ElasticsearchStorageHandler.get_query_match_all_for_user(user_uuid, filters), _source=False, ) @@ -153,3 +144,26 @@ def list_all_items(self, model_type: str, user_uuid: UUID) -> list[UUID]: log.info("Index %s not found. 
Returning empty list.", target_index) return [] return [UUID(item["_id"]) for item in results] + + + @classmethod + def get_query_match_all_for_user(cls, user_uuid: UUID, filters: list[dict] = None): + query = { + "query": { + "bool": { + "should": [ + {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, + {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, + ] + }, + } + } + if filters: + query["query"]["bool"]["filter"] = filters + return query + + @classmethod + def get_with_parent_file_filter(cls, parent_file_uuid: UUID | str): + return{ + "term": {"metadata.parent_file_uuid.keyword": str(parent_file_uuid)}, + } \ No newline at end of file From 5105ef81dc638115a4d76877de1ed3d3b0e5a5bc Mon Sep 17 00:00:00 2001 From: jamesrichards Date: Fri, 2 Aug 2024 16:33:27 +0100 Subject: [PATCH 4/5] Removing integration tests for chunks --- tests/test_e2e.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 37f7a87b3..eddac847e 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -84,21 +84,6 @@ def test_get_file_status(self, user_uuid): if not embedding_complete: pytest.fail(reason=f"failed to get embedded chunks within {timeout} seconds, potential error: {error}") - @pytest.mark.parametrize("user_uuid", USER_UUIDS) - def test_get_file_chunks(self, user_uuid): - """ - Given that I have POSTed a file key to core-api/file - And the file status is complete - When I GET the file chunks - I Expect a 200 response code - """ - chunks_response = requests.get( - f"http://{TEST_ORIGIN}/file/{TestEndToEnd.file_uuids[user_uuid]}/chunks", - headers=make_headers(user_uuid), - timeout=30, - ) - assert chunks_response.status_code == HTTPStatus.OK - @pytest.mark.parametrize("user_uuid", USER_UUIDS) def test_post_rag(self, user_uuid): """ From e8c06fc6cb269077424185b35148ac7807794531 Mon Sep 17 00:00:00 2001 From: jamesrichards Date: Mon, 5 Aug 2024 11:01:53 +0100 Subject: [PATCH 5/5] Ruff --- core-api/core_api/routes/file.py | 9 +++--- core-api/tests/conftest.py | 18 ++++------- core-api/tests/routes/test_file.py | 33 +++++++++++---------- redbox-core/redbox/storage/elasticsearch.py | 13 ++++---- 4 files changed, 31 insertions(+), 42 deletions(-) diff --git a/core-api/core_api/routes/file.py b/core-api/core_api/routes/file.py index 83074b9b4..2953b9cd1 100644 --- a/core-api/core_api/routes/file.py +++ b/core-api/core_api/routes/file.py @@ -212,10 +212,7 @@ def delete_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uui tags=["file"], responses={404: {"model": APIError404, "description": "The file was not found"}}, ) -async def reingest_file( - file_uuid: UUID, - user_uuid: Annotated[UUID, Depends(get_user_uuid)] -) -> File: +async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> File: """Deletes exisiting file chunks and regenerates embeddings Args: @@ -241,7 +238,9 @@ async def reingest_file( storage_handler.update_item(file) # Remove old chunks - storage_handler.delete_user_items("chunk", user_uuid, filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)]) + storage_handler.delete_user_items( + "chunk", user_uuid, filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)] + ) # Add new chunks log.info("publishing %s", file.uuid) diff --git a/core-api/tests/conftest.py b/core-api/tests/conftest.py index 96d53b452..108948925 100644 --- a/core-api/tests/conftest.py +++ b/core-api/tests/conftest.py @@ -1,10 +1,8 @@ from pathlib import Path 
from uuid import UUID, uuid4 from datetime import UTC, datetime -import uuid import pytest -from botocore.exceptions import ClientError from elasticsearch import Elasticsearch from fastapi.testclient import TestClient from jose import jwt @@ -100,22 +98,15 @@ def stored_file_1(elasticsearch_storage_handler, file) -> File: def stored_user_files(elasticsearch_storage_handler) -> list[File]: user = uuid4() files = [ - File( - creator_user_uuid=user, - key="testfile1.txt", - bucket="local" - ), - File( - creator_user_uuid=user, - key="testfile2.txt", - bucket="local" - ) + File(creator_user_uuid=user, key="testfile1.txt", bucket="local"), + File(creator_user_uuid=user, key="testfile2.txt", bucket="local"), ] for file in files: elasticsearch_storage_handler.write_item(file) elasticsearch_storage_handler.refresh() return files + @pytest.fixture(scope="session") def embedding_model_dim() -> int: return 3072 # 3-large default size @@ -234,7 +225,7 @@ def stored_user_chunks(stored_user_files) -> list[list[Document]]: ) for i in range(5) ] - chunks_by_file.append(normal_chunks+large_chunks) + chunks_by_file.append(normal_chunks + large_chunks) return chunks_by_file @@ -249,6 +240,7 @@ def large_chunked_file(elasticsearch_store, stored_large_file_chunks, stored_fil elasticsearch_store.add_documents(stored_large_file_chunks) return stored_file_1 + @pytest.fixture() def chunked_user_files(elasticsearch_store, stored_user_chunks) -> File: for chunks in stored_user_chunks: diff --git a/core-api/tests/routes/test_file.py b/core-api/tests/routes/test_file.py index 3ed7c9ea1..9c552a841 100644 --- a/core-api/tests/routes/test_file.py +++ b/core-api/tests/routes/test_file.py @@ -10,6 +10,7 @@ from core_api.routes.file import env, router from redbox.storage.elasticsearch import ElasticsearchStorageHandler + @pytest.mark.asyncio() async def test_post_file_upload(app_client, file_pdf_path: Path, headers): """ @@ -20,17 +21,15 @@ async def test_post_file_upload(app_client, file_pdf_path: Path, headers): file_key = file_pdf_path.name - with file_pdf_path.open("rb") as f: - - async with TestRedisBroker(router.broker): - response = app_client.post( - "/file", - json={ - "key": file_key, - "bucket": env.bucket_name, - }, - headers=headers, - ) + async with TestRedisBroker(router.broker): + response = app_client.post( + "/file", + json={ + "key": file_key, + "bucket": env.bucket_name, + }, + headers=headers, + ) assert response.status_code == HTTPStatus.CREATED file = json.loads(response.content.decode("utf-8")) @@ -118,11 +117,11 @@ def test_reingest_file(app_client, chunked_user_files, stored_user_files, elasti previous_chunks_by_file = [ elasticsearch_storage_handler.list_all_items( - "chunk", - file.creator_user_uuid, - filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)] + "chunk", + file.creator_user_uuid, + filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)], ) - for file in stored_user_files + for file in stored_user_files ] response = app_client.put(f"/file/{test_file.uuid}", headers=headers_for_user) @@ -135,7 +134,9 @@ def test_reingest_file(app_client, chunked_user_files, stored_user_files, elasti for file, previous_chunks in zip(stored_user_files[1:], previous_chunks_by_file[1:]): post_chunks = elasticsearch_storage_handler.list_all_items("chunk", file.creator_user_uuid) - assert post_chunks == previous_chunks, f"Additional files had their chunks changed! 
Pre: {len(previous_chunks)} Post: {len(post_chunks)}" + assert ( + post_chunks == previous_chunks + ), f"Additional files had their chunks changed! Pre: {len(previous_chunks)} Post: {len(post_chunks)}" def test_get_missing_file_chunks(app_client, headers): diff --git a/redbox-core/redbox/storage/elasticsearch.py b/redbox-core/redbox/storage/elasticsearch.py index 387c7d0b2..94faffb3d 100644 --- a/redbox-core/redbox/storage/elasticsearch.py +++ b/redbox-core/redbox/storage/elasticsearch.py @@ -14,10 +14,6 @@ log = logging.getLogger() - - - - class ElasticsearchStorageHandler(BaseStorageHandler): """Storage Handler for Elasticsearch""" @@ -93,7 +89,9 @@ def delete_items(self, items: list[PersistableModel]) -> ObjectApiResponse | Non body={"query": {"terms": {"_id": [str(item.uuid) for item in items]}}}, ) - def delete_user_items(self, model_type: str, user_uuid: UUID, filters: list[dict] = None) -> ObjectApiResponse | None: + def delete_user_items( + self, model_type: str, user_uuid: UUID, filters: list[dict] = None + ) -> ObjectApiResponse | None: target_index = f"{self.root_index}-{model_type.lower()}" return self.es_client.delete_by_query( index=target_index, @@ -145,7 +143,6 @@ def list_all_items(self, model_type: str, user_uuid: UUID, filters: list[dict] = return [] return [UUID(item["_id"]) for item in results] - @classmethod def get_query_match_all_for_user(cls, user_uuid: UUID, filters: list[dict] = None): query = { @@ -164,6 +161,6 @@ def get_query_match_all_for_user(cls, user_uuid: UUID, filters: list[dict] = Non @classmethod def get_with_parent_file_filter(cls, parent_file_uuid: UUID | str): - return{ + return { "term": {"metadata.parent_file_uuid.keyword": str(parent_file_uuid)}, - } \ No newline at end of file + }
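
Reviewer note: taken together, these patches drop the per-file chunk helpers (get_file_chunks / delete_file_chunks) and scope chunk reads and deletes by user, optionally narrowed to a single file with an Elasticsearch term filter. A minimal sketch of the resulting call pattern follows; it assumes an already-configured handler, and the purge_file_chunks name is illustrative rather than part of this diff:

    from uuid import UUID

    from redbox.storage.elasticsearch import ElasticsearchStorageHandler


    def purge_file_chunks(handler: ElasticsearchStorageHandler, user_uuid: UUID, file_uuid: UUID) -> None:
        """Hypothetical helper mirroring what reingest_file now does before re-publishing."""
        # Scope the query to one file's chunks with the classmethod added in PATCH 3/5;
        # without this filter, delete_user_items removes every chunk the user owns.
        file_filter = [ElasticsearchStorageHandler.get_with_parent_file_filter(file_uuid)]

        # Both list_all_items and delete_user_items accept the optional filters argument.
        handler.delete_user_items("chunk", user_uuid, filters=file_filter)
        handler.refresh()  # make the delete_by_query visible to the next read, as the tests do

        assert not handler.list_all_items("chunk", user_uuid, filters=file_filter)

The same filters argument is what test_reingest_file uses to check that re-ingesting one file leaves the user's other files' chunks untouched.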