Merge pull request #909 from i-dot-ai/feature/remove-legacy-chunk-model
Remove chunk model
jamesrichards4 committed Aug 5, 2024
2 parents 15fe2a2 + e8c06fc commit 0c35176
Showing 12 changed files with 219 additions and 566 deletions.
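
Taken together, the route changes below swap the chunk-specific storage helpers for a generic per-user item API. A before/after summary, using only the calls that appear in the hunks of this diff:

# Before this PR (chunk-specific helpers, removed in this diff):
chunks = storage_handler.get_file_chunks(file.uuid, user_uuid)
storage_handler.delete_items(chunks)

# After this PR (generic helpers; the filter scopes the operation to one file's chunks):
storage_handler.delete_user_items(
    "chunk", user_uuid, filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)]
)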
40 changes: 5 additions & 35 deletions core-api/core_api/routes/file.py
@@ -11,7 +11,7 @@

from core_api.auth import get_user_uuid
from core_api.publisher_handler import FilePublisher
-from redbox.models import APIError404, Chunk, File, FileStatus, ProcessingStatusEnum, Settings
+from redbox.models import APIError404, File, FileStatus, ProcessingStatusEnum, Settings
from redbox.storage import ElasticsearchStorageHandler

# === Functions ===
@@ -202,8 +202,7 @@ def delete_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uui

    storage_handler.delete_item(file)

-    chunks = storage_handler.get_file_chunks(file.uuid, user_uuid)
-    storage_handler.delete_items(chunks)
+    storage_handler.delete_user_items("chunk", user_uuid)
    return file
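
The delete_user_items helper is new in this PR, but its implementation is not rendered in this commit view (the redbox-core storage module is among the files not shown). A minimal sketch of what it plausibly does, assuming the handler wraps an Elasticsearch client, keeps one index per model type, and records the owner under a creator_user_uuid field — all assumptions, not the repository's confirmed code:

# Hypothetical method-level sketch on ElasticsearchStorageHandler.
# self.es_client, self.root_index, and the field path are guesses.
def delete_user_items(self, model_type: str, user_uuid: UUID, filters: list[dict] | None = None) -> None:
    query = {
        "bool": {
            "must": [
                {"term": {"creator_user_uuid.keyword": str(user_uuid)}},
                *(filters or []),
            ]
        }
    }
    # delete_by_query removes every matching document in a single request
    self.es_client.delete_by_query(index=f"{self.root_index}-{model_type}", query=query)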


@@ -239,7 +238,9 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_
    storage_handler.update_item(file)

    # Remove old chunks
-    storage_handler.delete_file_chunks(file.uuid, user_uuid)
+    storage_handler.delete_user_items(
+        "chunk", user_uuid, filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)]
+    )

    # Add new chunks
    log.info("publishing %s", file.uuid)
@@ -248,37 +249,6 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_
    return file
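
ElasticsearchStorageHandler.get_with_parent_file_filter is likewise defined outside the rendered hunks. Judging from its use here and in the updated tests, it presumably builds an Elasticsearch term filter on the chunk's parent file UUID; a sketch under that assumption (the exact metadata field path is a guess):

# Hypothetical static helper; pairs with the filters= argument above.
@staticmethod
def get_with_parent_file_filter(parent_file_uuid: UUID) -> dict:
    return {"term": {"metadata.parent_file_uuid.keyword": str(parent_file_uuid)}}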


-@file_app.get(
-    "/{file_uuid}/chunks",
-    tags=["file"],
-    responses={404: {"model": APIError404, "description": "The file was not found"}},
-)
-def get_file_chunks(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> list[Chunk]:
-    """Gets a list of chunks for a file in the database
-    Args:
-        file_uuid (UUID): The UUID of the file to delete
-        user_uuid (UUID): The UUID of the user
-    Returns:
-        Chunks (list, Chunk): The chunks belonging to the requested file
-    Raises:
-        404: If the file isn't found, or the creator and requester don't match
-    """
-    try:
-        file = storage_handler.read_item(file_uuid, model_type="File")
-    except NotFoundError:
-        return file_not_found_response(file_uuid=file_uuid)
-
-    if file.creator_user_uuid != user_uuid:
-        return file_not_found_response(file_uuid=file_uuid)
-
-    log.info("getting chunks for file %s", file_uuid)
-
-    return storage_handler.get_file_chunks(file_uuid, user_uuid)


@file_app.get(
    "/{file_uuid}/status",
    tags=["file"],
89 changes: 62 additions & 27 deletions core-api/tests/conftest.py
@@ -3,7 +3,6 @@
from datetime import UTC, datetime

import pytest
-from botocore.exceptions import ClientError
from elasticsearch import Elasticsearch
from fastapi.testclient import TestClient
from jose import jwt
@@ -25,21 +24,6 @@ def env():
    return Settings()


-@pytest.fixture(scope="session")
-def s3_client(env):
-    _client = env.s3_client()
-    try:
-        _client.create_bucket(
-            Bucket=env.bucket_name,
-            CreateBucketConfiguration={"LocationConstraint": env.aws_region},
-        )
-    except ClientError as e:
-        if e.response["Error"]["Code"] != "BucketAlreadyOwnedByYou":
-            raise
-
-    return _client


@pytest.fixture(scope="session")
def es_client(env) -> Elasticsearch:
    return env.elasticsearch_client()
@@ -91,18 +75,8 @@ def elasticsearch_storage_handler(es_client, env):


@pytest.fixture()
-def file(s3_client, file_pdf_path: Path, alice, env) -> File:
+def file(file_pdf_path: Path, alice, env) -> File:
    file_name = file_pdf_path.name
-    file_type = file_pdf_path.suffix
-
-    with file_pdf_path.open("rb") as f:
-        s3_client.put_object(
-            Bucket=env.bucket_name,
-            Body=f.read(),
-            Key=file_name,
-            Tagging=f"file_type={file_type}",
-        )
-
    return File(key=file_name, bucket=env.bucket_name, creator_user_uuid=alice)


@@ -120,6 +94,19 @@ def stored_file_1(elasticsearch_storage_handler, file) -> File:
    return file


+@pytest.fixture()
+def stored_user_files(elasticsearch_storage_handler) -> list[File]:
+    user = uuid4()
+    files = [
+        File(creator_user_uuid=user, key="testfile1.txt", bucket="local"),
+        File(creator_user_uuid=user, key="testfile2.txt", bucket="local"),
+    ]
+    for file in files:
+        elasticsearch_storage_handler.write_item(file)
+    elasticsearch_storage_handler.refresh()
+    return files


@pytest.fixture(scope="session")
def embedding_model_dim() -> int:
    return 3072  # 3-large default size
@@ -201,6 +188,47 @@ def stored_large_file_chunks(stored_file_1) -> list[Document]:
    return normal_chunks + large_chunks


+@pytest.fixture()
+def stored_user_chunks(stored_user_files) -> list[list[Document]]:
+    chunks_by_file = []
+    for file in stored_user_files:
+        normal_chunks = [
+            Document(
+                page_content="hello",
+                metadata=ChunkMetadata(
+                    parent_file_uuid=str(file.uuid),
+                    index=i,
+                    file_name="test_file",
+                    creator_user_uuid=file.creator_user_uuid,
+                    page_number=4,
+                    created_datetime=datetime.now(UTC),
+                    token_count=4,
+                    chunk_resolution=ChunkResolution.normal,
+                ).model_dump(),
+            )
+            for i in range(25)
+        ]
+
+        large_chunks = [
+            Document(
+                page_content="hello" * 10,
+                metadata=ChunkMetadata(
+                    parent_file_uuid=str(file.uuid),
+                    index=i,
+                    file_name="test_file",
+                    creator_user_uuid=file.creator_user_uuid,
+                    page_number=4,
+                    created_datetime=datetime.now(UTC),
+                    token_count=20,
+                    chunk_resolution=ChunkResolution.largest,
+                ).model_dump(),
+            )
+            for i in range(5)
+        ]
+        chunks_by_file.append(normal_chunks + large_chunks)
+    return chunks_by_file


@pytest.fixture()
def chunked_file(elasticsearch_store: ElasticsearchStore, stored_file_chunks, stored_file_1) -> File:
    elasticsearch_store.add_documents(stored_file_chunks)
@@ -213,6 +241,13 @@ def large_chunked_file(elasticsearch_store, stored_large_file_chunks, stored_fil
    return stored_file_1


+@pytest.fixture()
+def chunked_user_files(elasticsearch_store, stored_user_chunks) -> list[list[Document]]:
+    for chunks in stored_user_chunks:
+        elasticsearch_store.add_documents(chunks)
+    return stored_user_chunks


@pytest.fixture()
def file_pdf_path() -> Path:
    return Path(__file__).parents[2] / "tests" / "data" / "pdf" / "Cabinet Office - Wikipedia.pdf"
Expand Down
77 changes: 38 additions & 39 deletions core-api/tests/routes/test_file.py
@@ -1,16 +1,18 @@
import json
from http import HTTPStatus
from pathlib import Path
+from jose import jwt

import pytest
from elasticsearch import NotFoundError
from faststream.redis import TestRedisBroker

from core_api.routes.file import env, router
+from redbox.storage.elasticsearch import ElasticsearchStorageHandler


@pytest.mark.asyncio()
-async def test_post_file_upload(s3_client, app_client, file_pdf_path: Path, headers):
+async def test_post_file_upload(app_client, file_pdf_path: Path, headers):
    """
    Given a new file
    When I POST it to /file
@@ -19,23 +21,15 @@ async def test_post_file_upload(s3_client, app_client, file_pdf_path: Path, head

    file_key = file_pdf_path.name

-    with file_pdf_path.open("rb") as f:
-        s3_client.upload_fileobj(
-            Bucket=env.bucket_name,
-            Fileobj=f,
-            Key=file_key,
-            ExtraArgs={"Tagging": "file_type=pdf"},
-        )
-
-        async with TestRedisBroker(router.broker):
-            response = app_client.post(
-                "/file",
-                json={
-                    "key": file_key,
-                    "bucket": env.bucket_name,
-                },
-                headers=headers,
-            )
+    async with TestRedisBroker(router.broker):
+        response = app_client.post(
+            "/file",
+            json={
+                "key": file_key,
+                "bucket": env.bucket_name,
+            },
+            headers=headers,
+        )
    assert response.status_code == HTTPStatus.CREATED

    file = json.loads(response.content.decode("utf-8"))
@@ -77,16 +71,15 @@ def test_get_missing_file(app_client, headers):
    assert response.status_code == HTTPStatus.NOT_FOUND


-def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, chunked_file, headers):
+def test_delete_file(app_client, elasticsearch_storage_handler, chunked_file, headers):
    """
    Given a previously saved file
    When I DELETE it to /file
    I Expect to see it removed from s3 and elastic-search, including the chunks
    """
    # check assets exist
-    assert s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.key)
    assert elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file")
-    assert elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid)
+    assert elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid)

    response = app_client.delete(f"/file/{chunked_file.uuid}", headers=headers)
    assert response.status_code == HTTPStatus.OK
@@ -98,7 +91,7 @@ def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, chunk
    with pytest.raises(NotFoundError):
        elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file")

-    assert not elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid)
+    assert not elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid)
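
list_all_items is also defined outside this commit view. Judging by its use here and in test_reingest_file below, it presumably runs a filtered search scoped to one user and model type; a sketch under those assumptions (index naming, field paths, and the fixed result size are all guesses):

# Hypothetical method-level sketch; not the repository's confirmed implementation.
def list_all_items(self, model_type: str, user_uuid: UUID, filters: list[dict] | None = None) -> list[dict]:
    query = {
        "bool": {
            "must": [
                {"term": {"creator_user_uuid.keyword": str(user_uuid)}},
                *(filters or []),
            ]
        }
    }
    response = self.es_client.search(index=f"{self.root_index}-{model_type}", query=query, size=10_000)
    return [hit["_source"] for hit in response["hits"]["hits"]]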


def test_delete_missing_file(app_client, headers):
@@ -111,33 +104,39 @@ def test_delete_missing_file(app_client, headers):
    assert response.status_code == HTTPStatus.NOT_FOUND


-def test_reingest_file(app_client, chunked_file, elasticsearch_storage_handler, headers):
+def test_reingest_file(app_client, chunked_user_files, stored_user_files, elasticsearch_storage_handler):
    """
    Given a previously chunked file
    When I PUT it to /file/uuid/
    I Expect the old chunks to be removed
    """
-    previous_chunks = elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid)
+    test_file = stored_user_files[0]

-    response = app_client.put(f"/file/{chunked_file.uuid}", headers=headers)
-    assert response.status_code == HTTPStatus.OK
+    bearer_token = jwt.encode({"user_uuid": str(test_file.creator_user_uuid)}, key="nvjkernd")
+    headers_for_user = {"Authorization": f"Bearer {bearer_token}"}

-    elasticsearch_storage_handler.refresh()
-    assert (
-        elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid)
-        != previous_chunks
-    )
+    previous_chunks_by_file = [
+        elasticsearch_storage_handler.list_all_items(
+            "chunk",
+            file.creator_user_uuid,
+            filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)],
+        )
+        for file in stored_user_files
+    ]
+
+    response = app_client.put(f"/file/{test_file.uuid}", headers=headers_for_user)
+    assert response.status_code == HTTPStatus.OK, f"Error response: [{response.status_code}] {response.text}"


-def test_get_file_chunks(app_client, chunked_file, headers):
-    """
-    Given a previously chunked file
-    When I GET it from /file/uuid/chunks
-    I Expect to receive the chunks
-    """
-    response = app_client.get(f"/file/{chunked_file.uuid}/chunks", headers=headers)
-    assert response.status_code == HTTPStatus.OK
-    assert len(response.json()) == 12
+    elasticsearch_storage_handler.refresh()
+    assert (
+        elasticsearch_storage_handler.list_all_items("chunk", test_file.creator_user_uuid) != previous_chunks_by_file[0]
+    ), f"Pre and post chunks matched and both had {len(previous_chunks_by_file[0])} chunks"
+
+    for file, previous_chunks in zip(stored_user_files[1:], previous_chunks_by_file[1:]):
+        post_chunks = elasticsearch_storage_handler.list_all_items("chunk", file.creator_user_uuid)
+        assert (
+            post_chunks == previous_chunks
+        ), f"Additional files had their chunks changed! Pre: {len(previous_chunks)} Post: {len(post_chunks)}"
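
One detail worth noting in the rewritten test: it calls elasticsearch_storage_handler.refresh() before asserting, since Elasticsearch writes are not visible to search until an index refresh. The helper is presumably a thin wrapper, something like this sketch (the index pattern is an assumption):

# Hypothetical one-liner on the storage handler.
def refresh(self) -> None:
    self.es_client.indices.refresh(index=f"{self.root_index}-*")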


def test_get_missing_file_chunks(app_client, headers):
4 changes: 0 additions & 4 deletions redbox-core/redbox/models/__init__.py
@@ -11,8 +11,6 @@
    APIErrorResponse,
)
from redbox.models.file import (
-    Chunk,
-    ChunkStatus,
    File,
    FileStatus,
    ProcessingStatusEnum,
@@ -29,8 +27,6 @@
    "ChatRequest",
    "ChatResponse",
    "ChatRoute",
-    "Chunk",
-    "ChunkStatus",
    "EmbedQueueItem",
    "EmbeddingModelInfo",
    "EmbeddingResponse",

