From e0a111ec8cc7ff4698c1a69e2be441488ab7c62d Mon Sep 17 00:00:00 2001 From: jamesrichards Date: Fri, 2 Aug 2024 15:31:05 +0100 Subject: [PATCH 1/5] Removed chunk and associated models. Removed get chunks file endpoint. Removed chunk related actions from storage_handler --- core-api/core_api/routes/file.py | 39 +---- core-api/tests/routes/test_file.py | 21 +-- redbox-core/redbox/models/__init__.py | 4 - redbox-core/redbox/models/file.py | 106 +----------- redbox-core/redbox/storage/elasticsearch.py | 132 ++------------ redbox-core/redbox/storage/storage_handler.py | 8 +- redbox-core/tests/conftest.py | 72 +++----- redbox-core/tests/models/__init__.py | 0 redbox-core/tests/models/test_file.py | 55 ------ .../tests/storage/test_elasticsearch.py | 162 ++++++------------ 10 files changed, 107 insertions(+), 492 deletions(-) delete mode 100644 redbox-core/tests/models/__init__.py delete mode 100644 redbox-core/tests/models/test_file.py diff --git a/core-api/core_api/routes/file.py b/core-api/core_api/routes/file.py index 0263e592c..408b23af9 100644 --- a/core-api/core_api/routes/file.py +++ b/core-api/core_api/routes/file.py @@ -11,7 +11,7 @@ from core_api.auth import get_user_uuid from core_api.publisher_handler import FilePublisher -from redbox.models import APIError404, Chunk, File, FileStatus, ProcessingStatusEnum, Settings +from redbox.models import APIError404, File, FileStatus, ProcessingStatusEnum, Settings from redbox.storage import ElasticsearchStorageHandler # === Functions === @@ -202,8 +202,7 @@ def delete_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uui storage_handler.delete_item(file) - chunks = storage_handler.get_file_chunks(file.uuid, user_uuid) - storage_handler.delete_items(chunks) + storage_handler.delete_user_items("chunk", user_uuid) return file @@ -239,7 +238,7 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_ storage_handler.update_item(file) # Remove old chunks - storage_handler.delete_file_chunks(file.uuid, user_uuid) + storage_handler.delete_user_items("chunk", user_uuid) # Add new chunks log.info("publishing %s", file.uuid) @@ -247,38 +246,6 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_ return file - -@file_app.get( - "/{file_uuid}/chunks", - tags=["file"], - responses={404: {"model": APIError404, "description": "The file was not found"}}, -) -def get_file_chunks(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> list[Chunk]: - """Gets a list of chunks for a file in the database - - Args: - file_uuid (UUID): The UUID of the file to delete - user_uuid (UUID): The UUID of the user - - Returns: - Chunks (list, Chunk): The chunks belonging to the requested file - - Raises: - 404: If the file isn't found, or the creator and requester don't match - """ - try: - file = storage_handler.read_item(file_uuid, model_type="File") - except NotFoundError: - return file_not_found_response(file_uuid=file_uuid) - - if file.creator_user_uuid != user_uuid: - return file_not_found_response(file_uuid=file_uuid) - - log.info("getting chunks for file %s", file_uuid) - - return storage_handler.get_file_chunks(file_uuid, user_uuid) - - @file_app.get( "/{file_uuid}/status", tags=["file"], diff --git a/core-api/tests/routes/test_file.py b/core-api/tests/routes/test_file.py index 07f3c6896..5507f27bf 100644 --- a/core-api/tests/routes/test_file.py +++ b/core-api/tests/routes/test_file.py @@ -86,7 +86,7 @@ def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, 
chunk # check assets exist assert s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.key) assert elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file") - assert elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid) + assert elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) response = app_client.delete(f"/file/{chunked_file.uuid}", headers=headers) assert response.status_code == HTTPStatus.OK @@ -98,7 +98,7 @@ def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, chunk with pytest.raises(NotFoundError): elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file") - assert not elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid) + assert not elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) def test_delete_missing_file(app_client, headers): @@ -117,27 +117,16 @@ def test_reingest_file(app_client, chunked_file, elasticsearch_storage_handler, When I PUT it to /file/uuid/ I Expect the old chunks to be removed """ - previous_chunks = elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid) + previous_chunks = elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) response = app_client.put(f"/file/{chunked_file.uuid}", headers=headers) assert response.status_code == HTTPStatus.OK elasticsearch_storage_handler.refresh() assert ( - elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid) + elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) != previous_chunks - ) - - -def test_get_file_chunks(app_client, chunked_file, headers): - """ - Given a previously chunked file - When I GET it from /file/uuid/chunks - I Expect to receive the chunks - """ - response = app_client.get(f"/file/{chunked_file.uuid}/chunks", headers=headers) - assert response.status_code == HTTPStatus.OK - assert len(response.json()) == 12 + ), f"Pre and post chunks matched and both had {len(previous_chunks)} chunks" def test_get_missing_file_chunks(app_client, headers): diff --git a/redbox-core/redbox/models/__init__.py b/redbox-core/redbox/models/__init__.py index 79b7c0b96..107afa539 100644 --- a/redbox-core/redbox/models/__init__.py +++ b/redbox-core/redbox/models/__init__.py @@ -11,8 +11,6 @@ APIErrorResponse, ) from redbox.models.file import ( - Chunk, - ChunkStatus, File, FileStatus, ProcessingStatusEnum, @@ -29,8 +27,6 @@ "ChatRequest", "ChatResponse", "ChatRoute", - "Chunk", - "ChunkStatus", "EmbedQueueItem", "EmbeddingModelInfo", "EmbeddingResponse", diff --git a/redbox-core/redbox/models/file.py b/redbox-core/redbox/models/file.py index e7ca30dc9..4c2851dda 100644 --- a/redbox-core/redbox/models/file.py +++ b/redbox-core/redbox/models/file.py @@ -1,12 +1,11 @@ from __future__ import annotations -import hashlib from enum import Enum, StrEnum from uuid import UUID import datetime import tiktoken -from pydantic import BaseModel, Field, computed_field +from pydantic import BaseModel, Field from redbox.models.base import PersistableModel @@ -41,108 +40,7 @@ def __le__(self, other: Link): def __hash__(self): return hash(self.text) ^ hash(self.url) ^ hash(self.start_index) - - -class Metadata(BaseModel): - """this is a pydantic model for the unstructured Metadata class - uncomment fields below and update merge as required""" - - 
parent_doc_uuid: UUID | None = Field( - description="this field is not actually part of unstructured Metadata but is required by langchain", - default=None, - ) - - # attached_to_filename: Optional[str] = None - # category_depth: Optional[int] = None - # coordinates: Optional[CoordinatesMetadata] = None - # data_source: Optional[DataSourceMetadata] = None - # detection_class_prob: Optional[float] = None - # emphasized_text_contents: Optional[list[str]] = None - # emphasized_text_tags: Optional[list[str]] = None - # file_directory: Optional[str] = None - # filename: Optional[str | pathlib.Path] = None - # filetype: Optional[str] = None - # header_footer_type: Optional[str] = None - # image_path: Optional[str] = None - # is_continuation: Optional[bool] = None - languages: list[str] | None = None - # last_modified: Optional[str] = None - link_texts: list[str] | None = None - link_urls: list[str] | None = None - links: list[Link] | None = None - # orig_elements: Optional[list[Element]] = None - # page_name: Optional[str] = None - page_number: int | list[int] | None = None - # parent_id: Optional[UUID] = None - # regex_metadata: Optional[dict[str, list[RegexMetadata]]] = None - # section: Optional[str] = None - # sent_from: Optional[list[str]] = None - # sent_to: Optional[list[str]] = None - # signature: Optional[str] = None - # subject: Optional[str] = None - # text_as_html: Optional[str] = None - # url: Optional[str] = None - - @classmethod - def merge(cls, left: Metadata | None, right: Metadata | None) -> Metadata | None: - if not left: - return right - if not right: - return left - - def listify(obj, field_name: str) -> list: - field_value = getattr(obj, field_name, None) - if isinstance(field_value, list): - return field_value - if field_value is None: - return [] - return [field_value] - - def sorted_list_or_none(obj: list): - return sorted(set(obj)) or None - - data = { - field_name: sorted_list_or_none(listify(left, field_name) + listify(right, field_name)) - for field_name in cls.model_fields - } - - if parent_doc_uuids := data.get("parent_doc_uuid"): - parent_doc_uuids_without_none = [uuid for uuid in parent_doc_uuids if uuid] - if len(parent_doc_uuids) > 1: - message = "chunks do not have the same parent_doc_uuid" - raise ValueError(message) - data["parent_doc_uuid"] = parent_doc_uuids_without_none[0] - return cls(**data) - - -class Chunk(PersistableModel): - """Chunk of a File""" - - parent_file_uuid: UUID = Field(description="id of the original file which this text came from") - index: int = Field(description="relative position of this chunk in the original file") - text: str = Field(description="chunk of the original text") - metadata: Metadata | dict | None = Field( - description="subset of the unstructured Element.Metadata object", default=None - ) - embedding: list[float] | None = Field(description="the vector representation of the text", default=None) - - @computed_field # type: ignore[misc] # Remove if https://github.com/python/mypy/issues/1362 is fixed. - @property # Needed for type checking - see https://docs.pydantic.dev/2.0/usage/computed_fields/ - def text_hash(self) -> str: - return hashlib.md5(self.text.encode(encoding="UTF-8", errors="strict"), usedforsecurity=False).hexdigest() - - @computed_field # type: ignore[misc] # Remove if https://github.com/python/mypy/issues/1362 is fixed. 
- @property # Needed for type checking - see https://docs.pydantic.dev/2.0/usage/computed_fields/ - def token_count(self) -> int: - return len(encoding.encode(self.text)) - - -class ChunkStatus(BaseModel): - """Status of a chunk of a file.""" - - chunk_uuid: UUID - embedded: bool - + class FileStatus(BaseModel): """Status of a file.""" diff --git a/redbox-core/redbox/storage/elasticsearch.py b/redbox-core/redbox/storage/elasticsearch.py index 0708cc296..4da0e2a3e 100644 --- a/redbox-core/redbox/storage/elasticsearch.py +++ b/redbox-core/redbox/storage/elasticsearch.py @@ -8,45 +8,26 @@ from elasticsearch.helpers import scan from pydantic import ValidationError -from redbox.models import Chunk, FileStatus, Settings +from redbox.models import Settings from redbox.models.base import PersistableModel from redbox.storage.storage_handler import BaseStorageHandler logging.basicConfig(level=logging.INFO) log = logging.getLogger() -env = Settings() - -def build_chunk_query(parent_file_uuid: UUID, user_uuid: UUID) -> dict: - query = { +def get_query_match_all_for_user(user_uuid: UUID): + return { "query": { "bool": { - "must": [ - { - "bool": { - "should": [ - {"term": {"parent_file_uuid.keyword": str(parent_file_uuid)}}, - {"term": {"metadata.parent_file_uuid.keyword": str(parent_file_uuid)}}, - ] - } - }, - { - "bool": { - "should": [ - {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, - {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, - ] - } - }, + "should": [ + {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, + {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, ] } } } - return query - - class ElasticsearchStorageHandler(BaseStorageHandler): """Storage Handler for Elasticsearch""" @@ -79,6 +60,8 @@ def write_item(self, item: PersistableModel) -> ObjectApiResponse: def write_items(self, items: Sequence[PersistableModel]) -> Sequence[ObjectApiResponse]: return list(map(self.write_item, items)) + + def read_item(self, item_uuid: UUID, model_type: str): target_index = f"{self.root_index}-{model_type.lower()}" result = self.es_client.get(index=target_index, id=str(item_uuid)) @@ -121,6 +104,13 @@ def delete_items(self, items: list[PersistableModel]) -> ObjectApiResponse | Non index=target_index, body={"query": {"terms": {"_id": [str(item.uuid) for item in items]}}}, ) + + def delete_user_items(self, model_type: str, user_uuid: UUID) -> ObjectApiResponse | None: + target_index = f"{self.root_index}-{model_type.lower()}" + return self.es_client.delete_by_query( + index=target_index, + body=get_query_match_all_for_user(user_uuid), + ) def read_all_items(self, model_type: str, user_uuid: UUID) -> list[PersistableModel]: target_index = f"{self.root_index}-{model_type.lower()}" @@ -128,16 +118,7 @@ def read_all_items(self, model_type: str, user_uuid: UUID) -> list[PersistableMo result = scan( client=self.es_client, index=target_index, - query={ - "query": { - "bool": { - "should": [ - {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, - {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, - ] - } - } - }, + query=get_query_match_all_for_user(user_uuid), _source=True, ) @@ -167,88 +148,11 @@ def list_all_items(self, model_type: str, user_uuid: UUID) -> list[UUID]: results = scan( client=self.es_client, index=target_index, - query={ - "query": { - "bool": { - "should": [ - {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, - {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, - ] - } - } - }, + 
query=get_query_match_all_for_user(user_uuid), _source=False, ) except NotFoundError: log.info("Index %s not found. Returning empty list.", target_index) return [] - return [UUID(item["_id"]) for item in results] - - def get_file_chunks(self, parent_file_uuid: UUID, user_uuid: UUID) -> list[Chunk]: - """get chunks for a given file""" - target_index = f"{self.root_index}-chunk" - - return [ - hit_to_chunk(item) - for item in scan( - client=self.es_client, - index=target_index, - query=build_chunk_query(parent_file_uuid, user_uuid), - ) - ] - - def delete_file_chunks(self, parent_file_uuid: UUID, user_uuid: UUID): - """delete chunks for a given file""" - target_index = f"{self.root_index}-chunk" - - self.es_client.delete_by_query( - index=target_index, - body=build_chunk_query(parent_file_uuid, user_uuid), - ) - - def get_file_status(self, file_uuid: UUID, user_uuid: UUID) -> FileStatus: - """Get the status of a file and associated Chunks - - Args: - file_uuid (UUID): The UUID of the file to get the status of - user_uuid (UUID): the UUID of the user - - Returns: - FileStatus: The status of the file - """ - - # Test 1: Get the file - try: - file = self.read_item(file_uuid, "File") - except NotFoundError as e: - log.exception("file/%s not found", file_uuid) - message = f"File {file_uuid} not found" - raise ValueError(message) from e - if file.creator_user_uuid != user_uuid: - log.error("file/%s.%s not owned by %s", file_uuid, file.creator_user_uuid, user_uuid) - message = f"File {file_uuid} not found" - raise ValueError(message) - - return FileStatus( - file_uuid=file_uuid, - processing_status=file.ingest_status, - ) - - -def hit_to_chunk(hit: dict[str, Any]) -> Chunk: - if hit["_source"].get("uuid"): - # Legacy direct chunk storage - return Chunk(**hit["_source"]) - else: - # Document storage - return Chunk( - uuid=hit["_id"], - text=hit["_source"]["text"], - index=hit["_source"]["metadata"]["index"], - embedding=hit["_source"].get(env.embedding_document_field_name), - created_datetime=hit["_source"]["metadata"]["created_datetime"], - creator_user_uuid=hit["_source"]["metadata"]["creator_user_uuid"], - parent_file_uuid=hit["_source"]["metadata"]["parent_file_uuid"], - metadata=hit["_source"]["metadata"], - ) + return [UUID(item["_id"]) for item in results] \ No newline at end of file diff --git a/redbox-core/redbox/storage/storage_handler.py b/redbox-core/redbox/storage/storage_handler.py index f794bd94f..d14e3bafe 100644 --- a/redbox-core/redbox/storage/storage_handler.py +++ b/redbox-core/redbox/storage/storage_handler.py @@ -2,7 +2,7 @@ from typing import ClassVar from uuid import UUID -from redbox.models import Chunk, File +from redbox.models import File from redbox.models.base import PersistableModel @@ -12,7 +12,7 @@ class BaseStorageHandler(ABC): """ # dict comprehension for lowercase class name to class - model_type_map: ClassVar = {v.__name__.lower(): v for v in [Chunk, File]} + model_type_map: ClassVar = {v.__name__.lower(): v for v in [File]} def get_model_by_model_type(self, model_type): return self.model_type_map[model_type.lower()] @@ -60,7 +60,3 @@ def list_all_items(self, model_type: str, user_uuid: UUID): @abstractmethod def read_all_items(self, model_type: str, user_uuid: UUID): """Read all objects of a given type from a data store""" - - @abstractmethod - def get_file_chunks(self, parent_file_uuid: UUID, user_uuid: UUID) -> list[Chunk]: - """get chunks for a given file""" diff --git a/redbox-core/tests/conftest.py b/redbox-core/tests/conftest.py index 7e98a2032..f496cd1c5 
100644 --- a/redbox-core/tests/conftest.py +++ b/redbox-core/tests/conftest.py @@ -4,7 +4,7 @@ import pytest from elasticsearch import Elasticsearch -from redbox.models import Chunk, File, Settings +from redbox.models import File, Settings from redbox.storage.elasticsearch import ElasticsearchStorageHandler from collections.abc import Generator @@ -39,51 +39,39 @@ def claire(): @pytest.fixture() -def file_belonging_to_alice(file_pdf_path, alice, env) -> File: - return File( +def file_belonging_to_alice(file_pdf_path, alice, env, elasticsearch_storage_handler: ElasticsearchStorageHandler) -> File: + f = File( key=file_pdf_path.name, bucket=env.bucket_name, creator_user_uuid=alice, ) + elasticsearch_storage_handler.write_item(f) + elasticsearch_storage_handler.refresh() + return f @pytest.fixture() -def file_belonging_to_bob(file_pdf_path, bob, env) -> File: - return File( +def file_belonging_to_bob(file_pdf_path, bob, env, elasticsearch_storage_handler: ElasticsearchStorageHandler) -> File: + f = File( key=file_pdf_path.name, bucket=env.bucket_name, creator_user_uuid=bob, ) + elasticsearch_storage_handler.write_item(f) + elasticsearch_storage_handler.refresh() + return f @pytest.fixture() -def chunk_belonging_to_alice(file_belonging_to_alice) -> Chunk: - return Chunk( - creator_user_uuid=file_belonging_to_alice.creator_user_uuid, - parent_file_uuid=file_belonging_to_alice.uuid, - index=1, - text="hello, i am Alice!", - ) - - -@pytest.fixture() -def chunk_belonging_to_bob(file_belonging_to_bob) -> Chunk: - return Chunk( - creator_user_uuid=file_belonging_to_bob.creator_user_uuid, - parent_file_uuid=file_belonging_to_bob.uuid, - index=1, - text="hello, i am Bob!", - ) - - -@pytest.fixture() -def chunk_belonging_to_claire(claire) -> Chunk: - return Chunk( +def file_belonging_to_claire(file_pdf_path, claire, env, elasticsearch_storage_handler: ElasticsearchStorageHandler) -> File: + f = File( + key=file_pdf_path.name, + bucket=env.bucket_name, creator_user_uuid=claire, - parent_file_uuid=uuid4(), - index=1, - text="hello, i am Claire!", ) + elasticsearch_storage_handler.write_item(f) + elasticsearch_storage_handler.refresh() + return f @pytest.fixture @@ -91,20 +79,6 @@ def file_pdf_path() -> Path: return Path(__file__).parents[2] / "tests" / "data" / "pdf" / "Cabinet Office - Wikipedia.pdf" -@pytest.fixture() -def stored_chunk_belonging_to_alice(elasticsearch_storage_handler, chunk_belonging_to_alice) -> Chunk: - elasticsearch_storage_handler.write_item(item=chunk_belonging_to_alice) - elasticsearch_storage_handler.refresh() - return chunk_belonging_to_alice - - -@pytest.fixture() -def stored_chunk_belonging_to_bob(elasticsearch_storage_handler, chunk_belonging_to_bob) -> Chunk: - elasticsearch_storage_handler.write_item(item=chunk_belonging_to_bob) - elasticsearch_storage_handler.refresh() - return chunk_belonging_to_bob - - @pytest.fixture() def elasticsearch_client(env) -> Elasticsearch: return env.elasticsearch_client() @@ -120,13 +94,21 @@ def es_index(env) -> str: return f"{env.elastic_root_index}-chunk" +@pytest.fixture(scope="session") +def es_index_file(env) -> str: + return f"{env.elastic_root_index}-file" + + @pytest.fixture(autouse=True, scope="session") -def create_index(env, es_index): +def create_index(env, es_index, es_index_file): es = env.elasticsearch_client() if not es.indices.exists(index=es_index): es.indices.create(index=es_index) + if not es.indices.exists(index=es_index_file): + es.indices.create(index=es_index_file) yield es.indices.delete(index=es_index) + 
es.indices.delete(index=es_index_file) @pytest.fixture() diff --git a/redbox-core/tests/models/__init__.py b/redbox-core/tests/models/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/redbox-core/tests/models/test_file.py b/redbox-core/tests/models/test_file.py deleted file mode 100644 index bcd3842f5..000000000 --- a/redbox-core/tests/models/test_file.py +++ /dev/null @@ -1,55 +0,0 @@ -import uuid - -import pytest - -from redbox.models.file import Link, Metadata - -UUID_1 = uuid.UUID("7c66416e-eff7-441f-aafc-6f06e62fb4ec") -UUID_2 = uuid.UUID("b7aadcce-806c-4dc6-8b95-d2477d717560") - - -def test_merge_pass(): - left = Metadata( - languages=["en"], - page_number=1, - links=[Link(text="text", start_index=1, url="http://url")], - parent_doc_uuid=UUID_1, - ) - right = Metadata(languages=["en", "fr"], page_number=[2, 3]) - - expected = Metadata( - languages=["en", "fr"], - links=[Link(text="text", url="http://url", start_index=1)], - page_number=[1, 2, 3], - parent_doc_uuid=UUID_1, - ) - actual = Metadata.merge(left, right) - - assert actual == expected - - -def test_merge_pass_same_parent_doc_uuid(): - left = Metadata(parent_doc_uuid=UUID_1) - right = Metadata(parent_doc_uuid=UUID_1) - - expected = Metadata(parent_doc_uuid=UUID_1) - actual = Metadata.merge(left, right) - - assert actual == expected - - -def test_merge_pass_one_empty_parent_doc_uuid(): - left = Metadata(parent_doc_uuid=UUID_1) - right = Metadata() - - expected = Metadata(parent_doc_uuid=UUID_1) - actual = Metadata.merge(left, right) - - assert actual == expected - - -def test_merge_pass_two_parent_doc_uuid(): - left = Metadata(parent_doc_uuid=UUID_1) - right = Metadata(parent_doc_uuid=UUID_2) - with pytest.raises(ValueError, match="chunks do not have the same parent_doc_uuid"): - Metadata.merge(left, right) diff --git a/redbox-core/tests/storage/test_elasticsearch.py b/redbox-core/tests/storage/test_elasticsearch.py index 86f219c73..00d6e297f 100644 --- a/redbox-core/tests/storage/test_elasticsearch.py +++ b/redbox-core/tests/storage/test_elasticsearch.py @@ -3,7 +3,7 @@ import pytest from elasticsearch import NotFoundError -from redbox.models import Chunk +from redbox.models import File from redbox.storage.elasticsearch import ElasticsearchStorageHandler @@ -21,35 +21,23 @@ def test_elasticsearch_client_connection(elasticsearch_client, elasticsearch_sto assert isinstance(elasticsearch_storage_handler.model_type_map, dict) -def test_elasticsearch_write_read_item(elasticsearch_storage_handler, chunk_belonging_to_alice): +def test_elasticsearch_write_read_item(elasticsearch_storage_handler, file_belonging_to_alice): """ - Given that `Chunk` is a valid model + Given that `File` is a valid model When I - Then I expect a valid Chunk to be returned on read" + Then I expect a valid File to be returned on read" """ - # Write the chunk - elasticsearch_storage_handler.write_item(item=chunk_belonging_to_alice) + # Write the file + elasticsearch_storage_handler.write_item(item=file_belonging_to_alice) - # Read the chunk - chunk_read = elasticsearch_storage_handler.read_item(chunk_belonging_to_alice.uuid, "Chunk") + # Read the File + item_read = elasticsearch_storage_handler.read_item(file_belonging_to_alice.uuid, "File") - assert chunk_read.uuid == chunk_belonging_to_alice.uuid - - -def test_elastic_read_item(elasticsearch_storage_handler, stored_chunk_belonging_to_alice): - read_chunk = elasticsearch_storage_handler.read_item(stored_chunk_belonging_to_alice.uuid, "Chunk") - assert read_chunk.uuid == 
stored_chunk_belonging_to_alice.uuid - assert read_chunk.parent_file_uuid == stored_chunk_belonging_to_alice.parent_file_uuid - assert read_chunk.index == stored_chunk_belonging_to_alice.index - assert read_chunk.text == stored_chunk_belonging_to_alice.text - assert read_chunk.metadata == stored_chunk_belonging_to_alice.metadata - assert read_chunk.creator_user_uuid == stored_chunk_belonging_to_alice.creator_user_uuid - assert read_chunk.token_count == stored_chunk_belonging_to_alice.token_count + assert item_read.uuid == file_belonging_to_alice.uuid def test_elastic_delete_item_fail( elasticsearch_storage_handler: ElasticsearchStorageHandler, - chunk_belonging_to_bob, ): """ Given that I have an non-existent item uuid @@ -57,7 +45,7 @@ def test_elastic_delete_item_fail( Then I expect to see a NotFoundError error raised """ with pytest.raises(NotFoundError): - elasticsearch_storage_handler.delete_item(chunk_belonging_to_bob) + elasticsearch_storage_handler.delete_item(File(uuid=uuid4(), creator_user_uuid=uuid4(), key="", bucket="")) def test_elastic_read_item_fail( @@ -69,7 +57,7 @@ def test_elastic_read_item_fail( Then I expect to see a NotFoundError error raised """ with pytest.raises(NotFoundError): - elasticsearch_storage_handler.read_item(uuid4(), "Chunk") + elasticsearch_storage_handler.read_item(uuid4(), "File") def test_elastic_write_read_delete_items(elasticsearch_storage_handler): @@ -79,138 +67,88 @@ def test_elastic_write_read_delete_items(elasticsearch_storage_handler): Then I expect to see them written to the database """ creator_user_uuid = uuid4() - chunks = [ - Chunk( + files = [ + File( creator_user_uuid=creator_user_uuid, - parent_file_uuid=uuid4(), - index=i, - text="test_text", + key=f"somefile-{i}.txt", + bucket="a-bucket" ) for i in range(10) ] - elasticsearch_storage_handler.write_items(chunks) + elasticsearch_storage_handler.write_items(files) - read_chunks = elasticsearch_storage_handler.read_items([chunk.uuid for chunk in chunks], "Chunk") + read_files = elasticsearch_storage_handler.read_items([file.uuid for file in files], "File") - assert read_chunks == chunks + assert read_files == files - # Delete the chunks - elasticsearch_storage_handler.delete_items(chunks) + # Delete the files + elasticsearch_storage_handler.delete_items(files) - # Check that the chunks are deleted - items_left = elasticsearch_storage_handler.list_all_items("Chunk", creator_user_uuid) + # Check that the files are deleted + items_left = elasticsearch_storage_handler.list_all_items("File", creator_user_uuid) - assert all(chunk.uuid not in items_left for chunk in chunks) + assert all(file.uuid not in items_left for file in files) def test_list_all_items( elasticsearch_storage_handler: ElasticsearchStorageHandler, - stored_chunk_belonging_to_alice: Chunk, - stored_chunk_belonging_to_bob: Chunk, - chunk_belonging_to_claire: Chunk, + file_belonging_to_alice: File, + file_belonging_to_bob: File, alice: UUID, ): """ Given that I have - * a saved chunk belonging to Alice - * a saved chunk belonging to Bob - * an unsaved chunk belonging to Claire + * a saved file belonging to Alice + * a saved file belonging to Bob When I call list_all_items as alice Then I expect to see the uuids of the saved objects that belong to alice returned """ - uuids = elasticsearch_storage_handler.list_all_items("Chunk", alice) - assert len(uuids) == 1 + uuids = elasticsearch_storage_handler.list_all_items("File", alice) + assert len(uuids) == 1, f"Unexpected number of files {len(uuids)}" def test_read_all_items( 
elasticsearch_storage_handler: ElasticsearchStorageHandler,
-    stored_chunk_belonging_to_alice: Chunk,
-    stored_chunk_belonging_to_bob: Chunk,
-    chunk_belonging_to_claire: Chunk,
+    file_belonging_to_alice: File,
+    file_belonging_to_bob: File,
     alice: UUID,
 ):
     """
     Given that I have
-    * a saved chunk belonging to Alice
-    * a saved chunk belonging to Bob
-    * an unsaved chunk belonging to Claire
+    * a saved file belonging to Alice
+    * a saved file belonging to Bob

     When I call read_all_items as alice
-    Then I expect to see the one Chunk belonging to alice
+    Then I expect to see the one File belonging to alice
     """
-    chunks = elasticsearch_storage_handler.read_all_items("Chunk", alice)
-    assert len(chunks) == 1
-    assert chunks[0].creator_user_uuid == alice

+    files = elasticsearch_storage_handler.read_all_items("File", alice)
+    assert len(files) == 1
+    assert files[0].creator_user_uuid == alice


-def test_elastic_delete_item(elasticsearch_storage_handler, stored_chunk_belonging_to_alice):
+def test_elastic_delete_item(elasticsearch_storage_handler, file_belonging_to_alice):
     """
     Given that I have a saved object
     When I call delete_item on it
     Then I expect to not be able to read the item
     """
-    elasticsearch_storage_handler.delete_item(stored_chunk_belonging_to_alice)
+    elasticsearch_storage_handler.delete_item(file_belonging_to_alice)

     with pytest.raises(NotFoundError):
-        elasticsearch_storage_handler.read_item(stored_chunk_belonging_to_alice.uuid, "Chunk")
-
-
-def test_get_file_chunks(
-    elasticsearch_storage_handler: ElasticsearchStorageHandler,
-    stored_chunk_belonging_to_alice: Chunk,
-):
-    """
-    Given that a chunk belonging to a file belonging alice have been saved
-    When I call get_file_chunks with the right file id and alice's id
-    I Expect the single chunk to be retrieved
-    """
-    assert stored_chunk_belonging_to_alice.creator_user_uuid
-
-    chunks = elasticsearch_storage_handler.get_file_chunks(
-        stored_chunk_belonging_to_alice.parent_file_uuid,
-        stored_chunk_belonging_to_alice.creator_user_uuid,
-    )
-
-    assert len(chunks) == 1
-
-
-def test_get_file_chunks_fail(
-    elasticsearch_storage_handler: ElasticsearchStorageHandler,
-    stored_chunk_belonging_to_alice: Chunk,
-):
-    """
-    Given that a chunk belonging to a file belonging alice have been saved
-    When I call get_file_chunks with the right file id and another id
-    I Expect the no chunks to be retrieved
-    """
-    other_chunks = elasticsearch_storage_handler.get_file_chunks(
-        stored_chunk_belonging_to_alice.parent_file_uuid,
-        uuid4(),
-    )
-    assert not other_chunks
+        elasticsearch_storage_handler.read_item(file_belonging_to_alice.uuid, "File")


-def test_delete_file_chunks(
-    elasticsearch_storage_handler: ElasticsearchStorageHandler,
-    stored_chunk_belonging_to_alice: Chunk,
-):
+def test_elastic_delete_user_item(elasticsearch_storage_handler, file_belonging_to_alice, alice):
     """
-    Given that a chunk belonging to a file belonging alice have been saved
-    When I call delete_file_chunks with the right file id and alice's id
-    I Expect the chunks to be removed
+    Given that I have a saved file belonging to Alice
+    When I call delete_user_items for Alice
+    Then I expect none of her files to remain readable
     """
-    assert stored_chunk_belonging_to_alice.creator_user_uuid
-
-    elasticsearch_storage_handler.delete_file_chunks(
-        stored_chunk_belonging_to_alice.parent_file_uuid,
-        stored_chunk_belonging_to_alice.creator_user_uuid,
-    )
+    files = elasticsearch_storage_handler.read_all_items("File", alice)
+    assert len(files) == 1
+    assert files[0].creator_user_uuid == alice
+
elasticsearch_storage_handler.delete_user_items("file", alice) elasticsearch_storage_handler.refresh() - - chunks = elasticsearch_storage_handler.get_file_chunks( - stored_chunk_belonging_to_alice.parent_file_uuid, - stored_chunk_belonging_to_alice.creator_user_uuid, - ) - - assert len(chunks) == 0 + files = elasticsearch_storage_handler.read_all_items("File", alice) + assert len(files) == 0 \ No newline at end of file From 7d5e42f88e683f424f7ab8f55da35d5dc4f0bec6 Mon Sep 17 00:00:00 2001 From: jamesrichards Date: Fri, 2 Aug 2024 15:33:40 +0100 Subject: [PATCH 2/5] Ruff --- core-api/core_api/routes/file.py | 1 + core-api/tests/routes/test_file.py | 3 +-- redbox-core/redbox/chains/graph.py | 6 +----- redbox-core/redbox/models/file.py | 2 +- redbox-core/redbox/storage/elasticsearch.py | 9 +++------ redbox-core/tests/conftest.py | 8 ++++++-- redbox-core/tests/storage/test_elasticsearch.py | 11 ++--------- 7 files changed, 15 insertions(+), 25 deletions(-) diff --git a/core-api/core_api/routes/file.py b/core-api/core_api/routes/file.py index 408b23af9..105db3fa1 100644 --- a/core-api/core_api/routes/file.py +++ b/core-api/core_api/routes/file.py @@ -246,6 +246,7 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_ return file + @file_app.get( "/{file_uuid}/status", tags=["file"], diff --git a/core-api/tests/routes/test_file.py b/core-api/tests/routes/test_file.py index 5507f27bf..aa52a9b0f 100644 --- a/core-api/tests/routes/test_file.py +++ b/core-api/tests/routes/test_file.py @@ -124,8 +124,7 @@ def test_reingest_file(app_client, chunked_file, elasticsearch_storage_handler, elasticsearch_storage_handler.refresh() assert ( - elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) - != previous_chunks + elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) != previous_chunks ), f"Pre and post chunks matched and both had {len(previous_chunks)} chunks" diff --git a/redbox-core/redbox/chains/graph.py b/redbox-core/redbox/chains/graph.py index 6521e41a2..3166344b0 100644 --- a/redbox-core/redbox/chains/graph.py +++ b/redbox-core/redbox/chains/graph.py @@ -76,9 +76,7 @@ def chat_prompt_from_messages(state: ChainState): log.debug("Setting chat prompt") system_prompt_message = [("system", system_prompt)] - prompts_budget = len(tokeniser.encode(system_prompt)) + len( - tokeniser.encode(question_prompt) - ) + prompts_budget = len(tokeniser.encode(system_prompt)) + len(tokeniser.encode(question_prompt)) chat_history_budget = state["query"].ai_settings.context_window_size - llm_max_tokens - prompts_budget if chat_history_budget <= 0: @@ -101,8 +99,6 @@ def chat_prompt_from_messages(state: ChainState): return chat_prompt_from_messages - - @chain def set_prompt_args(state: ChainState): log.debug("Setting prompt args") diff --git a/redbox-core/redbox/models/file.py b/redbox-core/redbox/models/file.py index 4c2851dda..1950bab34 100644 --- a/redbox-core/redbox/models/file.py +++ b/redbox-core/redbox/models/file.py @@ -40,7 +40,7 @@ def __le__(self, other: Link): def __hash__(self): return hash(self.text) ^ hash(self.url) ^ hash(self.start_index) - + class FileStatus(BaseModel): """Status of a file.""" diff --git a/redbox-core/redbox/storage/elasticsearch.py b/redbox-core/redbox/storage/elasticsearch.py index 4da0e2a3e..a1d6a73c8 100644 --- a/redbox-core/redbox/storage/elasticsearch.py +++ b/redbox-core/redbox/storage/elasticsearch.py @@ -1,6 +1,5 @@ import logging from collections.abc import Sequence -from typing 
import Any from uuid import UUID from elastic_transport import ObjectApiResponse @@ -8,7 +7,6 @@ from elasticsearch.helpers import scan from pydantic import ValidationError -from redbox.models import Settings from redbox.models.base import PersistableModel from redbox.storage.storage_handler import BaseStorageHandler @@ -28,6 +26,7 @@ def get_query_match_all_for_user(user_uuid: UUID): } } + class ElasticsearchStorageHandler(BaseStorageHandler): """Storage Handler for Elasticsearch""" @@ -60,8 +59,6 @@ def write_item(self, item: PersistableModel) -> ObjectApiResponse: def write_items(self, items: Sequence[PersistableModel]) -> Sequence[ObjectApiResponse]: return list(map(self.write_item, items)) - - def read_item(self, item_uuid: UUID, model_type: str): target_index = f"{self.root_index}-{model_type.lower()}" result = self.es_client.get(index=target_index, id=str(item_uuid)) @@ -104,7 +101,7 @@ def delete_items(self, items: list[PersistableModel]) -> ObjectApiResponse | Non index=target_index, body={"query": {"terms": {"_id": [str(item.uuid) for item in items]}}}, ) - + def delete_user_items(self, model_type: str, user_uuid: UUID) -> ObjectApiResponse | None: target_index = f"{self.root_index}-{model_type.lower()}" return self.es_client.delete_by_query( @@ -155,4 +152,4 @@ def list_all_items(self, model_type: str, user_uuid: UUID) -> list[UUID]: except NotFoundError: log.info("Index %s not found. Returning empty list.", target_index) return [] - return [UUID(item["_id"]) for item in results] \ No newline at end of file + return [UUID(item["_id"]) for item in results] diff --git a/redbox-core/tests/conftest.py b/redbox-core/tests/conftest.py index f496cd1c5..a89d3f278 100644 --- a/redbox-core/tests/conftest.py +++ b/redbox-core/tests/conftest.py @@ -39,7 +39,9 @@ def claire(): @pytest.fixture() -def file_belonging_to_alice(file_pdf_path, alice, env, elasticsearch_storage_handler: ElasticsearchStorageHandler) -> File: +def file_belonging_to_alice( + file_pdf_path, alice, env, elasticsearch_storage_handler: ElasticsearchStorageHandler +) -> File: f = File( key=file_pdf_path.name, bucket=env.bucket_name, @@ -63,7 +65,9 @@ def file_belonging_to_bob(file_pdf_path, bob, env, elasticsearch_storage_handler @pytest.fixture() -def file_belonging_to_claire(file_pdf_path, claire, env, elasticsearch_storage_handler: ElasticsearchStorageHandler) -> File: +def file_belonging_to_claire( + file_pdf_path, claire, env, elasticsearch_storage_handler: ElasticsearchStorageHandler +) -> File: f = File( key=file_pdf_path.name, bucket=env.bucket_name, diff --git a/redbox-core/tests/storage/test_elasticsearch.py b/redbox-core/tests/storage/test_elasticsearch.py index 00d6e297f..5345194cc 100644 --- a/redbox-core/tests/storage/test_elasticsearch.py +++ b/redbox-core/tests/storage/test_elasticsearch.py @@ -67,14 +67,7 @@ def test_elastic_write_read_delete_items(elasticsearch_storage_handler): Then I expect to see them written to the database """ creator_user_uuid = uuid4() - files = [ - File( - creator_user_uuid=creator_user_uuid, - key=f"somefile-{i}.txt", - bucket="a-bucket" - ) - for i in range(10) - ] + files = [File(creator_user_uuid=creator_user_uuid, key=f"somefile-{i}.txt", bucket="a-bucket") for i in range(10)] elasticsearch_storage_handler.write_items(files) @@ -151,4 +144,4 @@ def test_elastic_delete_user_item(elasticsearch_storage_handler, file_belonging_ elasticsearch_storage_handler.delete_user_items("file", alice) elasticsearch_storage_handler.refresh() files = 
elasticsearch_storage_handler.read_all_items("File", alice) - assert len(files) == 0 \ No newline at end of file + assert len(files) == 0 From 620299eb32678a110efb6b2573082fdaf34e0da7 Mon Sep 17 00:00:00 2001 From: jamesrichards Date: Fri, 2 Aug 2024 16:32:36 +0100 Subject: [PATCH 3/5] Fixed issue with reingest not restricting chunk deletion to single file. Added additional tests for this scenario --- core-api/core_api/routes/file.py | 7 +- core-api/tests/conftest.py | 95 +++++++++++++++------ core-api/tests/routes/test_file.py | 42 +++++---- redbox-core/redbox/storage/elasticsearch.py | 46 ++++++---- 4 files changed, 130 insertions(+), 60 deletions(-) diff --git a/core-api/core_api/routes/file.py b/core-api/core_api/routes/file.py index 105db3fa1..83074b9b4 100644 --- a/core-api/core_api/routes/file.py +++ b/core-api/core_api/routes/file.py @@ -212,7 +212,10 @@ def delete_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uui tags=["file"], responses={404: {"model": APIError404, "description": "The file was not found"}}, ) -async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> File: +async def reingest_file( + file_uuid: UUID, + user_uuid: Annotated[UUID, Depends(get_user_uuid)] +) -> File: """Deletes exisiting file chunks and regenerates embeddings Args: @@ -238,7 +241,7 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_ storage_handler.update_item(file) # Remove old chunks - storage_handler.delete_user_items("chunk", user_uuid) + storage_handler.delete_user_items("chunk", user_uuid, filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)]) # Add new chunks log.info("publishing %s", file.uuid) diff --git a/core-api/tests/conftest.py b/core-api/tests/conftest.py index 1f4e55f09..96d53b452 100644 --- a/core-api/tests/conftest.py +++ b/core-api/tests/conftest.py @@ -1,6 +1,7 @@ from pathlib import Path from uuid import UUID, uuid4 from datetime import UTC, datetime +import uuid import pytest from botocore.exceptions import ClientError @@ -25,21 +26,6 @@ def env(): return Settings() -@pytest.fixture(scope="session") -def s3_client(env): - _client = env.s3_client() - try: - _client.create_bucket( - Bucket=env.bucket_name, - CreateBucketConfiguration={"LocationConstraint": env.aws_region}, - ) - except ClientError as e: - if e.response["Error"]["Code"] != "BucketAlreadyOwnedByYou": - raise - - return _client - - @pytest.fixture(scope="session") def es_client(env) -> Elasticsearch: return env.elasticsearch_client() @@ -91,18 +77,8 @@ def elasticsearch_storage_handler(es_client, env): @pytest.fixture() -def file(s3_client, file_pdf_path: Path, alice, env) -> File: +def file(file_pdf_path: Path, alice, env) -> File: file_name = file_pdf_path.name - file_type = file_pdf_path.suffix - - with file_pdf_path.open("rb") as f: - s3_client.put_object( - Bucket=env.bucket_name, - Body=f.read(), - Key=file_name, - Tagging=f"file_type={file_type}", - ) - return File(key=file_name, bucket=env.bucket_name, creator_user_uuid=alice) @@ -120,6 +96,26 @@ def stored_file_1(elasticsearch_storage_handler, file) -> File: return file +@pytest.fixture() +def stored_user_files(elasticsearch_storage_handler) -> list[File]: + user = uuid4() + files = [ + File( + creator_user_uuid=user, + key="testfile1.txt", + bucket="local" + ), + File( + creator_user_uuid=user, + key="testfile2.txt", + bucket="local" + ) + ] + for file in files: + elasticsearch_storage_handler.write_item(file) + 
elasticsearch_storage_handler.refresh() + return files + @pytest.fixture(scope="session") def embedding_model_dim() -> int: return 3072 # 3-large default size @@ -201,6 +197,47 @@ def stored_large_file_chunks(stored_file_1) -> list[Document]: return normal_chunks + large_chunks +@pytest.fixture() +def stored_user_chunks(stored_user_files) -> list[list[Document]]: + chunks_by_file = [] + for file in stored_user_files: + normal_chunks = [ + Document( + page_content="hello", + metadata=ChunkMetadata( + parent_file_uuid=str(file.uuid), + index=i, + file_name="test_file", + creator_user_uuid=file.creator_user_uuid, + page_number=4, + created_datetime=datetime.now(UTC), + token_count=4, + chunk_resolution=ChunkResolution.normal, + ).model_dump(), + ) + for i in range(25) + ] + + large_chunks = [ + Document( + page_content="hello" * 10, + metadata=ChunkMetadata( + parent_file_uuid=str(file.uuid), + index=i, + file_name="test_file", + creator_user_uuid=file.creator_user_uuid, + page_number=4, + created_datetime=datetime.now(UTC), + token_count=20, + chunk_resolution=ChunkResolution.largest, + ).model_dump(), + ) + for i in range(5) + ] + chunks_by_file.append(normal_chunks+large_chunks) + return chunks_by_file + + @pytest.fixture() def chunked_file(elasticsearch_store: ElasticsearchStore, stored_file_chunks, stored_file_1) -> File: elasticsearch_store.add_documents(stored_file_chunks) @@ -212,6 +249,12 @@ def large_chunked_file(elasticsearch_store, stored_large_file_chunks, stored_fil elasticsearch_store.add_documents(stored_large_file_chunks) return stored_file_1 +@pytest.fixture() +def chunked_user_files(elasticsearch_store, stored_user_chunks) -> File: + for chunks in stored_user_chunks: + elasticsearch_store.add_documents(chunks) + return stored_user_chunks + @pytest.fixture() def file_pdf_path() -> Path: diff --git a/core-api/tests/routes/test_file.py b/core-api/tests/routes/test_file.py index aa52a9b0f..3ed7c9ea1 100644 --- a/core-api/tests/routes/test_file.py +++ b/core-api/tests/routes/test_file.py @@ -1,16 +1,17 @@ import json from http import HTTPStatus from pathlib import Path +from jose import jwt import pytest from elasticsearch import NotFoundError from faststream.redis import TestRedisBroker from core_api.routes.file import env, router - +from redbox.storage.elasticsearch import ElasticsearchStorageHandler @pytest.mark.asyncio() -async def test_post_file_upload(s3_client, app_client, file_pdf_path: Path, headers): +async def test_post_file_upload(app_client, file_pdf_path: Path, headers): """ Given a new file When I POST it to /file @@ -20,12 +21,6 @@ async def test_post_file_upload(s3_client, app_client, file_pdf_path: Path, head file_key = file_pdf_path.name with file_pdf_path.open("rb") as f: - s3_client.upload_fileobj( - Bucket=env.bucket_name, - Fileobj=f, - Key=file_key, - ExtraArgs={"Tagging": "file_type=pdf"}, - ) async with TestRedisBroker(router.broker): response = app_client.post( @@ -77,14 +72,13 @@ def test_get_missing_file(app_client, headers): assert response.status_code == HTTPStatus.NOT_FOUND -def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, chunked_file, headers): +def test_delete_file(app_client, elasticsearch_storage_handler, chunked_file, headers): """ Given a previously saved file When I DELETE it to /file I Expect to see it removed from s3 and elastic-search, including the chunks """ # check assets exist - assert s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.key) assert 
elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file") assert elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) @@ -111,21 +105,37 @@ def test_delete_missing_file(app_client, headers): assert response.status_code == HTTPStatus.NOT_FOUND -def test_reingest_file(app_client, chunked_file, elasticsearch_storage_handler, headers): +def test_reingest_file(app_client, chunked_user_files, stored_user_files, elasticsearch_storage_handler): """ Given a previously chunked file When I PUT it to /file/uuid/ I Expect the old chunks to be removed """ - previous_chunks = elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) + test_file = stored_user_files[0] - response = app_client.put(f"/file/{chunked_file.uuid}", headers=headers) - assert response.status_code == HTTPStatus.OK + bearer_token = jwt.encode({"user_uuid": str(test_file.creator_user_uuid)}, key="nvjkernd") + headers_for_user = {"Authorization": f"Bearer {bearer_token}"} + + previous_chunks_by_file = [ + elasticsearch_storage_handler.list_all_items( + "chunk", + file.creator_user_uuid, + filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)] + ) + for file in stored_user_files + ] + + response = app_client.put(f"/file/{test_file.uuid}", headers=headers_for_user) + assert response.status_code == HTTPStatus.OK, f"Error response: [{response.status_code}] {response.text}" elasticsearch_storage_handler.refresh() assert ( - elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid) != previous_chunks - ), f"Pre and post chunks matched and both had {len(previous_chunks)} chunks" + elasticsearch_storage_handler.list_all_items("chunk", test_file.creator_user_uuid) != previous_chunks_by_file[0] + ), f"Pre and post chunks matched and both had {len(previous_chunks_by_file[0])} chunks" + + for file, previous_chunks in zip(stored_user_files[1:], previous_chunks_by_file[1:]): + post_chunks = elasticsearch_storage_handler.list_all_items("chunk", file.creator_user_uuid) + assert post_chunks == previous_chunks, f"Additional files had their chunks changed! 
Pre: {len(previous_chunks)} Post: {len(post_chunks)}" def test_get_missing_file_chunks(app_client, headers): diff --git a/redbox-core/redbox/storage/elasticsearch.py b/redbox-core/redbox/storage/elasticsearch.py index a1d6a73c8..387c7d0b2 100644 --- a/redbox-core/redbox/storage/elasticsearch.py +++ b/redbox-core/redbox/storage/elasticsearch.py @@ -14,17 +14,8 @@ log = logging.getLogger() -def get_query_match_all_for_user(user_uuid: UUID): - return { - "query": { - "bool": { - "should": [ - {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, - {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, - ] - } - } - } + + class ElasticsearchStorageHandler(BaseStorageHandler): @@ -102,11 +93,11 @@ def delete_items(self, items: list[PersistableModel]) -> ObjectApiResponse | Non body={"query": {"terms": {"_id": [str(item.uuid) for item in items]}}}, ) - def delete_user_items(self, model_type: str, user_uuid: UUID) -> ObjectApiResponse | None: + def delete_user_items(self, model_type: str, user_uuid: UUID, filters: list[dict] = None) -> ObjectApiResponse | None: target_index = f"{self.root_index}-{model_type.lower()}" return self.es_client.delete_by_query( index=target_index, - body=get_query_match_all_for_user(user_uuid), + body=ElasticsearchStorageHandler.get_query_match_all_for_user(user_uuid, filters), ) def read_all_items(self, model_type: str, user_uuid: UUID) -> list[PersistableModel]: @@ -115,7 +106,7 @@ def read_all_items(self, model_type: str, user_uuid: UUID) -> list[PersistableMo result = scan( client=self.es_client, index=target_index, - query=get_query_match_all_for_user(user_uuid), + query=ElasticsearchStorageHandler.get_query_match_all_for_user(user_uuid), _source=True, ) @@ -138,14 +129,14 @@ def read_all_items(self, model_type: str, user_uuid: UUID) -> list[PersistableMo log.exception("Validation exception for %s", item, exc_info=e) return items - def list_all_items(self, model_type: str, user_uuid: UUID) -> list[UUID]: + def list_all_items(self, model_type: str, user_uuid: UUID, filters: list[dict] = None) -> list[UUID]: target_index = f"{self.root_index}-{model_type.lower()}" try: # Only return _id results = scan( client=self.es_client, index=target_index, - query=get_query_match_all_for_user(user_uuid), + query=ElasticsearchStorageHandler.get_query_match_all_for_user(user_uuid, filters), _source=False, ) @@ -153,3 +144,26 @@ def list_all_items(self, model_type: str, user_uuid: UUID) -> list[UUID]: log.info("Index %s not found. 
Returning empty list.", target_index) return [] return [UUID(item["_id"]) for item in results] + + + @classmethod + def get_query_match_all_for_user(cls, user_uuid: UUID, filters: list[dict] = None): + query = { + "query": { + "bool": { + "should": [ + {"term": {"creator_user_uuid.keyword": str(user_uuid)}}, + {"term": {"metadata.creator_user_uuid.keyword": str(user_uuid)}}, + ] + }, + } + } + if filters: + query["query"]["bool"]["filter"] = filters + return query + + @classmethod + def get_with_parent_file_filter(cls, parent_file_uuid: UUID | str): + return{ + "term": {"metadata.parent_file_uuid.keyword": str(parent_file_uuid)}, + } \ No newline at end of file From 5105ef81dc638115a4d76877de1ed3d3b0e5a5bc Mon Sep 17 00:00:00 2001 From: jamesrichards Date: Fri, 2 Aug 2024 16:33:27 +0100 Subject: [PATCH 4/5] Removing integration tests for chunks --- tests/test_e2e.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 37f7a87b3..eddac847e 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -84,21 +84,6 @@ def test_get_file_status(self, user_uuid): if not embedding_complete: pytest.fail(reason=f"failed to get embedded chunks within {timeout} seconds, potential error: {error}") - @pytest.mark.parametrize("user_uuid", USER_UUIDS) - def test_get_file_chunks(self, user_uuid): - """ - Given that I have POSTed a file key to core-api/file - And the file status is complete - When I GET the file chunks - I Expect a 200 response code - """ - chunks_response = requests.get( - f"http://{TEST_ORIGIN}/file/{TestEndToEnd.file_uuids[user_uuid]}/chunks", - headers=make_headers(user_uuid), - timeout=30, - ) - assert chunks_response.status_code == HTTPStatus.OK - @pytest.mark.parametrize("user_uuid", USER_UUIDS) def test_post_rag(self, user_uuid): """ From e8c06fc6cb269077424185b35148ac7807794531 Mon Sep 17 00:00:00 2001 From: jamesrichards Date: Mon, 5 Aug 2024 11:01:53 +0100 Subject: [PATCH 5/5] Ruff --- core-api/core_api/routes/file.py | 9 +++--- core-api/tests/conftest.py | 18 ++++------- core-api/tests/routes/test_file.py | 33 +++++++++++---------- redbox-core/redbox/storage/elasticsearch.py | 13 ++++---- 4 files changed, 31 insertions(+), 42 deletions(-) diff --git a/core-api/core_api/routes/file.py b/core-api/core_api/routes/file.py index 83074b9b4..2953b9cd1 100644 --- a/core-api/core_api/routes/file.py +++ b/core-api/core_api/routes/file.py @@ -212,10 +212,7 @@ def delete_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uui tags=["file"], responses={404: {"model": APIError404, "description": "The file was not found"}}, ) -async def reingest_file( - file_uuid: UUID, - user_uuid: Annotated[UUID, Depends(get_user_uuid)] -) -> File: +async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> File: """Deletes exisiting file chunks and regenerates embeddings Args: @@ -241,7 +238,9 @@ async def reingest_file( storage_handler.update_item(file) # Remove old chunks - storage_handler.delete_user_items("chunk", user_uuid, filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)]) + storage_handler.delete_user_items( + "chunk", user_uuid, filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)] + ) # Add new chunks log.info("publishing %s", file.uuid) diff --git a/core-api/tests/conftest.py b/core-api/tests/conftest.py index 96d53b452..108948925 100644 --- a/core-api/tests/conftest.py +++ b/core-api/tests/conftest.py @@ -1,10 +1,8 @@ from pathlib import Path 
from uuid import UUID, uuid4 from datetime import UTC, datetime -import uuid import pytest -from botocore.exceptions import ClientError from elasticsearch import Elasticsearch from fastapi.testclient import TestClient from jose import jwt @@ -100,22 +98,15 @@ def stored_file_1(elasticsearch_storage_handler, file) -> File: def stored_user_files(elasticsearch_storage_handler) -> list[File]: user = uuid4() files = [ - File( - creator_user_uuid=user, - key="testfile1.txt", - bucket="local" - ), - File( - creator_user_uuid=user, - key="testfile2.txt", - bucket="local" - ) + File(creator_user_uuid=user, key="testfile1.txt", bucket="local"), + File(creator_user_uuid=user, key="testfile2.txt", bucket="local"), ] for file in files: elasticsearch_storage_handler.write_item(file) elasticsearch_storage_handler.refresh() return files + @pytest.fixture(scope="session") def embedding_model_dim() -> int: return 3072 # 3-large default size @@ -234,7 +225,7 @@ def stored_user_chunks(stored_user_files) -> list[list[Document]]: ) for i in range(5) ] - chunks_by_file.append(normal_chunks+large_chunks) + chunks_by_file.append(normal_chunks + large_chunks) return chunks_by_file @@ -249,6 +240,7 @@ def large_chunked_file(elasticsearch_store, stored_large_file_chunks, stored_fil elasticsearch_store.add_documents(stored_large_file_chunks) return stored_file_1 + @pytest.fixture() def chunked_user_files(elasticsearch_store, stored_user_chunks) -> File: for chunks in stored_user_chunks: diff --git a/core-api/tests/routes/test_file.py b/core-api/tests/routes/test_file.py index 3ed7c9ea1..9c552a841 100644 --- a/core-api/tests/routes/test_file.py +++ b/core-api/tests/routes/test_file.py @@ -10,6 +10,7 @@ from core_api.routes.file import env, router from redbox.storage.elasticsearch import ElasticsearchStorageHandler + @pytest.mark.asyncio() async def test_post_file_upload(app_client, file_pdf_path: Path, headers): """ @@ -20,17 +21,15 @@ async def test_post_file_upload(app_client, file_pdf_path: Path, headers): file_key = file_pdf_path.name - with file_pdf_path.open("rb") as f: - - async with TestRedisBroker(router.broker): - response = app_client.post( - "/file", - json={ - "key": file_key, - "bucket": env.bucket_name, - }, - headers=headers, - ) + async with TestRedisBroker(router.broker): + response = app_client.post( + "/file", + json={ + "key": file_key, + "bucket": env.bucket_name, + }, + headers=headers, + ) assert response.status_code == HTTPStatus.CREATED file = json.loads(response.content.decode("utf-8")) @@ -118,11 +117,11 @@ def test_reingest_file(app_client, chunked_user_files, stored_user_files, elasti previous_chunks_by_file = [ elasticsearch_storage_handler.list_all_items( - "chunk", - file.creator_user_uuid, - filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)] + "chunk", + file.creator_user_uuid, + filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)], ) - for file in stored_user_files + for file in stored_user_files ] response = app_client.put(f"/file/{test_file.uuid}", headers=headers_for_user) @@ -135,7 +134,9 @@ def test_reingest_file(app_client, chunked_user_files, stored_user_files, elasti for file, previous_chunks in zip(stored_user_files[1:], previous_chunks_by_file[1:]): post_chunks = elasticsearch_storage_handler.list_all_items("chunk", file.creator_user_uuid) - assert post_chunks == previous_chunks, f"Additional files had their chunks changed! 
Pre: {len(previous_chunks)} Post: {len(post_chunks)}" + assert ( + post_chunks == previous_chunks + ), f"Additional files had their chunks changed! Pre: {len(previous_chunks)} Post: {len(post_chunks)}" def test_get_missing_file_chunks(app_client, headers): diff --git a/redbox-core/redbox/storage/elasticsearch.py b/redbox-core/redbox/storage/elasticsearch.py index 387c7d0b2..94faffb3d 100644 --- a/redbox-core/redbox/storage/elasticsearch.py +++ b/redbox-core/redbox/storage/elasticsearch.py @@ -14,10 +14,6 @@ log = logging.getLogger() - - - - class ElasticsearchStorageHandler(BaseStorageHandler): """Storage Handler for Elasticsearch""" @@ -93,7 +89,9 @@ def delete_items(self, items: list[PersistableModel]) -> ObjectApiResponse | Non body={"query": {"terms": {"_id": [str(item.uuid) for item in items]}}}, ) - def delete_user_items(self, model_type: str, user_uuid: UUID, filters: list[dict] = None) -> ObjectApiResponse | None: + def delete_user_items( + self, model_type: str, user_uuid: UUID, filters: list[dict] = None + ) -> ObjectApiResponse | None: target_index = f"{self.root_index}-{model_type.lower()}" return self.es_client.delete_by_query( index=target_index, @@ -145,7 +143,6 @@ def list_all_items(self, model_type: str, user_uuid: UUID, filters: list[dict] = return [] return [UUID(item["_id"]) for item in results] - @classmethod def get_query_match_all_for_user(cls, user_uuid: UUID, filters: list[dict] = None): query = { @@ -164,6 +161,6 @@ def get_query_match_all_for_user(cls, user_uuid: UUID, filters: list[dict] = Non @classmethod def get_with_parent_file_filter(cls, parent_file_uuid: UUID | str): - return{ + return { "term": {"metadata.parent_file_uuid.keyword": str(parent_file_uuid)}, - } \ No newline at end of file + }
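
Reviewer note: taken together, these patches drop the per-file chunk helpers (get_file_chunks / delete_file_chunks) and scope chunk reads and deletes by user, optionally narrowed to a single file with an Elasticsearch term filter. A minimal sketch of the resulting call pattern follows; it assumes an already-configured handler, and the purge_file_chunks name is illustrative rather than part of this diff:

    from uuid import UUID

    from redbox.storage.elasticsearch import ElasticsearchStorageHandler


    def purge_file_chunks(handler: ElasticsearchStorageHandler, user_uuid: UUID, file_uuid: UUID) -> None:
        """Hypothetical helper mirroring what reingest_file now does before re-publishing."""
        # Scope the query to one file's chunks with the classmethod added in PATCH 3/5;
        # without this filter, delete_user_items removes every chunk the user owns.
        file_filter = [ElasticsearchStorageHandler.get_with_parent_file_filter(file_uuid)]

        # Both list_all_items and delete_user_items accept the optional filters argument.
        handler.delete_user_items("chunk", user_uuid, filters=file_filter)
        handler.refresh()  # make the delete_by_query visible to the next read, as the tests do

        assert not handler.list_all_items("chunk", user_uuid, filters=file_filter)

The same filters argument is what test_reingest_file uses to check that re-ingesting one file leaves the user's other files' chunks untouched.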