Merge pull request #909 from i-dot-ai/feature/remove-legacy-chunk-model
Remove chunk model
jamesrichards4 committed Aug 5, 2024
2 parents 15fe2a2 + e8c06fc commit 0c35176
Showing 12 changed files with 219 additions and 566 deletions.
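
Taken together, the route changes below swap the chunk-specific storage helpers for a generic per-user item API. A before/after summary, using only the calls that appear in the hunks of this diff:

# Before this PR (chunk-specific helpers, removed in this diff):
chunks = storage_handler.get_file_chunks(file.uuid, user_uuid)
storage_handler.delete_items(chunks)

# After this PR (generic helpers; the filter scopes the operation to one file's chunks):
storage_handler.delete_user_items(
    "chunk", user_uuid, filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)]
)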
40 changes: 5 additions & 35 deletions core-api/core_api/routes/file.py
@@ -11,7 +11,7 @@

from core_api.auth import get_user_uuid
from core_api.publisher_handler import FilePublisher
-from redbox.models import APIError404, Chunk, File, FileStatus, ProcessingStatusEnum, Settings
+from redbox.models import APIError404, File, FileStatus, ProcessingStatusEnum, Settings
from redbox.storage import ElasticsearchStorageHandler

# === Functions ===
@@ -202,8 +202,7 @@ def delete_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uui

    storage_handler.delete_item(file)

-    chunks = storage_handler.get_file_chunks(file.uuid, user_uuid)
-    storage_handler.delete_items(chunks)
+    storage_handler.delete_user_items("chunk", user_uuid)
    return file
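
The delete_user_items helper is new in this PR, but its implementation is not rendered in this commit view (the redbox-core storage module is among the files not shown). A minimal sketch of what it plausibly does, assuming the handler wraps an Elasticsearch client, keeps one index per model type, and records the owner under a creator_user_uuid field — all assumptions, not the repository's confirmed code:

# Hypothetical method-level sketch on ElasticsearchStorageHandler.
# self.es_client, self.root_index, and the field path are guesses.
def delete_user_items(self, model_type: str, user_uuid: UUID, filters: list[dict] | None = None) -> None:
    query = {
        "bool": {
            "must": [
                {"term": {"creator_user_uuid.keyword": str(user_uuid)}},
                *(filters or []),
            ]
        }
    }
    # delete_by_query removes every matching document in a single request
    self.es_client.delete_by_query(index=f"{self.root_index}-{model_type}", query=query)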


@@ -239,7 +238,9 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_
    storage_handler.update_item(file)

    # Remove old chunks
-    storage_handler.delete_file_chunks(file.uuid, user_uuid)
+    storage_handler.delete_user_items(
+        "chunk", user_uuid, filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)]
+    )

    # Add new chunks
    log.info("publishing %s", file.uuid)
@@ -248,37 +249,6 @@ async def reingest_file(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_
    return file
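
ElasticsearchStorageHandler.get_with_parent_file_filter is likewise defined outside the rendered hunks. Judging from its use here and in the updated tests, it presumably builds an Elasticsearch term filter on the chunk's parent file UUID; a sketch under that assumption (the exact metadata field path is a guess):

# Hypothetical static helper; pairs with the filters= argument above.
@staticmethod
def get_with_parent_file_filter(parent_file_uuid: UUID) -> dict:
    return {"term": {"metadata.parent_file_uuid.keyword": str(parent_file_uuid)}}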


-@file_app.get(
-    "/{file_uuid}/chunks",
-    tags=["file"],
-    responses={404: {"model": APIError404, "description": "The file was not found"}},
-)
-def get_file_chunks(file_uuid: UUID, user_uuid: Annotated[UUID, Depends(get_user_uuid)]) -> list[Chunk]:
-    """Gets a list of chunks for a file in the database
-    Args:
-        file_uuid (UUID): The UUID of the file to delete
-        user_uuid (UUID): The UUID of the user
-    Returns:
-        Chunks (list, Chunk): The chunks belonging to the requested file
-    Raises:
-        404: If the file isn't found, or the creator and requester don't match
-    """
-    try:
-        file = storage_handler.read_item(file_uuid, model_type="File")
-    except NotFoundError:
-        return file_not_found_response(file_uuid=file_uuid)
-
-    if file.creator_user_uuid != user_uuid:
-        return file_not_found_response(file_uuid=file_uuid)
-
-    log.info("getting chunks for file %s", file_uuid)
-
-    return storage_handler.get_file_chunks(file_uuid, user_uuid)


@file_app.get(
    "/{file_uuid}/status",
    tags=["file"],
89 changes: 62 additions & 27 deletions core-api/tests/conftest.py
@@ -3,7 +3,6 @@
from datetime import UTC, datetime

import pytest
-from botocore.exceptions import ClientError
from elasticsearch import Elasticsearch
from fastapi.testclient import TestClient
from jose import jwt
@@ -25,21 +24,6 @@ def env():
    return Settings()


-@pytest.fixture(scope="session")
-def s3_client(env):
-    _client = env.s3_client()
-    try:
-        _client.create_bucket(
-            Bucket=env.bucket_name,
-            CreateBucketConfiguration={"LocationConstraint": env.aws_region},
-        )
-    except ClientError as e:
-        if e.response["Error"]["Code"] != "BucketAlreadyOwnedByYou":
-            raise
-
-    return _client


@pytest.fixture(scope="session")
def es_client(env) -> Elasticsearch:
    return env.elasticsearch_client()
@@ -91,18 +75,8 @@ def elasticsearch_storage_handler(es_client, env):


@pytest.fixture()
-def file(s3_client, file_pdf_path: Path, alice, env) -> File:
+def file(file_pdf_path: Path, alice, env) -> File:
    file_name = file_pdf_path.name
-    file_type = file_pdf_path.suffix
-
-    with file_pdf_path.open("rb") as f:
-        s3_client.put_object(
-            Bucket=env.bucket_name,
-            Body=f.read(),
-            Key=file_name,
-            Tagging=f"file_type={file_type}",
-        )
-
    return File(key=file_name, bucket=env.bucket_name, creator_user_uuid=alice)


@@ -120,6 +94,19 @@ def stored_file_1(elasticsearch_storage_handler, file) -> File:
    return file


+@pytest.fixture()
+def stored_user_files(elasticsearch_storage_handler) -> list[File]:
+    user = uuid4()
+    files = [
+        File(creator_user_uuid=user, key="testfile1.txt", bucket="local"),
+        File(creator_user_uuid=user, key="testfile2.txt", bucket="local"),
+    ]
+    for file in files:
+        elasticsearch_storage_handler.write_item(file)
+    elasticsearch_storage_handler.refresh()
+    return files


@pytest.fixture(scope="session")
def embedding_model_dim() -> int:
    return 3072  # 3-large default size
@@ -201,6 +188,47 @@ def stored_large_file_chunks(stored_file_1) -> list[Document]:
    return normal_chunks + large_chunks


+@pytest.fixture()
+def stored_user_chunks(stored_user_files) -> list[list[Document]]:
+    chunks_by_file = []
+    for file in stored_user_files:
+        normal_chunks = [
+            Document(
+                page_content="hello",
+                metadata=ChunkMetadata(
+                    parent_file_uuid=str(file.uuid),
+                    index=i,
+                    file_name="test_file",
+                    creator_user_uuid=file.creator_user_uuid,
+                    page_number=4,
+                    created_datetime=datetime.now(UTC),
+                    token_count=4,
+                    chunk_resolution=ChunkResolution.normal,
+                ).model_dump(),
+            )
+            for i in range(25)
+        ]
+
+        large_chunks = [
+            Document(
+                page_content="hello" * 10,
+                metadata=ChunkMetadata(
+                    parent_file_uuid=str(file.uuid),
+                    index=i,
+                    file_name="test_file",
+                    creator_user_uuid=file.creator_user_uuid,
+                    page_number=4,
+                    created_datetime=datetime.now(UTC),
+                    token_count=20,
+                    chunk_resolution=ChunkResolution.largest,
+                ).model_dump(),
+            )
+            for i in range(5)
+        ]
+        chunks_by_file.append(normal_chunks + large_chunks)
+    return chunks_by_file


@pytest.fixture()
def chunked_file(elasticsearch_store: ElasticsearchStore, stored_file_chunks, stored_file_1) -> File:
    elasticsearch_store.add_documents(stored_file_chunks)
@@ -213,6 +241,13 @@ def large_chunked_file(elasticsearch_store, stored_large_file_chunks, stored_fil
    return stored_file_1


+@pytest.fixture()
+def chunked_user_files(elasticsearch_store, stored_user_chunks) -> list[list[Document]]:
+    for chunks in stored_user_chunks:
+        elasticsearch_store.add_documents(chunks)
+    return stored_user_chunks


@pytest.fixture()
def file_pdf_path() -> Path:
    return Path(__file__).parents[2] / "tests" / "data" / "pdf" / "Cabinet Office - Wikipedia.pdf"
Expand Down
77 changes: 38 additions & 39 deletions core-api/tests/routes/test_file.py
@@ -1,16 +1,18 @@
import json
from http import HTTPStatus
from pathlib import Path
+from jose import jwt

import pytest
from elasticsearch import NotFoundError
from faststream.redis import TestRedisBroker

from core_api.routes.file import env, router
+from redbox.storage.elasticsearch import ElasticsearchStorageHandler


@pytest.mark.asyncio()
-async def test_post_file_upload(s3_client, app_client, file_pdf_path: Path, headers):
+async def test_post_file_upload(app_client, file_pdf_path: Path, headers):
    """
    Given a new file
    When I POST it to /file
@@ -19,23 +21,15 @@ async def test_post_file_upload(s3_client, app_client, file_pdf_path: Path, head

    file_key = file_pdf_path.name

-    with file_pdf_path.open("rb") as f:
-        s3_client.upload_fileobj(
-            Bucket=env.bucket_name,
-            Fileobj=f,
-            Key=file_key,
-            ExtraArgs={"Tagging": "file_type=pdf"},
-        )
-
-        async with TestRedisBroker(router.broker):
-            response = app_client.post(
-                "/file",
-                json={
-                    "key": file_key,
-                    "bucket": env.bucket_name,
-                },
-                headers=headers,
-            )
+    async with TestRedisBroker(router.broker):
+        response = app_client.post(
+            "/file",
+            json={
+                "key": file_key,
+                "bucket": env.bucket_name,
+            },
+            headers=headers,
+        )
    assert response.status_code == HTTPStatus.CREATED

    file = json.loads(response.content.decode("utf-8"))
@@ -77,16 +71,15 @@ def test_get_missing_file(app_client, headers):
    assert response.status_code == HTTPStatus.NOT_FOUND


-def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, chunked_file, headers):
+def test_delete_file(app_client, elasticsearch_storage_handler, chunked_file, headers):
    """
    Given a previously saved file
    When I DELETE it to /file
    I Expect to see it removed from s3 and elastic-search, including the chunks
    """
    # check assets exist
-    assert s3_client.get_object(Bucket=env.bucket_name, Key=chunked_file.key)
    assert elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file")
-    assert elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid)
+    assert elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid)

    response = app_client.delete(f"/file/{chunked_file.uuid}", headers=headers)
    assert response.status_code == HTTPStatus.OK
@@ -98,7 +91,7 @@ def test_delete_file(s3_client, app_client, elasticsearch_storage_handler, chunk
    with pytest.raises(NotFoundError):
        elasticsearch_storage_handler.read_item(item_uuid=chunked_file.uuid, model_type="file")

-    assert not elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid)
+    assert not elasticsearch_storage_handler.list_all_items("chunk", chunked_file.creator_user_uuid)
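
list_all_items is also defined outside this commit view. Judging by its use here and in test_reingest_file below, it presumably runs a filtered search scoped to one user and model type; a sketch under those assumptions (index naming, field paths, and the fixed result size are all guesses):

# Hypothetical method-level sketch; not the repository's confirmed implementation.
def list_all_items(self, model_type: str, user_uuid: UUID, filters: list[dict] | None = None) -> list[dict]:
    query = {
        "bool": {
            "must": [
                {"term": {"creator_user_uuid.keyword": str(user_uuid)}},
                *(filters or []),
            ]
        }
    }
    response = self.es_client.search(index=f"{self.root_index}-{model_type}", query=query, size=10_000)
    return [hit["_source"] for hit in response["hits"]["hits"]]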


def test_delete_missing_file(app_client, headers):
@@ -111,33 +104,39 @@ def test_delete_missing_file(app_client, headers):
    assert response.status_code == HTTPStatus.NOT_FOUND


-def test_reingest_file(app_client, chunked_file, elasticsearch_storage_handler, headers):
+def test_reingest_file(app_client, chunked_user_files, stored_user_files, elasticsearch_storage_handler):
    """
    Given a previously chunked file
    When I PUT it to /file/uuid/
    I Expect the old chunks to be removed
    """
-    previous_chunks = elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid)
+    test_file = stored_user_files[0]

-    response = app_client.put(f"/file/{chunked_file.uuid}", headers=headers)
-    assert response.status_code == HTTPStatus.OK
+    bearer_token = jwt.encode({"user_uuid": str(test_file.creator_user_uuid)}, key="nvjkernd")
+    headers_for_user = {"Authorization": f"Bearer {bearer_token}"}

-    elasticsearch_storage_handler.refresh()
-    assert (
-        elasticsearch_storage_handler.get_file_chunks(chunked_file.uuid, chunked_file.creator_user_uuid)
-        != previous_chunks
-    )
+    previous_chunks_by_file = [
+        elasticsearch_storage_handler.list_all_items(
+            "chunk",
+            file.creator_user_uuid,
+            filters=[ElasticsearchStorageHandler.get_with_parent_file_filter(file.uuid)],
+        )
+        for file in stored_user_files
+    ]
+
+    response = app_client.put(f"/file/{test_file.uuid}", headers=headers_for_user)
+    assert response.status_code == HTTPStatus.OK, f"Error response: [{response.status_code}] {response.text}"


-def test_get_file_chunks(app_client, chunked_file, headers):
-    """
-    Given a previously chunked file
-    When I GET it from /file/uuid/chunks
-    I Expect to receive the chunks
-    """
-    response = app_client.get(f"/file/{chunked_file.uuid}/chunks", headers=headers)
-    assert response.status_code == HTTPStatus.OK
-    assert len(response.json()) == 12
+    elasticsearch_storage_handler.refresh()
+    assert (
+        elasticsearch_storage_handler.list_all_items("chunk", test_file.creator_user_uuid) != previous_chunks_by_file[0]
+    ), f"Pre and post chunks matched and both had {len(previous_chunks_by_file[0])} chunks"
+
+    for file, previous_chunks in zip(stored_user_files[1:], previous_chunks_by_file[1:]):
+        post_chunks = elasticsearch_storage_handler.list_all_items("chunk", file.creator_user_uuid)
+        assert (
+            post_chunks == previous_chunks
+        ), f"Additional files had their chunks changed! Pre: {len(previous_chunks)} Post: {len(post_chunks)}"
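
One detail worth noting in the rewritten test: it calls elasticsearch_storage_handler.refresh() before asserting, since Elasticsearch writes are not visible to search until an index refresh. The helper is presumably a thin wrapper, something like this sketch (the index pattern is an assumption):

# Hypothetical one-liner on the storage handler.
def refresh(self) -> None:
    self.es_client.indices.refresh(index=f"{self.root_index}-*")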


def test_get_missing_file_chunks(app_client, headers):
4 changes: 0 additions & 4 deletions redbox-core/redbox/models/__init__.py
@@ -11,8 +11,6 @@
    APIErrorResponse,
)
from redbox.models.file import (
-    Chunk,
-    ChunkStatus,
    File,
    FileStatus,
    ProcessingStatusEnum,
@@ -29,8 +27,6 @@
    "ChatRequest",
    "ChatResponse",
    "ChatRoute",
-    "Chunk",
-    "ChunkStatus",
    "EmbedQueueItem",
    "EmbeddingModelInfo",
    "EmbeddingResponse",

