In [None]:
# Add autoreload
%reload_ext autoreload
%autoreload 2

In [70]:
import sys
from uuid import UUID, uuid4
import pandas as pd
from pathlib import Path
from mypy_boto3_s3.client import S3Client
from botocore.exceptions import ClientError
from typing import Iterable
import jsonlines
import json
import datetime

from django.core.files.uploadedfile import SimpleUploadedFile

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan, bulk

from redbox.models import Settings, File
from redbox.models.settings import ElasticLocalSettings
from redbox.loader import UnstructuredDocumentLoader
from redbox.embeddings import get_embeddings

from langchain_core.vectorstores import VectorStore
from langchain.globals import set_verbose
from langchain_elasticsearch.vectorstores import ElasticsearchStore

from dj_notebook import activate, Plus

# Parent of the current working directory; every project path below is built
# from it (presumably the repository root when the notebook runs from its own folder).
ROOT = Path().resolve().parent

# Turn off LangChain's global verbose flag.
set_verbose(False)

from dotenv import find_dotenv, load_dotenv

# Load environment variables from ROOT/.env before Settings is constructed below.
# The boolean result of load_dotenv is discarded.
_ = load_dotenv(find_dotenv(ROOT / '.env'))

# Show full cell contents in DataFrames instead of truncating long strings.
pd.set_option("display.max_colwidth", None)

# Application settings, with the MinIO and Elasticsearch hosts overridden to localhost.
ENV = Settings(
    minio_host="localhost", 
    object_store="minio", 
    elastic=ElasticLocalSettings(host="localhost"),
)
MODEL = ENV.embedding_model

# Shared clients, used as default arguments by the helper functions further down.
S3_CLIENT = ENV.s3_client()
ES_CLIENT = ENV.elasticsearch_client()

# Embedding model plus the vector store that writes chunks to the
# "redbox-data-chunk" index.
EMBEDDING_MODEL = get_embeddings(ENV)
VECTOR_STORE = ElasticsearchStore(
    index_name="redbox-data-chunk",
    embedding=EMBEDDING_MODEL,
    es_connection=ES_CLIENT,
    query_field="text",
    vector_query_field=ENV.embedding_document_field_name,
)

# Create the storage bucket. An "already owned by you" error is ignored so the
# cell can be re-run; any other client error is re-raised.
try:
    S3_CLIENT.create_bucket(
        Bucket=ENV.bucket_name,
        CreateBucketConfiguration={"LocationConstraint": ENV.aws_region},
    )
except ClientError as e:
    if e.response["Error"]["Code"] != "BucketAlreadyOwnedByYou":
        raise

# Put the Django project directory first on the import path before activating it.
sys.path.insert(0, str(ROOT / "django_app"))

# dj_notebook handle on the Django project; later cells use RB_APP.User,
# RB_APP.File and RB_APP.read_frame.
RB_APP = activate(
    dotenv_file=str(ROOT / "django_app/.env")
)

# Quick upload

The worker takes forever locally. This notebook chunks, embeds and uploads files using your local machine instead, which is much quicker for me. Unlike the eval notebooks, this also creates entries in the Postgres database.

This notebook needs the following services running:

```
docker compose up core-api db -d
```

It's also important that both `.env` files contain the same embedding model.

## Get your user UUID

In [None]:
RB_APP.read_frame(RB_APP.User.objects.all())[["id", "email"]]

In [36]:
USER_UUID = RB_APP.User.objects.first().id  # or read it and hardcode
USER_UUID

UUID('8dec0bc5-dd02-4b67-910d-77e225cdbc44')

## Embed and upload

Consider clearing all files in Elastic, Postgres and the S3 bucket first. The cell below that does this is destructive.

In [None]:
RB_APP.read_frame(RB_APP.File.objects.all())[["id", "core_file_uuid", "original_file_name"]]

In [None]:
def clear_index(index: str, es: Elasticsearch) -> None:
    """Delete every document currently held in an Elasticsearch index.

    Scans the index with a match-all query, builds one bulk "delete" action
    per hit, and submits them all through the bulk helper.

    Args:
        index: Name of the index to empty.
        es: Elasticsearch client used for both the scan and the bulk request.
    """
    delete_actions = []
    for hit in scan(es, index=index, query={"query": {"match_all": {}}}):
        delete_actions.append(
            {"_op_type": "delete", "_index": hit["_index"], "_id": hit["_id"]}
        )

    # Generous timeout: large indices can take a while to delete.
    bulk(es, delete_actions, request_timeout=300)

def clear_bucket(bucket: str, s3: "S3Client") -> None:
    """Delete every object in an S3 bucket.

    ``list_objects_v2`` returns results one page at a time, so the listing is
    followed page by page via the continuation token until the response is no
    longer truncated. A single listing call would leave part of a large
    bucket behind.

    Args:
        bucket: Name of the bucket to empty.
        s3: S3 client used to list and delete objects.
    """
    list_kwargs = {"Bucket": bucket}
    while True:
        response = s3.list_objects_v2(**list_kwargs)

        # "Contents" is absent when the page (or the whole bucket) is empty.
        for obj in response.get("Contents", []):
            s3.delete_object(Bucket=bucket, Key=obj["Key"])

        if not response.get("IsTruncated"):
            break
        list_kwargs["ContinuationToken"] = response["NextContinuationToken"]

# WARNING: destructive. Running this cell removes all documents from both
# Elasticsearch indices, deletes every Django File row, and empties the
# "redbox-storage-dev" bucket.
clear_index(index="redbox-data-chunk", es=ES_CLIENT)
clear_index(index="redbox-data-file", es=ES_CLIENT)
# The return value of delete() is assigned to `_` so it is not displayed.
_ = RB_APP.File.objects.all().delete()
clear_bucket(bucket="redbox-storage-dev", s3=S3_CLIENT)

In [None]:
def count_bucket_objects(bucket: str, s3: S3Client = S3_CLIENT) -> int:
    """Count the objects in an S3 bucket, following pagination.

    Args:
        bucket: Name of the bucket to count.
        s3: S3 client used for listing; defaults to the notebook-wide client.

    Returns:
        Total number of objects listed across all pages, or 0 when the first
        listing response has no "Contents" entry.
    """
    page = s3.list_objects_v2(Bucket=bucket)

    # No "Contents" key on the first page: nothing to count.
    if "Contents" not in page:
        return 0

    total = len(page["Contents"])

    # Keep requesting pages while the service reports a truncated listing.
    while page["IsTruncated"]:
        page = s3.list_objects_v2(
            Bucket=bucket,
            ContinuationToken=page["NextContinuationToken"],
        )
        total += len(page["Contents"])

    return total

def count_uploads(
    es: Elasticsearch = ES_CLIENT, 
    dj_shell: Plus = RB_APP, 
    s3: S3Client = S3_CLIENT
):
    """Report how many items each store currently holds.

    Args:
        es: Elasticsearch client used to count index documents.
        dj_shell: dj_notebook handle used to count Django File rows.
        s3: S3 client used to count bucket objects.

    Returns:
        A dict with the keys "django_files", "s3_files", "elastic_files"
        and "elastic_chunks", each mapped to a count.
    """

    def _index_size(index_name: str):
        # Count every document in the index with a match-all query.
        result = es.count(index=index_name, body={"query": {"match_all": {}}})
        return result["count"]

    counts = {}
    counts["django_files"] = dj_shell.File.objects.count()
    counts["s3_files"] = count_bucket_objects(bucket="redbox-storage-dev", s3=s3)
    counts["elastic_files"] = _index_size("redbox-data-file")
    counts["elastic_chunks"] = _index_size("redbox-data-chunk")
    return counts

count_uploads()

Now we embed and upload.

In [None]:
def add_to_django(
    file_path: Path,
    user_uuid: UUID = USER_UUID,
    dj_shell: Plus = RB_APP,
):
    """Create a Django ``File`` record for a local file.

    The file's bytes are wrapped in a ``SimpleUploadedFile`` and attached to a
    new ``File`` row that is marked as "complete", owned by the given user,
    and given a freshly generated ``core_file_uuid``.

    Args:
        file_path: Path of the local file to register.
        user_uuid: ID of the Django user who will own the file.
        dj_shell: dj_notebook handle exposing the ``File`` and ``User`` models.

    Returns:
        The created Django ``File`` instance.
    """
    upload = SimpleUploadedFile(
        name=file_path.name,
        content=file_path.read_bytes(),
    )

    file = dj_shell.File.objects.create(
        status="complete",
        original_file=upload,
        # Look the user up through the shell that was passed in; previously the
        # global RB_APP was used here, silently ignoring the dj_shell argument.
        user=dj_shell.User.objects.get(id=user_uuid),
        original_file_name=file_path.name,
        core_file_uuid=uuid4(),
    )
    # NOTE(review): objects.create() normally persists the row already; the
    # explicit save() is kept so the original behaviour is unchanged.
    file.save()

    return file

def embed_and_upload_file(
    file_path: Path,
    user_uuid: UUID = USER_UUID,
    s3_client: S3Client = S3_CLIENT,
    vector_store: VectorStore = VECTOR_STORE,
    dj_shell: Plus = RB_APP,
) -> None:
    """Register, store, chunk and embed a single local file.

    Steps, in order:
      1. Create the Django ``File`` record via ``add_to_django``.
      2. Upload the raw bytes to the "redbox-storage-dev" bucket.
      3. Chunk the file with ``UnstructuredDocumentLoader``.
      4. Add the chunks to the vector store.

    Args:
        file_path: Path of the local file to process.
        user_uuid: ID of the user recorded as creator of the file.
        s3_client: S3 client used for the upload.
        vector_store: Vector store that receives the chunks.
        dj_shell: dj_notebook handle passed on to ``add_to_django``.
    """
    print(f"Processing {file_path.name}")

    # Django record first: its UUID and URL drive everything that follows.
    django_record = add_to_django(
        file_path=file_path,
        user_uuid=user_uuid,
        dj_shell=dj_shell,
    )

    # Last URL part is used as the object key, the part at index 1 as the bucket.
    url_parts = django_record.url.parts
    object_key = url_parts[-1]

    core_file = File(
        uuid=django_record.core_file_uuid,
        key=object_key,
        bucket=url_parts[1],
        creator_user_uuid=user_uuid,
    )

    # NOTE(review): the upload bucket is hard-coded while the File record takes
    # its bucket from the URL; confirm the two always agree.
    with file_path.open("rb") as raw:
        s3_client.upload_fileobj(raw, "redbox-storage-dev", object_key)

    print(f"Added {file_path.name} to S3 and Django")

    # The handle must stay open while the loader is consumed.
    with file_path.open("rb") as raw:
        chunker = UnstructuredDocumentLoader(file=core_file, file_bytes=raw, env=ENV)
        chunks = list(chunker.lazy_load())

    print(f"Chunked {file_path.name} ({len(chunks)} chunks)")

    vector_store.add_documents(chunks)

    print(f"{file_path.name} complete!")


In [None]:
# Local directory to ingest. Edit this for your own machine.
DIR_TO_UPLOAD: Path = Path("/Users/willlangdale/Downloads/Demo Data/Energy")
# DIR_TO_UPLOAD: Path = Path("/Users/willlangdale/Downloads/D&D")
# DIR_TO_UPLOAD: Path = Path("/Users/willlangdale/Downloads/Lit")
# DIR_TO_UPLOAD: Path = Path("/Users/willlangdale/Downloads/DS")

# "[!.]*.*" matches names that do not start with a dot and contain a dot,
# which skips hidden files such as .DS_Store.
for file_path in DIR_TO_UPLOAD.rglob("[!.]*.*"):
    # The pattern also matches directories with a dot in their name;
    # those cannot be opened as files, so skip them.
    if not file_path.is_file():
        continue
    embed_and_upload_file(
        file_path=file_path,
        user_uuid=USER_UUID,
        s3_client=S3_CLIENT,
        vector_store=VECTOR_STORE,
        dj_shell=RB_APP,
    )

Let's check it uploaded okay.

In [None]:
count_uploads()

Now you can bring up the Django app and use the files.

```
docker compose up django-app -d --wait
```

## (Optional) Embed and save

Although this is done elsewhere, I found it helpful to keep a copy here. Here we save the embeddings to `.jsonl` with a random file UUID, without adding anything to Django.

In [10]:
test = Path("/Users/willlangdale/Downloads/Demo Data/Energy") / "CS013b_Energy stats monthly brief september 2022.pdf"

In [8]:
[i for i in test.glob('*.*')]

[PosixPath('/Users/willlangdale/Downloads/Demo Data/Energy/.DS_Store'),
 PosixPath('/Users/willlangdale/Downloads/Demo Data/Energy/CS013b_Energy stats monthly brief september 2022.pdf'),
 PosixPath('/Users/willlangdale/Downloads/Demo Data/Energy/CS013a_Cabinet Office Mail - Fwd_ No.10 brief on monthly energy statistics - September 2022 + publication of September 2022 editions of Energy Trends and Energy Prices.pdf')]

In [13]:
test_user = uuid4()

In [14]:
# Chunk the single test file without touching Django or S3: the File record is
# a throwaway with a random UUID and empty key/bucket.
with test.open("rb") as f:
    es_file = File(
        uuid=uuid4(),
        key="", 
        bucket="", 
        creator_user_uuid=test_user,
    )

    loader = UnstructuredDocumentLoader(
        file=es_file,
        file_bytes=f,
        env=ENV
    )

    # Consume the lazy loader while the file handle is still open.
    chunks = list(loader.lazy_load())

In [15]:
chunks

[Document(metadata={'parent_file_uuid': UUID('34e870a7-5f1c-4cd1-a909-754e50da12c1'), 'creator_user_uuid': UUID('5a8b37eb-757b-406b-95e4-89ca4cac883b'), 'index': 0, 'page_number': 1, 'languages': ['eng'], 'link_texts': None, 'link_urls': None, 'links': None, 'created_datetime': datetime.datetime(2024, 7, 12, 6, 16, 24, 74406, tzinfo=datetime.timezone.utc), 'token_count': 13}, page_content='BEIS - Monthly energy statistics briefing note\n\nSeptember 2022'),
 Document(metadata={'parent_file_uuid': UUID('34e870a7-5f1c-4cd1-a909-754e50da12c1'), 'creator_user_uuid': UUID('5a8b37eb-757b-406b-95e4-89ca4cac883b'), 'index': 1, 'page_number': 1, 'languages': ['eng'], 'link_texts': None, 'link_urls': None, 'links': None, 'created_datetime': datetime.datetime(2024, 7, 12, 6, 16, 24, 76018, tzinfo=datetime.timezone.utc), 'token_count': 64}, page_content='Highlights for the 3 month period of May to July 2022 compared to the same period a year earlier: ● Primary energy consumption in the UK on a fuel

In [32]:
chunks[0].to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'metadata': {'parent_file_uuid': UUID('34e870a7-5f1c-4cd1-a909-754e50da12c1'),
   'creator_user_uuid': UUID('5a8b37eb-757b-406b-95e4-89ca4cac883b'),
   'index': 0,
   'page_number': 1,
   'languages': ['eng'],
   'link_texts': None,
   'link_urls': None,
   'links': None,
   'created_datetime': datetime.datetime(2024, 7, 12, 6, 16, 24, 74406, tzinfo=datetime.timezone.utc),
   'token_count': 13},
  'page_content': 'BEIS - Monthly energy statistics briefing note\n\nSeptember 2022',
  'type': 'Document'}}

In [23]:
chunks[0].dict()

{'id': None,
 'metadata': {'parent_file_uuid': UUID('34e870a7-5f1c-4cd1-a909-754e50da12c1'),
  'creator_user_uuid': UUID('5a8b37eb-757b-406b-95e4-89ca4cac883b'),
  'index': 0,
  'page_number': 1,
  'languages': ['eng'],
  'link_texts': None,
  'link_urls': None,
  'links': None,
  'created_datetime': datetime.datetime(2024, 7, 12, 6, 16, 24, 74406, tzinfo=datetime.timezone.utc),
  'token_count': 13},
 'page_content': 'BEIS - Monthly energy statistics briefing note\n\nSeptember 2022',
 'type': 'Document'}

In [34]:
# Build embedded dicts for a small sample of chunks, to inspect their shape.
chunks_embedded: list[dict] = []

for i, chunk in enumerate(chunks):
    # Only the first three chunks (indices 0-2) are embedded.
    if i > 2:
        break
    chunk_dict = chunk.dict()
    # Store the text under "text" and its vector under "embedding".
    chunk_dict["text"] = chunk.page_content
    chunk_dict["embedding"] = EMBEDDING_MODEL.embed_documents([chunk.page_content])[0]

    # Drop the Document fields that are not wanted in the saved record.
    del chunk_dict["page_content"]
    del chunk_dict["type"]
    del chunk_dict["id"]
    
    chunks_embedded.append(chunk_dict)

In [35]:
chunks_embedded[0]

{'metadata': {'parent_file_uuid': UUID('34e870a7-5f1c-4cd1-a909-754e50da12c1'),
  'creator_user_uuid': UUID('5a8b37eb-757b-406b-95e4-89ca4cac883b'),
  'index': 0,
  'page_number': 1,
  'languages': ['eng'],
  'link_texts': None,
  'link_urls': None,
  'links': None,
  'created_datetime': datetime.datetime(2024, 7, 12, 6, 16, 24, 74406, tzinfo=datetime.timezone.utc),
  'token_count': 13},
 'text': 'BEIS - Monthly energy statistics briefing note\n\nSeptember 2022',
 'embedding': [0.015524807386100292,
  0.013220459222793579,
  -0.013515888713300228,
  0.006953665986657143,
  -0.020428933203220367,
  -0.024816058576107025,
  0.0012601903872564435,
  0.05229097977280617,
  -0.01579069346189499,
  0.005860577803105116,
  0.03686957061290741,
  -0.04384170100092888,
  -0.03855351731181145,
  -0.0355696827173233,
  -0.02855323813855648,
  0.007352495566010475,
  -0.009409422054886818,
  -0.03722408786416054,
  0.06647158414125443,
  -0.018892701715230942,
  0.008235090412199497,
  -0.01058375

In [76]:
def file_to_embedded_dict(
    file_path: Path,
    user_uuid: UUID = USER_UUID,
) -> list[dict]:
    """Chunk and embed a file, returning one dictionary per chunk.

    Aims to replicate the shape the vector store adds, but uses a random file
    UUID instead of Django's. UUIDs are passed to ``File`` as strings.

    Args:
        file_path: Path of the local file to chunk and embed.
        user_uuid: ID recorded as the creator of the file.

    Returns:
        A list of dicts, one per chunk. Each is the chunk's ``dict()`` with
        "page_content", "type" and "id" removed and "text" and "embedding"
        added.
    """
    print(f"Processing {file_path.name}")

    es_file = File(
        uuid=str(uuid4()),
        key="",
        bucket="",
        creator_user_uuid=str(user_uuid),
    )

    # Chunk: the handle must stay open while the lazy loader is consumed.
    with file_path.open("rb") as f:
        loader = UnstructuredDocumentLoader(
            file=es_file,
            file_bytes=f,
            env=ENV,
        )
        chunks = list(loader.lazy_load())

    print(f"Chunked {file_path.name} ({len(chunks)} chunks)")

    # Embed every chunk in a single embed_documents call rather than one call
    # per chunk. The model is not called at all for a file with no chunks.
    texts = [chunk.page_content for chunk in chunks]
    embeddings = EMBEDDING_MODEL.embed_documents(texts) if texts else []

    chunks_embedded: list[dict] = []
    for chunk, embedding in zip(chunks, embeddings):
        chunk_dict = chunk.dict()
        chunk_dict["text"] = chunk_dict.pop("page_content")
        chunk_dict["embedding"] = embedding

        # Remove Document bookkeeping fields; tolerate their absence.
        chunk_dict.pop("type", None)
        chunk_dict.pop("id", None)

        chunks_embedded.append(chunk_dict)

    print(f"Embedded {file_path.name} ({len(chunks_embedded)} chunks)")

    return chunks_embedded

def save_doc_dicts_to_jsonl(documents: Iterable[dict], file_path: str) -> None:
    """Write document dictionaries to a JSON Lines file, one per line.

    ``datetime.datetime`` values are written in ISO format and ``UUID`` values
    as strings. Any other non-serialisable value raises ``TypeError``.

    Args:
        documents: The dictionaries to write.
        file_path: Destination file, opened in write mode.
    """

    def _to_jsonable(value):
        # Called by json only for values it cannot serialise natively.
        if isinstance(value, datetime.datetime):
            return value.isoformat()
        if isinstance(value, UUID):
            return str(value)
        raise TypeError(
            f"Object of type {value.__class__.__name__} is not JSON serializable"
        )

    def _dumps(record: dict):
        return json.dumps(record, default=_to_jsonable)

    with jsonlines.open(file_path, mode="w", dumps=_dumps) as writer:
        writer.write_all(documents)

def embed_dir(dir_path: Path, out_file: Path, user_uuid: UUID = USER_UUID) -> None:
    """Embed every file under a directory and save the result as JSON Lines.

    Files are found recursively with the pattern "[!.]*.*" (names that do not
    start with a dot and contain a dot). Two kinds of match are skipped:

    * anything that is not a regular file, since the pattern also matches
      directories with a dot in their name;
    * ``out_file`` itself, so that re-running with an output file inside
      ``dir_path`` does not ingest the previous run's output.

    Args:
        dir_path: Directory to search recursively.
        out_file: Path of the ``.jsonl`` file to write.
        user_uuid: ID recorded as the creator of each file.
    """
    print(f"Embedding {dir_path}")

    # resolve() works for a path that does not exist yet.
    out_path = Path(out_file).resolve()

    chunks: list[dict] = []
    for file_path in dir_path.rglob("[!.]*.*"):
        if not file_path.is_file() or file_path.resolve() == out_path:
            continue
        chunks += file_to_embedded_dict(
            file_path=file_path,
            user_uuid=user_uuid,
        )

    save_doc_dicts_to_jsonl(documents=chunks, file_path=out_file)
    print(f"Saved {dir_path} to {out_file}")

In [74]:
chunks_embedded[0]
save_doc_dicts_to_jsonl(chunks_embedded, Path("test.jsonl"))


In [56]:
chunks_test = file_to_embedded_dict(test)

Processing CS013b_Energy stats monthly brief september 2022.pdf
Chunked CS013b_Energy stats monthly brief september 2022.pdf (61 chunks)
Embedded CS013b_Energy stats monthly brief september 2022.pdf (61 chunks)


In [57]:
chunks_test

[{'metadata': {'parent_file_uuid': UUID('f13bdbb6-e521-4cd2-8179-147215355506'),
   'creator_user_uuid': UUID('8dec0bc5-dd02-4b67-910d-77e225cdbc44'),
   'index': 0,
   'page_number': 1,
   'languages': ['eng'],
   'link_texts': None,
   'link_urls': None,
   'links': None,
   'created_datetime': datetime.datetime(2024, 7, 12, 7, 8, 7, 53645, tzinfo=datetime.timezone.utc),
   'token_count': 13},
  'text': 'BEIS - Monthly energy statistics briefing note\n\nSeptember 2022',
  'embedding': [0.015524807386100292,
   0.013220459222793579,
   -0.013515888713300228,
   0.006953665986657143,
   -0.020428933203220367,
   -0.024816058576107025,
   0.0012601903872564435,
   0.05229097977280617,
   -0.01579069346189499,
   0.005860577803105116,
   0.03686957061290741,
   -0.04384170100092888,
   -0.03855351731181145,
   -0.0355696827173233,
   -0.02855323813855648,
   0.007352495566010475,
   -0.009409422054886818,
   -0.03722408786416054,
   0.06647158414125443,
   -0.018892701715230942,
   0.008

In [77]:
# Local directory whose files are embedded. Edit this for your own machine.
DIR_TO_SAVE: Path = Path("/Users/willlangdale/Downloads/Demo Data/Energy")
# DIR_TO_SAVE: Path = Path("/Users/willlangdale/Downloads/D&D")
# DIR_TO_SAVE: Path = Path("/Users/willlangdale/Downloads/Lit")
# DIR_TO_SAVE: Path = Path("/Users/willlangdale/Downloads/DS")

# Note: despite its name, OUT_DIR is the output *file*, and it lives inside
# the directory being embedded.
OUT_DIR: Path = DIR_TO_SAVE / "text-embedding-3-large.jsonl"

embed_dir(dir_path=DIR_TO_SAVE, out_file=OUT_DIR)

Embedding /Users/willlangdale/Downloads/DS
Processing KAN- Kolmogorov–Arnold Networks.pdf
Chunked KAN- Kolmogorov–Arnold Networks.pdf (647 chunks)
Embedded KAN- Kolmogorov–Arnold Networks.pdf (647 chunks)
Processing Mamba- Linear-Time Sequence Modeling with Selective State Spaces.pdf
Chunked Mamba- Linear-Time Sequence Modeling with Selective State Spaces.pdf (658 chunks)
Embedded Mamba- Linear-Time Sequence Modeling with Selective State Spaces.pdf (658 chunks)
Saved /Users/willlangdale/Downloads/DS to /Users/willlangdale/Downloads/DS/text-embedding-3-large.jsonl
