In [None]:
# Load (or reload) IPython's autoreload extension. Mode 2 reloads all imported
# modules before each cell executes, so edits to local source files are picked
# up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import sys
from uuid import UUID, uuid4
import pandas as pd
from pathlib import Path
from mypy_boto3_s3.client import S3Client
from botocore.exceptions import ClientError

from django.core.files.uploadedfile import SimpleUploadedFile

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan, bulk

from redbox.models import Settings, Chunk, File
from redbox.models.settings import ElasticLocalSettings
from redbox.model_db import SentenceTransformerDB
from redbox.parsing import chunk_file

from langchain_core.embeddings import Embeddings
from langchain.globals import set_verbose

from dj_notebook import activate, Plus

# Parent of the current working directory; the paths below treat it as the
# repository root (assumes the notebook runs from a first-level subdirectory
# of the repo — TODO confirm).
ROOT = Path().resolve().parent

# Turn off LangChain's global verbose output.
set_verbose(False)

from dotenv import find_dotenv, load_dotenv

# Load variables from the root-level .env file into the process environment;
# the boolean result is discarded.
_ = load_dotenv(find_dotenv(ROOT / '.env'))

# Do not truncate long strings when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

# Settings with the MinIO and Elasticsearch hosts overridden to localhost
# (presumably so the notebook can reach locally exposed services — confirm).
ENV = Settings(
    minio_host="localhost", 
    object_store="minio", 
    elastic=ElasticLocalSettings(host="localhost"),
)
# Embedding model name from the settings; used later to build the embedder.
MODEL = ENV.embedding_model

# Shared clients used as default arguments by the helper functions below.
S3_CLIENT = ENV.s3_client()
ES_CLIENT = ENV.elasticsearch_client()

# Create the storage bucket; an already-owned bucket is fine, any other
# client error is re-raised.
try:
    S3_CLIENT.create_bucket(
        Bucket=ENV.bucket_name,
        CreateBucketConfiguration={"LocationConstraint": ENV.aws_region},
    )
except ClientError as e:
    if e.response["Error"]["Code"] != "BucketAlreadyOwnedByYou":
        raise

# Put the Django project directory first on the import path before activating it.
sys.path.insert(0, str(ROOT / "django_app"))

# Activate Django via dj_notebook using the Django app's own .env file.
# RB_APP is the handle used below to reach the models (RB_APP.User, RB_APP.File).
RB_APP = activate(
    dotenv_file=str(ROOT / "django_app/.env")
)

# Quick upload

The worker takes forever locally. This notebook chunks, embeds and uploads files using your local machine, which is much quicker for me. Unlike the eval notebooks, this also creates entries in the Postgres database.

This notebook needs the following services running:

```
docker compose up core-api db -d
```

It's also important that both `.env` files contain the same embedding model.

## Get your user UUID

In [None]:
# Show the id and email of every Django user so you can look up your own UUID.
RB_APP.read_frame(RB_APP.User.objects.all())[["id", "email"]]

In [None]:
# Id of the Django user who will own the uploaded files. Replace this with your
# own id from the user table shown by the previous cell.
USER_UUID = UUID("5c37bf4c-002c-458d-9e68-03042f76a5b1")

## Embed and upload

Consider clearing all existing files from Elastic, Postgres and the S3 bucket before uploading.

In [None]:
# Show the id and original file name of every file currently recorded in Django.
RB_APP.read_frame(RB_APP.File.objects.all())[["id", "original_file_name"]]

In [None]:
def clear_index(index: str, es: Elasticsearch) -> None:
    """Delete every document in an Elasticsearch index.

    Documents are discovered with a scrolling scan and removed with the bulk
    helper. Delete actions are streamed to ``bulk`` as the scan yields hits,
    so the full hit list is never held in memory.

    Args:
        index: Name of the index to empty.
        es: Client used for both the scan and the bulk delete.
    """
    # Only _index and _id are needed to build a delete action, so ask
    # Elasticsearch not to return each document's _source.
    hits = scan(
        es,
        index=index,
        query={"query": {"match_all": {}}, "_source": False},
    )
    delete_actions = (
        {"_op_type": "delete", "_index": hit["_index"], "_id": hit["_id"]}
        for hit in hits
    )
    bulk(es, delete_actions, request_timeout=300)

def clear_bucket(bucket: str, s3: S3Client) -> None:
    """Delete every object in an S3 bucket.

    ``list_objects_v2`` returns at most 1,000 keys per call, so the listing is
    paginated to make sure buckets larger than one page are fully emptied.

    Args:
        bucket: Name of the bucket to empty.
        s3: Client used to list and delete the objects.
    """
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket):
        # "Contents" is absent from a page when there is nothing to list.
        for obj in page.get("Contents", []):
            s3.delete_object(Bucket=bucket, Key=obj["Key"])

# DESTRUCTIVE: empties both Elastic indices, deletes every Django File record,
# and removes the objects in the storage bucket. Skip this cell to keep
# existing uploads.
clear_index(index="redbox-data-chunk", es=ES_CLIENT)
clear_index(index="redbox-data-file", es=ES_CLIENT)
_ = RB_APP.File.objects.all().delete()
clear_bucket(bucket="redbox-storage-dev", s3=S3_CLIENT)

In [None]:
def count_bucket_objects(bucket: str, s3: S3Client = S3_CLIENT) -> int:
    """Return the number of objects in a bucket, following pagination.

    Args:
        bucket: Name of the bucket to count.
        s3: Client used to list the bucket's objects.

    Returns:
        Total objects across all listing pages; 0 when the first page lists
        nothing.
    """
    page = s3.list_objects_v2(Bucket=bucket)
    if "Contents" not in page:
        return 0

    total = len(page["Contents"])
    # Keep requesting pages for as long as the listing reports it was truncated.
    while page["IsTruncated"]:
        page = s3.list_objects_v2(
            Bucket=bucket,
            ContinuationToken=page["NextContinuationToken"],
        )
        total += len(page["Contents"])
    return total

def count_uploads(
    es: Elasticsearch = ES_CLIENT,
    dj_shell: Plus = RB_APP,
    s3: S3Client = S3_CLIENT
):
    """Report how many items each backing store currently holds.

    Args:
        es: Client used to count documents in the file and chunk indices.
        dj_shell: Shell giving access to the Django ``File`` model.
        s3: Client used to count objects in the storage bucket.

    Returns:
        A dict with the keys "django_files", "s3_files", "elastic_files" and
        "elastic_chunks", each mapped to a count.
    """

    def _document_count(index_name):
        # Count every document in the given index.
        reply = es.count(index=index_name, body={"query": {"match_all": {}}})
        return reply["count"]

    django_total = dj_shell.File.objects.count()
    bucket_total = count_bucket_objects(bucket="redbox-storage-dev", s3=s3)
    file_docs = _document_count("redbox-data-file")
    chunk_docs = _document_count("redbox-data-chunk")

    return {
        "django_files": django_total,
        "s3_files": bucket_total,
        "elastic_files": file_docs,
        "elastic_chunks": chunk_docs,
    }

# Counts in each store before the upload, for comparison with the check at the end.
count_uploads()

Now we embed and upload.

In [None]:
def add_to_django(
    file_path: Path,
    user_uuid: UUID = USER_UUID,
    dj_shell: Plus = RB_APP,
):
    """Create a Django ``File`` record for a local file.

    The record is created with status "complete", the file's bytes as its
    ``original_file``, and a freshly generated ``core_file_uuid``.

    Args:
        file_path: Path of the local file to register.
        user_uuid: Id of the Django user who will own the record.
        dj_shell: Shell giving access to the Django ``User`` and ``File`` models.

    Returns:
        The newly created Django ``File`` instance.

    Raises:
        OSError: If the file cannot be read.
        dj_shell.User.DoesNotExist: If no user has the id ``user_uuid``.
    """
    upload = SimpleUploadedFile(
        name=file_path.name,
        content=file_path.read_bytes(),
    )
    # Look the owner up through the shell that was passed in, so a caller
    # supplying its own dj_shell is not silently routed to the global RB_APP.
    owner = dj_shell.User.objects.get(id=user_uuid)

    # objects.create() already saves the new row, so no extra save() is needed.
    return dj_shell.File.objects.create(
        status="complete",
        original_file=upload,
        user=owner,
        original_file_name=file_path.name,
        core_file_uuid=uuid4(),
    )

def embed_and_upload_file(
    file_path: Path,
    model: SentenceTransformerDB,
    user_uuid: UUID = USER_UUID,
    es_client: Elasticsearch = ES_CLIENT,
    s3_client: S3Client = S3_CLIENT,
    dj_shell: Plus = RB_APP,
) -> None:
    """Register, store, chunk, embed and index a single local file.

    Steps, in order: create the Django record, upload the file to S3, split it
    into chunks, embed each chunk's text, then index the file document and
    every embedded chunk in Elasticsearch. Progress is printed after each step.

    Args:
        file_path: Path of the local file to process.
        model: Embedder; must provide ``embed_sentences(list_of_texts)`` whose
            result has a ``data`` sequence of items with an ``embedding``.
        user_uuid: Id of the user recorded as owner and creator of the file.
        es_client: Client used to index the file and chunk documents.
        s3_client: Client used to upload the file and passed to ``chunk_file``.
        dj_shell: Shell giving access to the Django models.
    """
    print(f"Processing {file_path.name}")

    # Add to Django
    dj_file = add_to_django(file_path=file_path, user_uuid=user_uuid, dj_shell=dj_shell)

    # The stored file's URL path is read as /<bucket>/.../<key>.
    url_parts = dj_file.url.parts
    bucket = url_parts[1]
    key = url_parts[-1]

    es_file = File(
        uuid=dj_file.core_file_uuid,
        key=key,
        bucket=bucket,
        creator_user_uuid=user_uuid,
    )

    # Add to S3, using the same bucket and key recorded on es_file so the
    # object is stored where the file record says it is.
    with open(file_path, "rb") as f:
        s3_client.upload_fileobj(f, bucket, key)

    print(f"Added {file_path.name} to S3 and Django")

    # Chunk
    chunks = chunk_file(file=es_file, s3_client=s3_client)

    print(f"Chunked {file_path.name} ({len(chunks)} chunks)")

    # Embed
    embedded = model.embed_sentences([chunk.text for chunk in chunks])
    embeddings = [item.embedding for item in embedded.data]

    print(f"Embedded {file_path.name} ({len(embeddings)} chunks)")

    # Merge: rebuild each chunk with its embedding attached. strict=True makes
    # a chunk/embedding count mismatch raise instead of silently truncating.
    es_chunks = [
        Chunk(
            uuid=chunk.uuid,
            created_datetime=chunk.created_datetime,
            creator_user_uuid=chunk.creator_user_uuid,
            parent_file_uuid=chunk.parent_file_uuid,
            index=chunk.index,
            text=chunk.text,
            metadata=chunk.metadata,
            embedding=embedding,
        )
        for chunk, embedding in zip(chunks, embeddings, strict=True)
    ]

    # Add to Elastic
    es_client.index(
        index="redbox-data-file",
        id=es_file.uuid,
        body=es_file.model_dump_json(),
    )

    for chunk in es_chunks:
        es_client.index(
            index="redbox-data-chunk",
            id=chunk.uuid,
            body=chunk.model_dump_json(),
        )

    print(f"{file_path.name} complete!")


In [None]:
# Local directory whose files will be uploaded; point this at your own data.
# DIR_TO_UPLOAD: Path = Path("/Users/willlangdale/Downloads/Demo Data")
# DIR_TO_UPLOAD: Path = Path("/Users/willlangdale/Downloads/D&D")
# DIR_TO_UPLOAD: Path = Path("/Users/willlangdale/Downloads/Lit")
DIR_TO_UPLOAD: Path = Path("/Users/willlangdale/Downloads/DS")

# Build the embedder once and reuse it, rather than constructing it per file.
EMBEDDING_MODEL = SentenceTransformerDB(embedding_model_name=MODEL)

# The pattern matches names that contain a dot and do not start with one,
# at any depth below DIR_TO_UPLOAD.
for file_path in DIR_TO_UPLOAD.rglob("[!.]*.*"):
    # The glob also matches directories with a dot in their name; skip them.
    if not file_path.is_file():
        continue

    embed_and_upload_file(
        file_path=file_path,
        model=EMBEDDING_MODEL,
        user_uuid=USER_UUID,
        es_client=ES_CLIENT,
        s3_client=S3_CLIENT,
        dj_shell=RB_APP,
    )

Let's check it uploaded okay.

In [None]:
# Counts in each store after the upload; compare with the counts taken earlier.
count_uploads()

Now you can bring up the Django app and use the files.

```
docker compose up django-app -d --wait
```