In [1]:
from uuid import uuid4
from typing import List, TYPE_CHECKING
from functools import reduce, wraps
from operator import itemgetter
from pathlib import Path
from dataclasses import dataclass
import os

import boto3
from pydantic import Field
from faststream.redis.fastapi import RedisRouter
from elasticsearch import Elasticsearch
from langchain.chains import (
    StuffDocumentsChain,
    LLMChain,
    ReduceDocumentsChain,
    MapReduceDocumentsChain,
)
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatLiteLLM
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_elasticsearch import ApproxRetrievalStrategy, ElasticsearchStore
from langchain.schema import Document
from langchain_core.runnables import RunnableLambda, Runnable, chain, RunnablePassthrough, RunnableBranch
from langchain_core.runnables.config import RunnableConfig
from langchain.schema import StrOutputParser, Document
from langchain_core.runnables.base import RunnableEach
from unstructured.partition.auto import partition
from unstructured.chunking.basic import chunk_elements

from core_api.src.publisher_handler import FilePublisher
from redbox.storage import ElasticsearchStorageHandler
from redbox.models import File
from redbox.models.settings import Settings
from redbox.models.file import Metadata, UUID, PersistableModel
from redbox.models.chat import ChatRequest, ChatResponse
from redbox.storage import ElasticsearchStorageHandler
from redbox.llm.prompts.core import _core_redbox_prompt
from redbox.storage.storage_handler import BaseStorageHandler
from redbox.models.file import Chunk, File
from redbox.model_db import SentenceTransformerDB


In [2]:
creator_user_uuid=UUID('673f53f0-15e5-4ca1-be4b-41adcf602ab8')

In [33]:
env = Settings(_env_file=".env")
es_root_index = "summarisation"

model_name = "gpt-4"
context_length = 8192
output_max_tokens = 512
ingest_chunk_size = 10000


max_llm_concurrency=128

es = Elasticsearch(
    hosts=[
        {
            "host": "localhost",
            "port": env.elastic.port,
            "scheme": env.elastic.scheme,
        }
    ],
    basic_auth=(env.elastic.user, env.elastic.password),
)
if env.elastic.subscription_level == "basic":
    strategy = ApproxRetrievalStrategy(hybrid=False)
elif env.elastic.subscription_level in ["platinum", "enterprise"]:
    strategy = ApproxRetrievalStrategy(hybrid=True)

sentence_transformer_db = SentenceTransformerDB(env.embedding_model)

s3_client = boto3.client("s3", endpoint_url=f"http://{env.minio_host}:{env.minio_port}", aws_access_key_id=env.aws_access_key, aws_secret_access_key=env.aws_secret_key)

storage_handler = ElasticsearchStorageHandler(
    es_client=es,
    root_index=es_root_index
)

api_base="https://oai-i-dot-ai-playground-sweden.openai.azure.com/"

print(os.environ["AZURE_OPENAI_API_KEY"])

llm = ChatLiteLLM(
    model=f"azure/{model_name}",
    api_base=api_base,
    max_tokens=output_max_tokens
)


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu


a877928388654d678aab2e5014c96c3a


In [34]:

### Ingest Pipeline ###

if TYPE_CHECKING:
    from mypy_boto3_s3.client import S3Client
else:
    S3Client = object

@dataclass
class LocalFile:
    creator_user_uuid: UUID
    filepath: Path


def upload_file(
        storage_handler: BaseStorageHandler, 
        s3: S3Client,
        env: Settings
    ):
    @chain
    def wrapped(local_file: LocalFile) -> File:
        file_uuid = str(uuid4())
        s3.put_object(Bucket=env.bucket_name, Key=str(file_uuid), Body=open(local_file.filepath, 'rb'))
        file = File(uuid=file_uuid, creator_user_uuid=local_file.creator_user_uuid, key=file_uuid, bucket=env.bucket_name)
        storage_handler.write_item(file)
        return file
    return wrapped


def file_chunker(env: Settings, s3_client: S3Client, max_chunk_size: int = 20000):
    @chain
    def wrapped(file: File) -> List[Chunk]:
        authenticated_s3_url = s3_client.generate_presigned_url(
            "get_object",
            Params={"Bucket": file.bucket, "Key": file.key},
            ExpiresIn=3600,
        )
        elements = partition(url=authenticated_s3_url, strategy=env.partition_strategy)
        raw_chunks = chunk_elements(
            elements, 
            new_after_n_chars=max_chunk_size, 
            max_characters=max_chunk_size+32
        )
        print(f"Elements chunked")
        return [
            Chunk(
                parent_file_uuid=file.uuid,
                index=i,
                text=raw_chunk.text,
                metadata=Metadata(
                    parent_doc_uuid=file.uuid,
                    page_number=raw_chunk.metadata.page_number,
                    languages=raw_chunk.metadata.languages,
                    link_texts=raw_chunk.metadata.link_texts,
                    link_urls=raw_chunk.metadata.link_urls,
                    links=raw_chunk.metadata.links,
                ),
                creator_user_uuid=file.creator_user_uuid,
            )
            for i, raw_chunk in enumerate(raw_chunks)
        ]
    return wrapped


def local_embedder(model: SentenceTransformerDB):
    @chain
    def wrapped(chunks: List[Chunk]) -> List[Chunk]:
        print(f"Starting Embedding")
        embedded_sentences = model.embed_sentences([c.text for c in chunks])
        for i, c in enumerate(chunks):
            c.embedding = embedded_sentences.data[i].embedding
        return chunks
    return wrapped

def chunk_writer(storage_handler: BaseStorageHandler):
    @chain
    def wrapped(chunks: List[Chunk]) -> UUID:
        print(f"Writing Chunks")
        storage_handler.write_items(chunks)
        return chunks[0].parent_file_uuid
    return wrapped

def summarisation_ingest_chain(n=20000):
    chain =(
        upload_file(storage_handler, s3_client, env)
        | file_chunker(env, s3_client, max_chunk_size=n)
        | chunk_writer(storage_handler)
    )
    return chain


In [35]:
### Execution Ingest ###

es.options(ignore_status=[400,404]).indices.delete(index=f"{es_root_index}-file")
es.options(ignore_status=[400,404]).indices.delete(index=f"{es_root_index}-chunk")

summarisation_ingest = summarisation_ingest_chain(ingest_chunk_size)

ingest_result = summarisation_ingest.invoke(
    LocalFile(
        filepath=Path("../data/Conservative-Manifesto-GE2024.pdf"),
        creator_user_uuid=creator_user_uuid
    )
)

file_uuid = ingest_result
print(file_uuid)

INFO:elastic_transport.transport:DELETE http://localhost:9200/summarisation-file [status:200 duration:0.099s]
INFO:elastic_transport.transport:DELETE http://localhost:9200/summarisation-chunk [status:200 duration:0.089s]
INFO:elastic_transport.transport:PUT http://localhost:9200/summarisation-file/_doc/1af7f379-7d71-402f-8cfc-7aea375ecc99 [status:201 duration:0.361s]


Elements chunked
Writing Chunks


INFO:elastic_transport.transport:PUT http://localhost:9200/summarisation-chunk/_doc/2acdb8cb-210c-439b-98aa-2a1e227c9704 [status:201 duration:0.337s]
INFO:elastic_transport.transport:PUT http://localhost:9200/summarisation-chunk/_doc/3c6176be-fc90-4883-849b-8db54e2654ad [status:201 duration:0.002s]
INFO:elastic_transport.transport:PUT http://localhost:9200/summarisation-chunk/_doc/2e3fe64f-44fc-4dcf-858a-f2f17d143dd2 [status:201 duration:0.002s]
INFO:elastic_transport.transport:PUT http://localhost:9200/summarisation-chunk/_doc/c52a4299-093d-4c60-b3d6-302e63201454 [status:201 duration:0.002s]
INFO:elastic_transport.transport:PUT http://localhost:9200/summarisation-chunk/_doc/8e11a441-8c4c-4d7e-bb3b-07471f5bf729 [status:201 duration:0.003s]
INFO:elastic_transport.transport:PUT http://localhost:9200/summarisation-chunk/_doc/6b5f6338-327f-4738-9915-f14832063263 [status:201 duration:0.007s]
INFO:elastic_transport.transport:PUT http://localhost:9200/summarisation-chunk/_doc/8f78d5f1-ac58-44

1af7f379-7d71-402f-8cfc-7aea375ecc99


In [40]:

### Summarisation Pipeline ###

system_prompt = "You are a helpful assistant who extracts the main points from documents and produces a simple summary in paragraphs"
map_prompt_template = "Give me a summary of this document: {context}"
reduce_prompt_template = "Summarise this document: {docs}"
max_prompt_size = context_length - output_max_tokens - len(system_prompt) - max(len(map_prompt_template), len(reduce_prompt_template))

def document_reader(storage_handler: BaseStorageHandler, user_uuid):
    @chain
    def wrapped(parent_file_uuid):
        chunks = storage_handler.get_file_chunks(
            parent_file_uuid=parent_file_uuid,
            user_uuid=user_uuid
        )
        return [
            Document(page_content=chunk.text, metadata={"source": "local"})
            for chunk in chunks
        ]
    return wrapped




@chain
def summarise(file_uuid):
    docs = document_reader(storage_handler, creator_user_uuid).invoke(file_uuid)
    mapreduce_loops = 0
    while mapreduce_loops < 3:
        summaries = (
            ChatPromptTemplate.from_messages([
                ("system", system_prompt),
                ("human", map_prompt_template)
            ])
            | llm
        ).batch(
            docs, 
            config=RunnableConfig(
                max_concurrency=max_llm_concurrency
            )
        )
        prompt_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            model_name=model_name, chunk_size=max_prompt_size, chunk_overlap=0
        )

        def combine_summaries(summaries):
            return " ; ".join([s.content for s in summaries[:24]])

        combined = combine_summaries(summaries)
        summarise_prompt = ChatPromptTemplate.from_messages([
                ("system", system_prompt),
                ("human", reduce_prompt_template)
            ]).invoke(combined)
        prompt_multiples_of_context_length = len(prompt_splitter.split_text(str(summarise_prompt)))
        if (prompt_multiples_of_context_length == 1):
            # Stop summarising the summaries we can go to final summary
            print("Going to final summarisation")
            break
        else:
            # We can't do a summary of all docs due to length so combine them into two docs and summarise again
            print(f"Prompt too large ({prompt_multiples_of_context_length} x Context Length) - mapreducing again")
            docs = [
                combine_summaries(summaries[i:min(i+prompt_multiples_of_context_length, len(summaries))])
                for i in range(0, len(summaries)+prompt_multiples_of_context_length, prompt_multiples_of_context_length)
            ]
            mapreduce_loops += 1
    else:
        # Panic because we're looping a long time to get this down to a reasonable size?
        print("Too many loops")
    result = llm.invoke(summarise_prompt)
    return StrOutputParser().invoke(result)


In [41]:
#file_uuid="d3fb2d57-15a2-451c-806e-19fb17f8da21"
answer = summarise.invoke(file_uuid)

print(f"[{file_uuid}] {answer}")

INFO:elastic_transport.transport:POST http://localhost:9200/summarisation-chunk/_search?scroll=5m [status:200 duration:0.006s]
INFO:elastic_transport.transport:POST http://localhost:9200/_search/scroll [status:200 duration:0.002s]
INFO:elastic_transport.transport:DELETE http://localhost:9200/_search/scroll [status:200 duration:0.001s]
[92m08:30:22 - LiteLLM:INFO[0m: utils.py:1307 - [92m

POST Request Sent from LiteLLM:
curl -X POST \
https://oai-i-dot-ai-playground-sweden.openai.azure.com/ \
-H 'api_key: a87792838865********************' -H 'azure_ad_token: None' \
-d '{'model': 'gpt-4', 'messages': [{'role': 'system', 'content': 'You are a helpful assistant who extracts the main points from documents and produces a simple summary in paragraphs'}, {'role': 'user', 'content': "Give me a summary of this document: page_content='The Conservative and Unionist Party Manifesto 2024\\n\\nThe Conservative and Unionist Party Manifesto 2024\\n\\nContents ❱ Foreword ����������������������������

Going to final summarisation


INFO:httpx:HTTP Request: POST https://oai-i-dot-ai-playground-sweden.openai.azure.com//openai/deployments/gpt-4/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
[92m08:32:09 - LiteLLM:INFO[0m: utils.py:3368 - Wrapper: Completed Call, calling success_handler
INFO:LiteLLM:Wrapper: Completed Call, calling success_handler


[1af7f379-7d71-402f-8cfc-7aea375ecc99] The Conservative and Unionist Party Manifesto 2024 details the party’s objectives concerning economic stability, national security, societal growth, and various other government facets. Key points include tax reduction for individuals, the provision of support for self-employed workers and pensioners, and upholding benefits resulting from Brexit. The party intends to reform education by mandating National Service for graduates and policing stricter immigration caps for border security. They also plan to bolster community strength via additional NHS spending and the recruitment of more police officers.

The document also highlights the UK's commitment to growing its economy, with strategies like reducing borrowing, supporting business innovation, implementing tax cuts, and transitioning to sustainable energy on an affordable budget. SMEs are considered essential, with a ten-point plan laid out for encouraging their growth.

The document outlines pl

In [42]:
import pprint

print(pprint.pformat(answer, width=120, compact=True))
#    print(line)

('The Conservative and Unionist Party Manifesto 2024 details the party’s objectives concerning economic stability, '
 'national security, societal growth, and various other government facets. Key points include tax reduction for '
 'individuals, the provision of support for self-employed workers and pensioners, and upholding benefits resulting '
 'from Brexit. The party intends to reform education by mandating National Service for graduates and policing stricter '
 'immigration caps for border security. They also plan to bolster community strength via additional NHS spending and '
 'the recruitment of more police officers.\n'
 '\n'
 "The document also highlights the UK's commitment to growing its economy, with strategies like reducing borrowing, "
 'supporting business innovation, implementing tax cuts, and transitioning to sustainable energy on an affordable '
 'budget. SMEs are considered essential, with a ten-point plan laid out for encouraging their growth.\n'
 '\n'
 'The document 