# Context

This code relates to building a feature that extracts a list of entities and entity metadata for an app from documents.

In [4]:
import asyncio
from typing import List
import instructor
import json
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel

# Synchronous OpenAI client wrapped by instructor; it is called below with a
# `response_model` argument in `generate_physical_objects`.
client = instructor.from_openai(OpenAI())


# One generated workplace document (the prompt mentions email, chat or memo).
# Used as the element type of the `response_model` in
# `generate_physical_objects`.
# NOTE(review): documented with `#` comments rather than a docstring because
# pydantic copies class docstrings into the generated JSON schema, which would
# presumably change what is sent along with the request -- confirm before
# converting these to docstrings.
class Document(BaseModel):
    title: str  # title of the document
    content: str  # body text; the prompt asks for markdown where appropriate


def generate_physical_objects(n_objects: int = 3) -> List[Document]:
    """Ask the model for a list of synthetic workplace documents.

    Args:
        n_objects: Number of documents requested in the prompt. The count
            actually returned is whatever the model produces; it is not
            checked here.

    Returns:
        The documents parsed into ``Document`` models, or an empty list when
        the request or the parsing fails (the error is printed, not raised).
    """
    # One instruction per entry, joined with newlines. The previous string
    # concatenation left out the separator after several sentences, so they
    # ran together in the prompt ("...real data.Respond only...").
    instructions = [
        f"Create a list of {n_objects} documents someone might be interacting with in the workplace, like email, chat or a memo.",
        "The content of the documents may share similar entities, themes and some content, but each document title and content must be unique.",
        "Contradicting information between one document and another related one, like in an email and a memo, is ok only to the extent you would expect that in real data.",
        "Respond only with the list of documents and their content.",
        "If a document would likely contain things like headings and rich text formatting, be sure to add such content formatted in markdown.",
    ]
    prompt = "\n".join(instructions)

    try:
        return client.chat.completions.create(
            model="gpt-4o-mini",
            response_model=List[Document],
            messages=[{"role": "user", "content": prompt}],
        )
    except Exception as e:
        # Best-effort by design: report the failure and let the caller
        # continue with an empty result.
        print(f"Error generating documents: {e}")
        return []


# Generate documents with the default count and print a short preview.
# NOTE: "unique" in the message reflects what the prompt asks for; nothing
# here verifies uniqueness. The [:10] slice simply returns fewer items when
# fewer than ten were generated.
objects = generate_physical_objects()
print(f"Created {len(objects)} unique objects")
print(f"First 10 objects: {objects[:10]}")

Created 3 unique objects
First 10 objects: [Document(title='Project Update Email', content='**Subject:** Project Update: Phase 2 Progress\n\nDear Team,\n\nI hope this message finds you well. I wanted to share a quick update on the progress of Phase 2 of our project. Currently, we are on track to complete the initial stages by the end of this month. \n\n### Key Highlights:\n- **Milestone 1** completed: All user requirements have been gathered.\n- **Milestone 2** in progress: Development team is working on the features.\n\n### Next Steps:\nPlease ensure that you complete your respective tasks and provide your input for the upcoming review meeting scheduled for next week.\n\nBest regards,\n\nJohn Doe  \nProject Manager'), Document(title='Team Chat Discussion', content="**Team Chat – Project Coordination**  \n**Date:** October 10, 2023  \n\n**John:** Just a reminder that the review meeting is on Friday. Let's aim to finalize our tasks by then.  \n**Emily:** I’m still working on Milestone 2

Chunk all documents

In [2]:
# Patch the AsyncOpenAI client with instructor; `make_reviews` awaits calls on
# it and passes a `response_model`.
async_client = instructor.from_openai(AsyncOpenAI())


# A single generated review. `make_reviews` requests `List[Review]` as the
# response model.
# NOTE(review): kept as `#` comments instead of a docstring because pydantic
# copies class docstrings into the generated JSON schema -- confirm before
# converting.
class Review(BaseModel):
    review: str  # free-text body of the review


# Flattened record pairing one review with the title and description of the
# item it was generated for. Instances are built in `make_reviews` and later
# written to the "reviews" LanceDB table and to reviews.json.
class AllObjectInfo(BaseModel):
    product_title: str  # title of the item the review is about
    product_description: str  # description text of that item
    review: str  # text of a single `Review`


async def make_reviews(
    product: "Product", n: int, semaphore: asyncio.Semaphore = asyncio.Semaphore(1)
) -> List[AllObjectInfo]:
    """Generate ``n`` synthetic reviews for a single item.

    Args:
        product: Object exposing ``title`` plus either ``description`` or
            ``content``. The annotation is a string because no ``Product``
            class is defined in the visible part of this file; an unquoted
            name would raise ``NameError`` when this ``def`` is evaluated.
        n: Number of reviews requested in the prompt.
        semaphore: Limits how many requests run at once. NOTE: the default
            is created once, when the function is defined, so every call
            that omits this argument shares the same single-slot semaphore.

    Returns:
        One ``AllObjectInfo`` per review returned by the model, or an empty
        list when the request fails (the error is printed, not raised).
    """
    # The `Document` objects passed in by `create_synthetic_reviews` carry
    # their text in `content`, not `description`. Reading
    # `product.description` unconditionally raised AttributeError, which the
    # caller's `gather(..., return_exceptions=True)` then dropped silently.
    description = (
        product.description if hasattr(product, "description") else product.content
    )

    async with semaphore:
        prompt = f"""
        Write {n} realistic but detailed/specific product reviews that might show up on a hardware store's website.

        The reviews should be about the following product:
        Product Title: {product.title}
        Product Description: {description}
        
        Add many relevant and concrete facts about the products (this is for synthetic data generation, make up facts about each product as necessary).

        To see the format of a possible review, here is a review for a saw:
        ```
        I've enjoyed using this saw. It is lightweight and the battery lasts longer than other brands.
        I've been using it for 3 years now and it has been very durable. It was twice as expensive as the PX-500. But
        it is comfortable to hold because of the light weight.
        ```

        Respond only with the reviews, and nothing else.
        """

        try:
            result = await async_client.chat.completions.create(
                model="gpt-4o",
                response_model=List[Review],
                messages=[{"role": "user", "content": prompt}],
            )
            return [
                AllObjectInfo(
                    product_title=product.title,
                    product_description=description,
                    review=r.review,
                )
                for r in result
            ]

        except Exception as e:
            # Best-effort: one failed item must not abort the whole batch.
            print(f"Error generating reviews: {e}")
            return []


async def create_synthetic_reviews(
    max_concurrency: int = 20, reviews_per_product: int = reviews_per_product
) -> List[AllObjectInfo]:
    """Generate reviews for every entry of the module-level ``objects`` list.

    Args:
        max_concurrency: Size of the semaphore shared by all ``make_reviews``
            calls started here.
        reviews_per_product: Number of reviews requested per item. NOTE: the
            default is read from a module-level ``reviews_per_product`` at
            definition time, so that name must already exist when this
            ``def`` is evaluated; it is not defined in the visible part of
            this file.

    Returns:
        A flat list with the reviews of every item whose task succeeded.
        Failed tasks are reported with ``print`` and skipped.
    """
    semaphore = asyncio.Semaphore(max_concurrency)
    tasks = [make_reviews(o, reviews_per_product, semaphore) for o in objects]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    out: List[AllObjectInfo] = []
    for index, result in enumerate(results):
        # `gather(return_exceptions=True)` can also hand back
        # `asyncio.CancelledError`, which derives from BaseException rather
        # than Exception; checking only `Exception` would pass it on to
        # `extend` and raise TypeError.
        if isinstance(result, BaseException):
            # Report instead of dropping silently, so an empty result can be
            # traced back to its cause.
            print(f"Error generating reviews for object {index}: {result!r}")
            continue
        out.extend(result)
    return out


# Top-level `await`: relies on an environment that allows it (this file uses
# notebook-style `In [n]:` cells); a plain script would need to drive the
# coroutine through an event loop instead.
reviews = await create_synthetic_reviews()

Store the items to be retrieved in LanceDB

In [3]:
# Connect to the LanceDB database stored under ./lancedb and build the OpenAI
# embedding function that the table schemas below reference through `func`.
db = lancedb.connect("./lancedb")
func = get_registry().get("openai").create(name="text-embedding-3-small")


# Schema of the "products" LanceDB table.
# NOTE(review): `id`, `title` and `description` are all declared with
# `func.SourceField()`; presumably only one column is meant to feed the
# embedding -- confirm against the LanceDB embedding-function documentation.
class Products(LanceModel):
    id: str = func.SourceField()
    title: str = func.SourceField()
    description: str = func.SourceField()
    # Vector column whose dimension is taken from the embedding function.
    vector: Vector(func.ndims()) = func.VectorField()


# Create the "products" table (mode="overwrite") and load one row per
# generated object.
products_table = db.create_table("products", schema=Products, mode="overwrite")
products_data = [
    # `objects` holds `Document` instances, which define `title` and
    # `content` but no `description`; reading `obj.description` raised
    # AttributeError. The document body is stored in the `description` column.
    {"id": f"{i}", "title": obj.title, "description": obj.content}
    for i, obj in enumerate(objects)
]
products_table.add(products_data)
# Full-text-search index over the stored text column.
products_table.create_fts_index("description", replace=True)
# Title -> id lookup read back from the table. Rows sharing a title collapse
# to a single entry (the last one wins).
product_id_map = {
    p["title"]: p["id"] for p in products_table.to_pandas().to_dict("records")
}


# Schema of the "reviews" LanceDB table; the non-vector columns mirror the
# keys of the row dicts built from `reviews` below.
# NOTE(review): every text column is declared with `func.SourceField()`;
# presumably only one is meant to feed the embedding -- confirm against the
# LanceDB embedding-function documentation.
class Reviews(LanceModel):
    id: str = func.SourceField()
    product_title: str = func.SourceField()
    product_description: str = func.SourceField()
    review: str = func.SourceField()
    # Vector column whose dimension is taken from the embedding function.
    vector: Vector(func.ndims()) = func.VectorField()


# Create the "reviews" table (mode="overwrite") from the `Reviews` schema.
reviews_table = db.create_table("reviews", schema=Reviews, mode="overwrite")

# One row per generated review; `id` is the review's position in `reviews`,
# rendered as a string. Despite the variable name, the rows carry the product
# title and description, not a product id.
reviews_with_product_id = [
    {
        "id": f"{position}",
        "product_title": entry.product_title,
        "product_description": entry.product_description,
        "review": entry.review,
    }
    for position, entry in enumerate(reviews)
]

# Insert the rows, then build the full-text-search index on the review text.
reviews_table.add(reviews_with_product_id)
reviews_table.create_fts_index("review", replace=True)

[2024-08-07T07:20:04Z WARN  lance::dataset] No existing dataset at /Users/matthewchana/Documents/systematically-improving-rag/week1_bootstrap_evals/lancedb/products.lance, it will be created
[2024-08-07T07:20:06Z WARN  lance::dataset] No existing dataset at /Users/matthewchana/Documents/systematically-improving-rag/week1_bootstrap_evals/lancedb/reviews.lance, it will be created


In case you want to see the data quickly in a text editor, we also store the data in JSON.

In [4]:
# Write every review record, converted to a plain dict, into ./reviews.json
# as a single JSON array.
with open("./reviews.json", "w") as f:
    json.dump([record.dict() for record in reviews], f)