In [1]:
class Document:
    """
    A piece of content bundled with metadata, an identifier and a chunk count.

    The identifier is stored on the instance as ``id`` but is called ``doc_id``
    both in the constructor signature and in the JSON representation.
    """

    def __init__(self, content, metadata=None, doc_id=None, num_chunks=None):
        """
        Build a document.

        :param content: The content of the document.
        :param metadata: Optional metadata mapping; a fresh empty dict is used when None.
        :param doc_id: Optional identifier; a random UUID4 string is generated when None.
        :param num_chunks: Optional chunk count, stored exactly as given.
        """
        # Attribute assignment order is significant: to_json serializes the
        # instance dict in insertion order.
        self.content = content
        if metadata is None:
            metadata = {}
        self.metadata = metadata
        if doc_id is None:
            doc_id = str(uuid.uuid4())
        self.id = doc_id
        self.num_chunks = num_chunks

    def add_metadata(self, key, value):
        """
        Store one metadata entry, replacing any existing entry under the same key.

        :param key: The key for the metadata item.
        :param value: The value for the metadata item.
        """
        self.metadata[key] = value

    def __str__(self):
        """
        Render the document as two lines: its content, then its metadata.

        :return: A string containing the content and metadata of the document.
        """
        content_line = f"Content: {self.content}"
        metadata_line = f"Metadata: {self.metadata}"
        return content_line + "\n" + metadata_line

    def to_json(self):
        """ Serialize every instance attribute to a JSON string, exposing 'id' as 'doc_id' """
        payload = {}
        for attr_name, attr_value in vars(self).items():
            # The constructor takes 'doc_id', so rename 'id' to allow a round trip.
            json_key = 'doc_id' if attr_name == 'id' else attr_name
            payload[json_key] = attr_value
        return json.dumps(payload)

    @classmethod
    def from_json(cls, json_str):
        """ Rebuild a Document from a JSON string whose keys match the constructor parameters """
        constructor_kwargs = json.loads(json_str)
        return cls(**constructor_kwargs)


In [2]:
import json
import uuid
# Read the serialized documents from disk. The encoding is pinned to UTF-8 so
# the result does not depend on the platform's default text encoding.
with open('documents.json', 'r', encoding='utf-8') as file:
    json_documents = json.load(file)

# Each entry is handed to Document.from_json, which parses it with json.loads,
# so the file is expected to hold a collection of JSON-encoded strings.
loaded_documents = [Document.from_json(json_str) for json_str in json_documents]

In [3]:
# Preview the text built from the first document: its first line plus, when
# long enough, its last line. The result is held in `doc_type` rather than
# `type` so the builtin `type` is not shadowed for the rest of the kernel session.
first_doc_lines = loaded_documents[0].content.split('\n')
# The last line is kept only when longer than 6 characters
# (presumably to skip a bare label with no value — TODO confirm).
doc_type = first_doc_lines[-1] if len(first_doc_lines[-1]) > 6 else ""
first_doc_lines[0] + " " + doc_type

'Title: How to Businessplan '

In [None]:
from FlagEmbedding import BGEM3FlagModel

# Instantiate the 'BAAI/bge-m3' model once at cell level; bge_m3_embed reads this
# global on every call. It is created with use_fp16=False.
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False)

def bge_m3_embed(query: str):
    """
    Embed a single text with the notebook-level ``model``.

    The text is wrapped in a one-element list, encoded, and the first entry of
    the ``'dense_vecs'`` result is returned.

    :param query: The text to embed.
    :return: The dense vector produced for ``query``.
    """
    # Note carried over from the original: "use_fp16=True" can be added to speed up predictions
    encoded_batch = model.encode([query])
    return encoded_batch['dense_vecs'][0]

# Example usage (1024 dimensions)
# Bound to its own name so this single vector is not confused with the
# `embeddings` list that is built for all documents in a later cell.
example_embedding = bge_m3_embed("This is a text I want to embed")

In [6]:
# Embed a short text for every loaded document, in document order.
embeddings = []
for document in loaded_documents:
    # Special early-stage treatment where no descriptions exist: instead of the
    # full content, only the first line is embedded, followed by the last line
    # when that line is longer than 6 characters.
    content_lines = document.content.split('\n')
    last_line = content_lines[-1]
    # Named `doc_type` rather than `type` so the builtin is not shadowed.
    doc_type = last_line if len(last_line) > 6 else ""
    # strip() removes the trailing space left behind when doc_type is empty.
    text = (content_lines[0] + " " + doc_type).strip()
    embeddings.append(bge_m3_embed(text))

In [7]:
# Sanity check: number of embeddings collected (one is appended per entry of loaded_documents).
len(embeddings)

106

In [8]:
import numpy as np
# Stack the collected embeddings vertically into a single NumPy array
# (one row per embedding, assuming each embedding is a 1-D vector — TODO confirm).
np_embeddings = np.vstack(embeddings)

In [9]:
# Inspect the shape of the stacked embedding array.
# NOTE(review): the recorded output reports 1536 columns, while the example-usage
# comment for bge_m3_embed says 1024 dimensions — confirm which model produced the
# saved outputs and re-run the notebook from a fresh kernel.
np_embeddings.shape

(106, 1536)

In [10]:
# Persist the stacked embeddings to a .npy file in the current working directory.
# NOTE(review): the path is hard-coded and relative, so the result depends on where
# the kernel was started — confirm this is intended.
np.save('earlyopenembeddings.npy', np_embeddings)