In [1]:
# !pip install -U sentence-transformers

In [30]:
import os
import json
import uuid

from sentence_transformers import SentenceTransformer

In [53]:
# SentenceTransformer instances keyed by model name, so a model is built
# once per kernel session instead of once per embedded chunk.
_EMBEDDING_MODELS = {}


def generate_embeddings(
    text: str,
    model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
) -> list[float]:
    """Embed ``text`` with a SentenceTransformer model.

    Args:
        text: The text to embed.
        model_name: Identifier passed to ``SentenceTransformer``. Defaults to
            the model this notebook has always used, so existing callers are
            unaffected.

    Returns:
        The value of ``model.encode(text)`` converted with ``.tolist()``.
    """
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        # First request for this model name: build it and remember it so
        # later calls reuse the same instance rather than re-creating it.
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model

    return model.encode(text).tolist()

In [62]:
def _find_json_files(input_folder_path: str) -> list[str]:
    """Return the sorted paths of every ``.json`` file under ``input_folder_path``."""
    json_file_paths = []
    for root, _directories, files in os.walk(input_folder_path):
        for file in files:
            # Only JSON inputs are embedded; other files found by the walk
            # would otherwise get an output folder and then fail to parse.
            if file.lower().endswith('.json'):
                json_file_paths.append(os.path.join(root, file))

    # os.walk order depends on the file system; sort for a stable run order.
    return sorted(json_file_paths)


def transform_data(
    input_folder_path: str,
    output_folder_name: str = "embedded-data",
    source_folder_name: str = "manifesto",
) -> None:
    """Embed every chunk of every JSON file under ``input_folder_path``.

    Each input file is expected to hold a JSON object mapping a page number
    to a list of text chunks. For every page, one file
    ``<output_folder_name>/<input stem>/<page_number>.json`` is written,
    containing a list of records with the keys ``documentID`` (a fresh
    UUID4), ``file_path``, ``file_name``, ``page_number``, ``content`` and
    ``embedding``.

    Args:
        input_folder_path: Folder that is walked recursively for ``.json`` files.
        output_folder_name: Root folder for the generated files.
        source_folder_name: Folder name used to build the ``file_path``
            recorded for the ``<input stem>.pdf`` document.

    Returns:
        None. Results are written to disk; progress is printed.

    Note:
        Processing is best-effort: an exception while handling one input file
        is reported together with the file path and the loop moves on to the
        next file.
    """
    json_file_paths = _find_json_files(input_folder_path)
    print(len(json_file_paths))

    for json_file in json_file_paths:
        print("Embedding file: {}".format(json_file))

        # splitext strips only the final extension, unlike split('.json').
        document_name = os.path.splitext(os.path.basename(json_file))[0]
        output_folder_path = os.path.join(output_folder_name, document_name)
        os.makedirs(output_folder_path, exist_ok=True)

        file_name = f"{document_name}.pdf"
        file_path = os.path.join(source_folder_name, file_name)

        try:
            with open(json_file, 'r') as json_handle:
                loaded_dict = json.load(json_handle)

            for page_number, page_content in loaded_dict.items():
                # One metadata record per chunk on this page.
                doc_metadata = [
                    {
                        "documentID": str(uuid.uuid4()),
                        "file_path": file_path,
                        "file_name": file_name,
                        "page_number": page_number,
                        "content": chunk,
                        "embedding": generate_embeddings(chunk),
                    }
                    for chunk in page_content
                ]

                # Serialize before opening the output so a serialization
                # error does not leave an empty file behind.
                json_data = json.dumps(doc_metadata)

                output_file_path = os.path.join(
                    output_folder_path, f"{page_number}.json")
                with open(output_file_path, "w") as f:
                    f.write(json_data)

        except Exception as e:
            # Keep going with the remaining files, but say which one failed.
            print(f"Failed to embed {json_file}: {e!r}")

In [63]:
# Run the embedding step over every file found under "extracted-data";
# the per-page results are written beneath the "embedded-data" folder.
transform_data("extracted-data")

20
extracted-data\01-table-of-contents.json
embedded-data/01-table-of-contents
extracted-data\02-editorial-introduction.json
embedded-data/02-editorial-introduction
embedded-data/02-editorial-introduction
extracted-data\03-preface-to-the-1872-german-edition.json
embedded-data/03-preface-to-the-1872-german-edition
extracted-data\04-preface-to-the-1882-russian-edition.json
embedded-data/04-preface-to-the-1882-russian-edition
extracted-data\05-preface-to-the-1883-german-edition.json
embedded-data/05-preface-to-the-1883-german-edition
extracted-data\06-preface-to-the-1888-english-edition.json
embedded-data/06-preface-to-the-1888-english-edition
embedded-data/06-preface-to-the-1888-english-edition
embedded-data/06-preface-to-the-1888-english-edition
extracted-data\07-preface-to-the-1890-german-edition.json
embedded-data/07-preface-to-the-1890-german-edition
embedded-data/07-preface-to-the-1890-german-edition
extracted-data\08-preface-to-the-1892-polish-edition.json
embedded-data/08-preface-