In [4]:
import os
from datetime import datetime, timezone

from langchain.docstore.document import Document
from langchain_community.document_transformers.openai_functions import (
    create_metadata_tagger,
)
from langchain_openai import ChatOpenAI
from langchain_text_splitters import MarkdownHeaderTextSplitter
from pymongo import MongoClient

from DataProcessing.metadata_properties import Property1
from DataProcessing.utils import load_yaml


class MetaDataTagger:
    """Tag Markdown files with LLM-extracted metadata and store them in MongoDB.

    Every non-hidden entry of the configured base directory is split on
    Markdown headers, each section is passed through an OpenAI-functions
    metadata tagger built from ``Property1``, and one document per file is
    inserted into the ``md`` collection of the ``financedata`` database.
    """

    def __init__(self, config):
        """Read the settings and create the LLM and database handles.

        Args:
            config: Mapping with a ``"settings"`` entry that provides
                ``"category_id"``, ``"file_type"`` and ``"base_path"``.

        Raises:
            KeyError: If a required key is missing (for dict-like configs).
            OSError: If ``base_path`` cannot be listed.
        """
        settings = config["settings"]
        self.category_id = settings["category_id"]
        self.file_type = settings["file_type"]
        # NOTE: relative path, resolved against the current working directory.
        self.prompt = load_yaml("../prompts/metadatatagger.yaml")["prompt"]
        self.data_path = settings["base_path"]
        # Hidden entries (".DS_Store", ".ipynb_checkpoints", ...) are ignored.
        self.data_list = [
            entry
            for entry in os.listdir(self.data_path)
            if not entry.startswith(".")
        ]
        self.headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
            ("####", "Header 4"),
        ]
        self.llm = ChatOpenAI(model="gpt-4o-mini-2024-07-18", temperature=0.1)
        self.db = MongoClient("mongodb://localhost:27017/").financedata

    def tag_metadata(self):
        """Tag every file in ``self.data_list`` and insert it into ``db.md``.

        For each file the text is split on the configured Markdown headers,
        the sections are enriched by the metadata tagger, and a single
        document holding all sections is inserted into MongoDB.

        Entries of ``self.data_list`` that are directories are skipped,
        because ``os.listdir`` returns them too and they cannot be read as
        text. When nothing is left to process the method returns without
        creating the splitter or the tagger.

        Raises:
            OSError: If a listed file cannot be opened or read.
            IndexError: If the tagger returns fewer documents than sections.
        """
        sources = []
        for name in self.data_list:
            path = os.path.join(self.data_path, name)
            if os.path.isdir(path):
                # open() on a directory would abort the whole run.
                continue
            sources.append((name, path))

        if not sources:
            return

        # The splitter and the tagger do not depend on the file being
        # processed, so they are built once instead of once per file.
        splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=self.headers_to_split_on, strip_headers=False
        )
        document_transformer = create_metadata_tagger(Property1, self.llm)
        collection = self.db.md

        for name, path in sources:
            with open(path, "r", encoding="utf-8") as file:
                text = file.read()

            sections = splitter.split_text(text)
            tagged = document_transformer.transform_documents(
                self._merge_for_tagging(sections), prompt=self.prompt
            )
            collection.insert_one(self._build_post(name, sections, tagged))

    @staticmethod
    def _merge_for_tagging(sections):
        """Return copies of ``sections`` whose text also embeds their metadata.

        The header metadata is written into the page content so that the
        text handed to the tagger contains the header information alongside
        the section body.
        """
        return [
            Document(
                page_content=(
                    f"metadata:{section.metadata}\n\n ---- \n "
                    f"page_content:{section.page_content}"
                ),
                metadata=section.metadata,
            )
            for section in sections
        ]

    def _build_post(self, title, sections, tagged):
        """Assemble the MongoDB document for one source file.

        Args:
            title: File name stored as the document title.
            sections: Header-split sections; their original ``page_content``
                (without the embedded metadata) is what gets stored.
            tagged: Tagger output, positionally aligned with ``sections``.

        Returns:
            dict ready to be passed to ``insert_one``.

        Raises:
            IndexError: If ``tagged`` is shorter than ``sections``.
        """
        if len(tagged) < len(sections):
            # zip() would silently drop the untagged trailing sections.
            raise IndexError(
                f"metadata tagger returned {len(tagged)} documents "
                f"for {len(sections)} sections of {title!r}"
            )

        page_contents = [
            {
                "page": page_number,
                "page_content": section.page_content,
                "metadata": enriched.metadata,
            }
            for page_number, (section, enriched) in enumerate(
                zip(sections, tagged), start=1
            )
        ]

        return {
            "title": title,
            "page_contents": page_contents,
            # NOTE(review): "cateory_id" looks like a typo of "category_id";
            # kept as-is because existing documents and readers may rely on
            # this key. Confirm before renaming.
            "cateory_id": self.category_id,
            "filetype": self.file_type,
            # PyMongo treats naive datetimes as UTC, so a naive local
            # timestamp would be stored shifted by the local UTC offset.
            "latest_update": datetime.now(timezone.utc),
        }