In [1]:
import os
import tempfile
from typing import Any, Optional
from uuid import UUID
from pydantic import BaseModel
from fastapi import UploadFile
from backend.axon.in_come import File
from backend.settings.backend_setting import PGDocStore

import re
from dotenv import load_dotenv, find_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from file_utils import sanitize_filename, open_file, save_file, compute_sha1_from_content, compute_sha1_from_file
# from gpt2brain import llm_core
import openai
from logger import setup_logger
from gpt2brain import OpenAIBrain

from langchain.document_loaders import PDFPlumberLoader, PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain.vectorstores.pgembedding import PGEmbedding
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document


# Module-level logger obtained from the project's setup_logger helper, keyed by
# this module's __name__.
logger = setup_logger(__name__)

# Shared OpenAIBrain instance, constructed with its default arguments.
# NOTE(review): no cell visible in this part of the notebook reads `brain` —
# confirm it is still needed before keeping the instantiation.
brain = OpenAIBrain()



In [5]:
from backend.axon.in_come import File
from backend.settings.backend_setting import PGDocStore
from backend.neuron.neuron import Memory, Cognition
from langchain.document_loaders import PDFPlumberLoader

# Point a File object at the source PDF, passing the chunk size and overlap
# that import_path accepts.
file = File()
file.import_path(
    filepath="/mnt/d/ai/cognimesh/output/katoAnticoagulationStrategiesExtracorporeal2021.pdf",
    chunk_size=1000,
    chunk_overlap=50,
)

# Nothing to do when the duplicate check reports the file as already stored;
# otherwise embed it and persist the document together with its vectors.
if file.check_duplicate_in_db():
    print(f"File with SHA1 {file.file_sha1} is already processed.")
else:
    # Build the embeddings for the file using the PDFPlumber loader.
    cognition = Cognition()
    embed = cognition.create_vector(file=file, loader_class=PDFPlumberLoader)

    # Memory is the persistence front-end, backed here by a PGDocStore.
    store = PGDocStore()
    memory = Memory(store)

    # Store the document record first, then its vectors, and finally hand the
    # returned vector ids to update_document_vectors alongside the document id.
    doc_id = memory.add_document(file=file)
    vector_ids = memory.add_vectors(file=file, embed=embed)
    memory.update_document_vectors(doc_id, vector_ids)


File with SHA1 79daea53f994b5d3a2eb7b4406e0527589cdc246 is already processed.


In [None]:
from urllib.parse import quote

# Load Postgres connection settings from the nearest .env file / environment.
load_dotenv(find_dotenv())
host = os.getenv('PG_HOST')
port = os.getenv('PG_PORT')
user = os.getenv('PG_USER')
password = os.getenv('PG_PASS')
dbname = 'cognimesh'

# Fail early with a readable message instead of letting an unset variable be
# formatted into the URL as the literal text "None".
_missing_env = [
    env_name
    for env_name, env_value in (("PG_HOST", host), ("PG_PORT", port), ("PG_USER", user))
    if not env_value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Source PDF to index.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory.
PDF_PATH = "/mnt/d/ai/cognimesh/output/katoAnticoagulationStrategiesExtracorporeal2021.pdf"

# Load the PDF and split it into chunks; the splitter is built through
# from_tiktoken_encoder with chunk_size=4000 and chunk_overlap=200.
loader = PDFPlumberLoader(PDF_PATH)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=4000, chunk_overlap=200)
pages = loader.load()
docs = text_splitter.split_documents(pages)
embeddings = OpenAIEmbeddings()

# Percent-encode the credentials so characters such as '@', ':', '/' or '%'
# in the user name or password cannot corrupt the URL. quote(..., safe="") is
# used rather than quote_plus so a space becomes %20 and never an ambiguous '+'.
# When PG_PASS is unset, the password segment is left out entirely.
_credentials = quote(user, safe="")
if password is not None:
    _credentials += ":" + quote(password, safe="")
CONNECTION_STRING = f"postgresql+psycopg2://{_credentials}@{host}:{port}/{dbname}?client_encoding=utf8"
COLLECTION_NAME = "documents"

store = PGVector(
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    # table_name="vectors",
    embedding_function=embeddings,
)

# NOTE(review): this cell has no duplicate check, so re-running it presumably
# inserts the same chunks into the collection again — confirm whether
# de-duplication is needed.
store.add_documents(docs)

# db = PGEmbedding.from_documents(
#     embedding=embeddings,
#     documents=docs,
#     collection_name=COLLECTION_NAME,
#     connection_string=CONNECTION_STRING,
#     pre_delete_collection=False,
# )
