In [2]:
from config import settings
from llama_index.core import Document
import firebase_admin
from firebase_admin import credentials, firestore
from tqdm import tqdm

In [3]:
# FIRESTORE
# Connection parameters, all read from the project-level `settings` object.
# CREDENTIALS_JSON_NAME is the object name Firebase._get_credentials asks the
# bucket for; FIRESTORE_COLLECTION is used both as the docstore namespace and
# as the prefix of the collection queried directly (`<name>_data`).
CREDENTIALS_JSON_NAME = settings.credentials_json_name
FIRESTORE_COLLECTION = settings.firestore_collection
FIRESTORE_PROJECT = settings.firestore_project
FIRESTORE_DATABASE = settings.firestore_database

# CHUNK
# Number of documents handed to the docstore per batch in
# Firebase.upload_documents.
CHUNK_SIZE = 50

In [4]:
# Execute the companion notebook inside this kernel. Firebase._get_credentials
# uses `Bucket` and `AWS_S3_OUTPUT_BUCKET`, neither of which is defined in this
# notebook -- presumably Bucket.ipynb provides both (TODO confirm).
%run Bucket.ipynb

In [5]:
class Firebase:
  """Firestore-backed storage for LlamaIndex documents.

  Writes go through a LlamaIndex ``FirestoreDocumentStore`` (``self.docstore``);
  reads query the collection ``<FIRESTORE_COLLECTION>_data`` directly through
  the ``firebase_admin`` Firestore client (``self.db``).

  NOTE(review): the ``_data`` suffix presumably mirrors the collection name the
  docstore derives from its namespace -- confirm against the installed
  llama-index version.
  """

  def __init__(self):
    """Download the service-account file and open the Firestore clients.

    Side effects: fetches the credentials file through ``Bucket``, initializes
    the default ``firebase_admin`` app if none exists yet, and prints progress
    messages.
    """
    print("Initialized connection to Firebase")
    # Imported lazily so that merely defining the class does not require these
    # optional integrations to be installed.
    from google.oauth2 import service_account
    from llama_index.storage.kvstore.firestore import FirestoreKVStore
    from llama_index.storage.docstore.firestore import FirestoreDocumentStore

    # The same key file feeds both credential flavours: firebase_admin's own
    # Certificate and the google-auth Credentials expected by the KV store.
    firebase_credentials_file_path = self._get_credentials()
    self.cred = credentials.Certificate(firebase_credentials_file_path)
    self.cred_google_auth = service_account.Credentials.from_service_account_file(
      firebase_credentials_file_path
    )

    # Re-running this cell must not fail on the second initialize_app() call.
    # Probing with get_app() keeps a ValueError caused by bad arguments to
    # initialize_app() from being misreported as "already initialized".
    try:
      firebase_admin.get_app()
    except ValueError:
      firebase_admin.initialize_app(self.cred)
    else:
      print("Firebase app already initialized")

    # Firestore client used for direct reads.
    self.db = firestore.client()
    self.collection_name_base = FIRESTORE_COLLECTION
    self.collection_name = self.collection_name_base + '_data'
    self.project = FIRESTORE_PROJECT
    self.database = FIRESTORE_DATABASE
    self.chunk_size = CHUNK_SIZE

    # LlamaIndex stores used for writes.
    self.kvstore = FirestoreKVStore(
      project=self.project,
      database=self.database,
      credentials=self.cred_google_auth
    )
    self.docstore = FirestoreDocumentStore(
      firestore_kvstore=self.kvstore,
      namespace=self.collection_name_base
    )

    print(f"Set up connection to Firebase Database {self.database} in project {self.project}")

  def _get_credentials(self):
    """Fetch the credentials JSON via ``Bucket`` and return what it hands back.

    ``Bucket`` and ``AWS_S3_OUTPUT_BUCKET`` are expected to exist in the kernel
    namespace (they are not defined in this notebook). The call passes
    ``return_file_path=True``, so the result is presumably a local file path.
    """
    bucket = Bucket(AWS_S3_OUTPUT_BUCKET)
    return bucket.download_object(CREDENTIALS_JSON_NAME, "credentials", return_file_path=True)

  def upload_documents(self, documents):
    """Upload ``documents`` in consecutive batches of ``self.chunk_size``.

    Args:
      documents: sliceable sequence of documents accepted by
        ``_upload_documents``.
    """
    print(f"Uploading {len(documents)} documents to Firebase")

    batch_starts = range(0, len(documents), self.chunk_size)
    doc_chunks = (documents[start:start + self.chunk_size] for start in batch_starts)

    # The batches come from a generator, so tqdm cannot infer how many there
    # are; pass the count explicitly to get a real progress bar.
    for chunk in tqdm(doc_chunks, total=len(batch_starts)):
      self._upload_documents(chunk)

    print("Documents uploaded to Firebase!!")

  def _upload_documents(self, documents):
    """Split one batch of documents into nodes and add them to the docstore."""
    from llama_index.core.node_parser import SentenceSplitter

    nodes = SentenceSplitter().get_nodes_from_documents(documents)
    print(f"Uploading {len(nodes)} nodes to Firebase...")
    self.docstore.add_documents(nodes)
    print("Nodes loaded!!")

  def get_document(self, document_id):
    """Return the raw Firestore dict for ``document_id``, or ``None`` if absent."""
    print(f"Getting document {document_id} from collection {self.collection_name}")
    snapshot = self.db.collection(self.collection_name).document(document_id).get()

    if not snapshot.exists:
      print(f"Document {document_id} NOT found in collection {self.collection_name}")
      return None

    print(f"Document {document_id} found in collection {self.collection_name}")
    return snapshot.to_dict()

  def get_all_documents(self, limit: int = None, only_paragraph: bool = False, document_title: str = None) -> list:
    """Read documents from the collection and convert them to LlamaIndex Documents.

    Args:
      limit: maximum number of documents to read; ``None`` or ``0`` means no
        limit is applied.
      only_paragraph: when true, keep only entries whose
        ``data.metadata.chunk_type`` equals ``'para'``.
      document_title: when given, keep only entries whose
        ``data.metadata.title`` equals this value.

    Returns:
      List of converted documents.

    Raises:
      ValueError: if a stored entry has no ``data.id_`` field.
    """
    print(f"Getting all documents from collection {self.collection_name}")
    if limit:
      print(f"Limiting to {limit} documents")

    query = self.db.collection(self.collection_name)

    # NOTE(review): recent google-cloud-firestore releases appear to warn on
    # positional where() arguments in favour of `filter=FieldFilter(...)`;
    # kept positional here for compatibility -- confirm before migrating.
    if only_paragraph:
      print("Getting only paragraphs in provided documents")
      query = query.where('data.metadata.chunk_type', '==', 'para')

    if document_title:
      print("Getting documents from the following file: ", document_title)
      query = query.where('data.metadata.title', '==', document_title)

    if limit:
      query = query.limit(limit)

    raw_documents = [snapshot.to_dict() for snapshot in query.stream()]
    return self._documents_to_llama_index_documents(raw_documents)

  def get_all_document_ids(self):
    """Return the ids of every document in the collection."""
    return [snapshot.id for snapshot in self.db.collection(self.collection_name).stream()]

  def _dict_to_llama_index_document(self, doc_dict):
    """Build a LlamaIndex ``Document`` from one raw Firestore dict.

    Reads ``id_``, ``text`` and ``metadata`` from ``doc_dict['data']``.

    Raises:
      ValueError: if ``data`` is missing/empty or carries no ``id_``.
    """
    # `or {}` also covers a field that is present but stored as None, which
    # would otherwise fail with AttributeError instead of the ValueError below.
    document_data = doc_dict.get('data') or {}
    document_id = document_data.get('id_', None)

    if document_id is None:
      raise ValueError("Document ID not found in the dictionary.")

    return Document(
      doc_id=document_id,
      text=document_data.get('text', ''),
      metadata=document_data.get('metadata') or {}
    )

  def _documents_to_llama_index_documents(self, documents):
    """Convert a list of raw Firestore dicts to LlamaIndex Documents."""
    print("Converting documents to LlamaIndex documents...")
    llama_index_documents = [
      self._dict_to_llama_index_document(doc) for doc in tqdm(documents)
    ]
    print("Conversion successful!")
    return llama_index_documents


In [None]:
# Build the client used by the rest of the notebook. Construction has side
# effects: Firebase.__init__ fetches the credentials file through Bucket and
# opens the Firestore clients.
firebase = Firebase()