In [1]:
import pymongo
from dataclasses import dataclass, asdict
from datetime import datetime
from uuid import uuid4, UUID
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.core import SimpleDirectoryReader
from llama_index.core import load_index_from_storage
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from llama_index.core.schema import MetadataMode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.indices.loading import load_index_from_storage 
import dotenv
dotenv.load_dotenv('../projectgurukul')

import pandas as pd
import re
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import openai
import os


In [45]:
class CSVReader(BaseReader):
    """
    Generic reader that converts every row of a CSV file into a Document.

    Constructor arguments:
    - text_columns (list): Columns whose values make up the document text.
    - metadata_columns (tuple, optional): Columns copied into the document metadata (default is an empty tuple).
    - text_template (str, optional): Format string applied to each text column; it is
      formatted with the keyword arguments ``key`` (column name) and ``value`` (cell
      value) (default is "{value}").
    - text_separator (str, optional): String placed between the formatted text columns
      (default is two newlines).
    - preprocess (callable, optional): Called with each row; its return value replaces
      the row before text and metadata are extracted (default is None).
    - column_name_mappings (dict, optional): Column renames applied right after the CSV
      is read (default is None).
    - header (optional): Forwarded to ``pandas.read_csv(header=...)``; any falsy value,
      including the default None, is replaced by 'infer'.

    Public methods:
    - load_data(file, extra_info=None): Read the CSV and return a list of Documents.
    - format_text(row): Build the document text for one row.
    - extract_metadata(row): Build the metadata dict for one row.

    """

    def __init__(
        self,
        text_columns,
        metadata_columns=(),
        text_template="{value}",
        text_separator="\n\n",
        preprocess=None,
        column_name_mappings=None,
        header=None,
    ):
        # Which columns feed the text and which feed the metadata.
        self.text_columns = text_columns
        self.metadata_columns = metadata_columns
        # How the text columns are rendered and glued together.
        self.text_template = text_template
        self.text_separator = text_separator
        # Optional per-row hook and column renames.
        self.preprocess = preprocess
        self.column_name_mappings = column_name_mappings
        # Falsy header values fall back to the pandas default.
        self.header = header or 'infer'

    def load_data(self, file, extra_info=None):
        """
        Read a CSV file and produce one Document per row.

        Parameters:
        - file (str): Path to the CSV file.
        - extra_info (dict, optional): Entries merged into every row's metadata; they
          overwrite row metadata on key clashes (default is None).

        Returns:
        - list: One Document per CSV row, in file order.

        """
        frame = pd.read_csv(file, header=self.header)
        if self.column_name_mappings:
            frame = frame.rename(columns=self.column_name_mappings)

        def to_document(record):
            # Same per-row pipeline for every record: preprocess, text, metadata.
            if self.preprocess:
                record = self.preprocess(record)
            body = self.format_text(record)
            meta = self.extract_metadata(record)
            if extra_info:
                meta.update(extra_info)
            return Document(text=body, extra_info=meta)

        return [to_document(record) for _, record in frame.iterrows()]

    def format_text(self, row):
        """
        Render the configured text columns of a row into a single string.

        Parameters:
        - row (pd.Series): Row from the CSV file.

        Returns:
        - str: The formatted columns joined with the text separator.

        """
        pieces = []
        for column in self.text_columns:
            pieces.append(self.text_template.format(key=column, value=row[column]))
        return self.text_separator.join(pieces)

    def extract_metadata(self, row):
        """
        Collect the configured metadata columns of a row.

        Parameters:
        - row (pd.Series): Row from the CSV file.

        Returns:
        - dict: Mapping of metadata column name to the row's value.

        """
        metadata = {}
        for column in self.metadata_columns:
            metadata[column] = row[column]
        return metadata


class RamayanaCSVReader(CSVReader):
    """
    Reader that turns a Ramayana CSV into one Document per sarga.

    The CSV must provide the string columns 'content' (the shlokas, each carrying a
    dotted id such as ``1.2.3``) and 'explanation'. The second component of the first
    id found in a row's content is used as that row's sarga number.
    """

    # One id per line of text, as in the original pattern; the negative lookbehind
    # stops the greedy ``.*`` prefix from swallowing the leading digits of a
    # multi-digit first component (``12.3.4`` used to be captured as ``2.3.4``).
    _SHLOKA_ID_PATTERN = re.compile(r'.*(?<!\d)(\d+\.\d+\.\d+).*')

    def __init__(self):
        # No configuration: load_data is fully overridden and does not use any of
        # the attributes that CSVReader.__init__ would set.
        pass

    def load_data(self, file, extra_info=None):
        """
        Load the CSV and group its rows into one Document per sarga.

        Parameters:
        - file (str): Path to the CSV file.
        - extra_info (dict, optional): Entries merged into every document's metadata;
          they overwrite the reader's own keys on clashes (default is None).

        Returns:
        - list: Documents whose text is the sarga's explanations (each followed by
          its shloka ids) joined by blank lines. Metadata holds 'sarga' and, when
          the sarga's content starts with "[", a 'summary' of at most 200 characters
          taken from inside the leading brackets.

        Raises:
        - IndexError: If a row's content contains no dotted shloka id.

        """
        df = pd.read_csv(file).dropna(how="all").fillna("")

        ids = df["content"].map(self._SHLOKA_ID_PATTERN.findall)
        df["shloka_ids"] = ids.map(lambda found: ", ".join(found))
        df["sarga"] = ids.map(lambda found: int(found[0].split(".")[1]))

        # Built with a plain comprehension instead of df.apply(axis=1) so that an
        # empty frame yields an empty column rather than an assignment error.
        df["explanation_with_id"] = [
            f"{explanation} ।।{shloka_ids}।।"
            for explanation, shloka_ids in zip(df["explanation"], df["shloka_ids"])
        ]

        # Only aggregate the two columns used below; joining every column would
        # raise on any non-string column present in the CSV.
        df_grouped = (
            df.groupby("sarga")[["content", "explanation_with_id"]]
            .agg(lambda parts: "\n\n".join(parts))
            .reset_index()
        )

        documents = []
        for _, row in df_grouped.iterrows():
            shlokas = row["content"]
            metadata = {'sarga': int(row["sarga"])}

            # A sarga whose content opens with "[...]" carries its summary there.
            if shlokas.startswith("["):
                metadata['summary'] = shlokas.split("]")[0][1:][:200]

            if extra_info:
                metadata.update(extra_info)

            documents.append(Document(text=row["explanation_with_id"], extra_info=metadata))

        return documents

class MahabharataCSVReader(CSVReader):
    """
    Reader that turns a parsed Mahabharata CSV into one Document per chapter.

    The CSV must provide the columns 'parva', 'chapter', 'chapter title' and
    'content'. Rows sharing the same (parva, chapter, chapter title) are merged into
    a single document whose text is their contents joined by blank lines.
    """

    # Metadata keys hidden from both the embedding text and the LLM prompt.
    _EXCLUDED_METADATA_KEYS = ("file_path", "parva_id")

    def __init__(self):
        # No configuration: load_data is fully overridden and does not use any of
        # the attributes that CSVReader.__init__ would set.
        pass

    def load_data(self, file, extra_info=None):
        """
        Load the CSV and build one Document per chapter.

        Parameters:
        - file (str): Path to the CSV file.
        - extra_info (dict, optional): Entries merged into every document's metadata;
          they overwrite the reader's own keys on clashes (default is None).

        Returns:
        - list: Documents with id "<parva_id>.<chapter>" and the metadata keys
          'parva_id', 'parva', 'chapter_id', 'chapter_title' and 'scripture'.

        """
        df = pd.read_csv(file)

        # Only 'content' needs aggregating; the group keys come back via reset_index.
        df_grouped = (
            df.groupby(['parva', 'chapter', 'chapter title'])['content']
            .agg(lambda parts: "\n\n".join(parts))
            .reset_index()
        )

        # Number the parvas 1..n in the order they appear in the grouped frame.
        # groupby sorts its keys, so this is the sorted order of the parva names.
        parva_id_map = {
            parva: parva_id
            for parva_id, parva in enumerate(df_grouped['parva'].unique(), start=1)
        }

        documents = []
        for _, row in df_grouped.iterrows():
            parva = row['parva']
            parva_id = parva_id_map[parva]
            chapter_id = row['chapter']

            metadata = {
                'parva_id': parva_id,
                'parva': parva,
                'chapter_id': chapter_id,
                'chapter_title': row['chapter title'],
                'scripture': 'mahabharata',
            }
            if extra_info:
                metadata.update(extra_info)

            # The f-string also copes with a numeric 'chapter' column.
            document = Document(
                doc_id=f"{parva_id}.{chapter_id}",
                text=row['content'],
                extra_info=metadata,
            )
            # Configure each document exactly once, at creation time, so every
            # document (including the last) gets the exclusions and none gets
            # duplicate entries.
            document.excluded_embed_metadata_keys.extend(self._EXCLUDED_METADATA_KEYS)
            document.excluded_llm_metadata_keys.extend(self._EXCLUDED_METADATA_KEYS)
            documents.append(document)

        return documents

In [46]:


# Prefer a connection string from the environment so real credentials never have to
# be written into the notebook (dotenv.load_dotenv is called in the import cell).
# MONGODB_URI is the variable name chosen here; the redacted literal is kept only
# as a fallback so the cell behaves as before when the variable is unset.
uri = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://xxxxxxxxxxxx@yyyy.mongodb.net/?retryWrites=true&w=majority",
)
# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Best-effort check: the failure is reported but does not stop the notebook.
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [49]:
# Partial cleanup, not a full wipe: delete only the documents in the "knowledge"
# collection of the "test" database whose metadata.parva_id equals 1.
db = client.get_database('test')
db.get_collection("knowledge").delete_many({'metadata.parva_id' : 1})

DeleteResult({'n': 14, 'electionId': ObjectId('7fffffff0000000000000156'), 'opTime': {'ts': Timestamp(1708533659, 28), 't': 342}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1708533659, 28), 'signature': {'hash': b'A\x95\xc4\xbc\x856i\xa2v\x18{\xe7\x93~\x9a\xf8R\x045\xad', 'keyId': 7286506562779086851}}, 'operationTime': Timestamp(1708533659, 28)}, acknowledged=True)

In [52]:
# Parse the Mahabharata CSV into Document objects with MahabharataCSVReader and
# display the first one as a spot check.
MAHABHARATA_OUT_FILE = "../data/mahabharata/data/mahabharata_parsed.csv"
documents = MahabharataCSVReader().load_data(MAHABHARATA_OUT_FILE)
documents[0]

Document(id_='1.Chapter Eight', embedding=None, metadata={'parva_id': 1, 'parva': 'Adi Parva', 'chapter_id': 'Chapter Eight', 'chapter_title': 'The Preceptor Drona', 'scripture': 'mahabharata'}, excluded_embed_metadata_keys=['file_path', 'parva_id'], excluded_llm_metadata_keys=['file_path', 'parva_id'], relationships={}, text='Seeing the princes enter adolescence, Maharaja Bhishma began searching for a suitable teacher to tutor them in the science of warfare. One  the heroic princes came out of Hastinapura. to play ball, and they roamed the forest areas absorbed in the ecstasy of young boys. When the ball fell into a well, they all tried their best to retrieve it, but found it impossible.\n\nAs they were looking in the well, a brahmana appeared who had just finished his daily performance of agnihotra (fire sacrifice). Seeing the princes unsuccessful in retrieving the ball, the brahmana, whose name was Dronacharya, approached them. He was dressed in white robes and looked thin and efful

In [53]:

# Vector store over the "knowledge" collection of the "test" database, reusing the
# MongoClient created earlier.
# NOTE(review): index_name presumably names the Atlas vector search index that must
# already exist on the collection — confirm.
store = MongoDBAtlasVectorSearch(client, db_name="test", collection_name="knowledge", id_key="id_", index_name="vector_index")
# Storage context that routes vector storage to the MongoDB-backed store.
storage_context = StorageContext.from_defaults(vector_store=store)

In [54]:
# Build a VectorStoreIndex from the loaded documents, using the storage context
# that wraps the MongoDB vector store.
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
# Display the index's summary attribute.
index.summary

'None'

In [41]:
# Ask the index built above a question through its default query engine.
query_engine = index.as_query_engine()
response = query_engine.query("Who was Pandavas?")
print(response)

The Pandavas were the sons of Pandu, a king in the Mahabharata epic. They were Yudhishthira, Bhima, Arjuna, Nakula, and Sahadeva.


In [55]:
# Reload the index with load_index_from_storage, reusing the existing
# storage_context (no new context is created here), then run the same question.
# Note: this overwrites query_engine and response from the earlier query cell.
new_index = load_index_from_storage(storage_context)
query_engine = new_index.as_query_engine()
response = query_engine.query("Who was Pandavas?")
print(response)

The Pandavas were the sons of King Pandu, born to his two wives Kunti and Madri. They were known for their righteousness, valor, and close association with Lord Krishna. The Pandavas played a significant role in the epic Mahabharata, particularly in the Kurukshetra War, where they fought against their cousins, the Kauravas, for their rightful claim to the throne.


In [43]:
# Repeat the reload-and-query step with separately named variables and a
# reworded question ("were" instead of "was").
new_index_2 = load_index_from_storage(storage_context)
query_engine_2 = new_index_2.as_query_engine()
response_2 = query_engine_2.query("Who were Pandavas?")
print(response_2)

The Pandavas were a group of five brothers - Yudhishthira, Bhima, Arjuna, Nakula, and Sahadeva. They were central characters in the Indian epic Mahabharata.
