## Data Ingestion   

In [1]:
from langchain.schema import Document
from langchain_core.documents import Document


In [2]:
# Build a Document by hand. The metadata dict is optional; in a RAG pipeline
# it records where the text came from and can be used to filter documents.
document = Document(
    page_content="Hello, world!",
    metadata=dict(
        source="example.pdf",
        pages=1,
        author="Sunil J",
        date="2021-01-01",
    ),
)

print(document)

page_content='Hello, world!' metadata={'source': 'example.pdf', 'pages': 1, 'author': 'Sunil J', 'date': '2021-01-01'}


In [3]:
### Using text loader to load the data from the file
# TextLoader is imported from langchain_community: the legacy
# `langchain.document_loaders` path is deprecated, and the community package
# is what the other loader cells in this notebook already import from.
from langchain_community.document_loaders import TextLoader

# Read the file as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
loader = TextLoader("../data/koh_samet.txt", encoding="utf-8")
documents = loader.load()

print(documents)

[Document(metadata={'source': '../data/koh_samet.txt'}, page_content='Koh Samet is a beautiful tropical island located about three hours from Bangkok, known for its stunning white sandy beaches, clear turquoise waters, and peaceful atmosphere. This small island offers a perfect escape with its lush jungles, rocky outcrops, fresh seafood, and a mix of relaxing bars and resorts, making it an ideal destination for both adventure and relaxation.\n\nIntroduction to Koh Samet\n\nKoh Samet is a charming island situated in the Gulf of Thailand, within Rayong province. It lies approximately 220 km from Bangkok and just 75 km from Pattaya, making it a popular weekend getaway for locals and travelers seeking a break from city life. Despite its proximity to major cities, Koh Samet remains a hidden gem, known for its idyllic beaches, fresh seafood, and laid-back vibe.\n\nGetting There\n\nFrom Bangkok to Ban Phe Pier: The journey to Koh Samet begins with a 2-3 hour trip from Bangkok to Ban Phe Pier 

In [4]:
### Using directory loader to load the text files from the directory
from langchain_community.document_loaders import DirectoryLoader

# Load the files under ../data that match *.txt, using TextLoader for each one.
loader = DirectoryLoader(
    "../data",
    glob="*.txt",
    loader_cls=TextLoader,
    # Forwarded to every TextLoader: decode as UTF-8 explicitly, matching the
    # single-file load, instead of relying on the platform default encoding.
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,
)
documents = loader.load()

print(documents)

100%|██████████| 2/2 [00:00<00:00, 561.98it/s]

[Document(metadata={'source': '../data/koh_samet.txt'}, page_content='Koh Samet is a beautiful tropical island located about three hours from Bangkok, known for its stunning white sandy beaches, clear turquoise waters, and peaceful atmosphere. This small island offers a perfect escape with its lush jungles, rocky outcrops, fresh seafood, and a mix of relaxing bars and resorts, making it an ideal destination for both adventure and relaxation.\n\nIntroduction to Koh Samet\n\nKoh Samet is a charming island situated in the Gulf of Thailand, within Rayong province. It lies approximately 220 km from Bangkok and just 75 km from Pattaya, making it a popular weekend getaway for locals and travelers seeking a break from city life. Despite its proximity to major cities, Koh Samet remains a hidden gem, known for its idyllic beaches, fresh seafood, and laid-back vibe.\n\nGetting There\n\nFrom Bangkok to Ban Phe Pier: The journey to Koh Samet begins with a 2-3 hour trip from Bangkok to Ban Phe Pier 




In [5]:
# Load the population spreadsheet from the data/excel folder with LangChain's
# unstructured Excel loader.
from langchain_community.document_loaders import UnstructuredExcelLoader

loader = UnstructuredExcelLoader(
    file_path="../data/excel/top_10_countries_population.xlsx",
)
documents = loader.load()

# Print how many Documents were produced, then leave the list itself as the
# cell's displayed result.
print(len(documents))
documents

1


[Document(metadata={'source': '../data/excel/top_10_countries_population.xlsx'}, page_content='Rank Country Population (millions) 1 China 1425 2 India 1417 3 United States 339 4 Indonesia 277 5 Pakistan 240 6 Nigeria 223 7 Brazil 216 8 Bangladesh 174 9 Russia 144 10 Mexico 130')]

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class EmbeddingManager:
    """
    Manage a SentenceTransformer model and use it to embed documents.

    The model named by ``model_name`` is loaded as part of construction, so a
    model that cannot be loaded makes the constructor raise.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Store the model name and load the model immediately.

        Args:
            model_name: Name handed to ``SentenceTransformer``.
        Raises:
            Exception: Whatever ``load_model`` re-raises when loading fails.
        """
        self.model_name = model_name
        # Placeholder until load_model() assigns the real model.
        self.model = None
        self.load_model()

    def load_model(self):
        """
        Load ``self.model_name`` into ``self.model`` and report progress.

        Raises:
            Exception: Any error raised while loading the model or querying
                its embedding dimension is printed and then re-raised.
        """
        try:
            print(f"Loading model {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model {self.model_name} loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            # Report the failure for the notebook reader, but do not hide it.
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, documents: List[str]) -> np.ndarray:
        """
        Generate embeddings for the documents.

        Args:
            documents: List[str] of texts to embed. May be empty.
        Returns:
            np.ndarray of shape (n_documents, n_dimensions). For an empty
            input this is a (0, n_dimensions) array and the model's encoder
            is not invoked.
        """
        print(f"Generating embeddings for {len(documents)} documents")
        if len(documents) == 0:
            # Keep the documented 2-D shape for empty input rather than
            # depending on what the encoder returns for an empty batch.
            # float32 is assumed to match the encoder's output dtype -- TODO confirm.
            embeddings = np.empty(
                (0, self.model.get_sentence_embedding_dimension()),
                dtype=np.float32,
            )
        else:
            embeddings = self.model.encode(documents, show_progress_bar=True)
        print(f"Embeddings generated for {len(documents)} documents with shape {embeddings.shape}")
        return embeddings

# Create the notebook's embedding manager (construction loads the model right
# away) and leave it as the cell's displayed result.
embeddings_manager = EmbeddingManager(model_name="all-MiniLM-L6-v2")
embeddings_manager
    

Loading model all-MiniLM-L6-v2
Model all-MiniLM-L6-v2 loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x12d693510>