<a href="https://colab.research.google.com/github/keyaaness/RAG-from-Scratch/blob/main/RAG_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# import necessary libraries
import os
import re
from transformers import AutoTokenizer, AutoModel
import uuid
import torch
import numpy as np
import json
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from google.colab import drive

In [15]:
# Mount Google Drive (Colab) at /content/drive so that Drive folders can be
# accessed through ordinary filesystem paths below that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# step 1 : chunking
def _pack_words_into_chunks(paragraph, tokenizer, chunk_size, separator):
    """Greedily pack the words of one paragraph into token-bounded chunks.

    Words are appended to the current chunk while the token count of the
    resulting string (``len(tokenizer.tokenize(...))``) stays within
    ``chunk_size``; otherwise the current chunk is closed and the word starts
    a new one. Returns the list of chunk strings (empty for an empty paragraph).
    """
    packed = []
    current = ""
    for word in paragraph.split(separator):
        candidate = current + separator + word if current else word
        if len(tokenizer.tokenize(candidate)) <= chunk_size:
            current = candidate
        else:
            if current:
                packed.append(current)
            # A single word that is over the limit on its own still becomes a chunk.
            current = word
    if current:
        packed.append(current)
    return packed


def chunking(directory_path, tokenizer, chunk_size, para_seperator=" /n /n", separator=" "):
    """Split every file in a directory into token-bounded text chunks.

    Each regular file directly inside ``directory_path`` is read as UTF-8,
    split into paragraphs with ``re.split(para_seperator, text)`` and each
    paragraph is packed word by word into chunks of at most ``chunk_size``
    tokens.

    Args:
        directory_path: Folder to scan. Entries that are not regular files
            (e.g. sub-directories) are skipped.
        tokenizer: Object exposing ``tokenize(str)`` whose result has a length.
        chunk_size: Maximum number of tokens allowed in one chunk.
        para_seperator: Pattern handed to ``re.split`` to separate paragraphs
            (it is interpreted as a regular expression).
            NOTE(review): the default is the literal text `` /n /n`` with
            forward slashes, which looks like a typo for two newline
            characters -- confirm. It is left unchanged to keep the interface.
        separator: String used both to split a paragraph into words and to
            join words back into a chunk.

    Returns:
        dict: ``{doc_id: {chunk_id: {"text": str, "metadata": {"file_name": str}}}}``
        with one freshly generated UUID ``doc_id`` per file and one UUID
        ``chunk_id`` per chunk. ``file_name`` is the file name without its
        extension.
    """
    documents = {}
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        print(filename)
        if not os.path.isfile(file_path):
            # Nothing to read; registering a document here would reuse (or
            # fail to find) the previous file's doc_id.
            continue

        sku = os.path.splitext(filename)[0]
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        doc_id = str(uuid.uuid4())
        # One chunk dict per file, so a doc_id maps only to its own chunks
        # instead of a dict shared by every document.
        doc_chunks = {}
        for paragraph in re.split(para_seperator, text):
            for chunk_text in _pack_words_into_chunks(paragraph, tokenizer, chunk_size, separator):
                chunk_id = str(uuid.uuid4())
                doc_chunks[chunk_id] = {"text": chunk_text, "metadata": {"file_name": sku}}
        documents[doc_id] = doc_chunks
    return documents

In [17]:
# step 2 : generating embeddings
def map_document_embeddings(documents, tokenizer, model):
    """Build an embedding store that mirrors the structure of ``documents``.

    Args:
        documents: ``{doc_id: {chunk_id: record}}`` where each record is a
            mapping whose ``"text"`` entry is the string to embed.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", padding=True, truncation=True)``;
            its result is unpacked into ``model`` as keyword arguments.
        model: Callable whose return value exposes ``last_hidden_state``.

    Returns:
        dict: ``{doc_id: {chunk_id: embedding}}`` where ``embedding`` is
        ``last_hidden_state`` averaged over dimension 1, squeezed and
        converted with ``tolist()``.
    """
    def _embed(record):
        encoded = tokenizer(record.get("text"), return_tensors="pt", padding=True, truncation=True)
        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            pooled = hidden_states.mean(dim=1)
            return pooled.squeeze().tolist()

    return {
        doc_key: {chunk_key: _embed(record) for chunk_key, record in chunk_map.items()}
        for doc_key, chunk_map in documents.items()
    }

In [18]:
def read_json(path):
    """Open the file at ``path`` for reading and return its parsed JSON content."""
    with open(path, 'r') as handle:
        return json.load(handle)

In [19]:
def save_json(path, data):
    """Serialize ``data`` as JSON (4-space indent) into the file at ``path``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(path, 'w') as handle:
        json.dump(obj=data, fp=handle, indent=4)

In [20]:
if __name__ == "__main__":
    # End-to-end driver: chunk the documents folder, embed every chunk, persist
    # both stores as JSON on Drive, then retrieve the chunks closest to a query.
    drive_base_path = "/content/drive/My Drive"
    documents_folder = os.path.join(drive_base_path, "documents")
    database_folder = os.path.join(drive_base_path, "database")
    os.makedirs(database_folder, exist_ok=True)

    model_name = "BAAI/bge-small-en-v1.5"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    chunk_size = 200
    # NOTE(review): this is the literal text " /n /n" (forward slashes), not
    # newline characters -- looks like a typo for a blank-line separator; confirm.
    para_seperator = " /n /n"
    separator = " "
    top_k = 2
    # Read the key from the OPENAI_API_KEY environment variable so it never has
    # to be pasted into the notebook; the original placeholder is the fallback.
    openai_model = ChatOpenAI(
        model="gpt-3.5-turbo",
        openai_api_key=os.environ.get("OPENAI_API_KEY", 'your_api_key'),
    )

    # create document store with chunk id, doc_id, text
    documents = chunking(documents_folder, tokenizer, chunk_size, para_seperator, separator)

    # now embed and map to database
    mapped_document_db = map_document_embeddings(documents, tokenizer, model)

    # save JSON to google drive
    save_json(os.path.join(database_folder, "doc_store_2.json"), documents)
    save_json(os.path.join(database_folder, "vector_store_2.json"), mapped_document_db)

    # retrieving most relevant data chunks
    # NOTE(review): compute_embeddings, retrieve_top_k_scores, retrieve_top_results
    # and retrieve_text are not defined in any cell above this one; they must be
    # defined before this cell runs for Restart & Run All to work -- confirm order.
    query = "why toddlers throw tantrums?"
    query_embeddings = compute_embeddings(query, tokenizer, model)
    sorted_scores = retrieve_top_k_scores(query_embeddings, mapped_document_db, top_k)
    top_results = retrieve_top_results(sorted_scores)

    # reading JSON
    document_data = read_json(os.path.join(database_folder, "doc_store_2.json"))  # read document store

    # retrieve text of relevant chunk embeddings
    relevant_text = retrieve_text(top_results, document_data)

    print(relevant_text)

behaviuor1.txt
behaviuor2.txt
behaviuor3.txt
{'text': "\n\nChildren throw tantrums when they are overwhelmed by strong emotions, overstimulated, or anxious. Young children may have tantrums because of strong emotions, while older children may have tantrums because they don't know how to express or manage their feelings. Overstimulation can also cause tantrums, as children don't always recognize when bright lights or loud noises are bothering them. Anxiety can also cause tantrums, such as when a child feels anxious due to an unexpected event, unrealistic demand, or lack of routine.\n \n\nTantrums may happen when kids are tired, hungry, or uncomfortable. They can have a meltdown because they can't have something they want (like a toy or candy) or can’t get someone to do what they want (like getting a parent to pay attention to them immediately or getting a sibling to give up the tablet). Learning to deal with frustration is a skill that children gain over", 'metadata': {'file_name': 'beh