In [1]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from dotenv import load_dotenv
import os

In [2]:
# Load every file matching **/*.html under the document folder, parsing each
# one with UnstructuredHTMLLoader; the last line displays how many were loaded.
loader = DirectoryLoader(
    "/Users/hritik/Desktop/Grading Doc RAG/Document",
    glob="**/*.html",
    loader_cls=UnstructuredHTMLLoader,
)
docs = loader.load()
len(docs)

166

In [3]:
def remove_extra_whitespace(sentence):
    """Return ``sentence`` with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is dropped as a side effect of splitting
    on whitespace and re-joining the resulting words.
    """
    words = sentence.split()
    return ' '.join(words)



# Whitespace-normalised text of every loaded document, in the same order as `docs`.
page_texts = (doc.page_content for doc in docs)
clean_docs = list(map(remove_extra_whitespace, page_texts))

In [4]:
# Overwrite each document's text in place with its cleaned counterpart.
for idx, doc in enumerate(docs):
    doc.page_content = clean_docs[idx]

In [5]:
# Inspect the last loaded document. A negative index is used instead of the
# literal 165 so the cell keeps working when the number of HTML files changes.
docs[-1]

Document(page_content='English | हिंदी | தமிழ் | తెలుగు | മലയാളം | ಕನ್ನಡ | मराठी | ગુજરાતી | বাংলা Academics Overall Structure Course Registrations Assessments Exam Cities Fee Structure Foundational Level Diploma Level BS Degree Level Sample Certificates Academic Calender Admissions Important Dates Mandatory Requirements Eligibility to Apply Application Process Admission to the Foundation Level 1. Regular Entry 2. JEE-based Entry International Students FAQ About IITM About IIT Madras Faculty Co-ordinators Contact Us SIGN IN Applications open now for May 2024 Batch. Application Close: May 28th, 2024 | Exam: Jul 07th, 2024 APPLY NOW Applications open now for May 2024 Batch. Application Close: May 26th, 2024 | Exam: Jul 07th, 2024 APPLY NOW Home Academics MA1101 Foundation Level Math for Electronics I by Prof. Andrew Thangaraj Course ID: MA1101 Course Credits: 4 Course Type: Foundation Pre-requisites: None What you’ll learnVIEW COURSE VIDEOS To introduce differential/integral calculus of 

In [6]:
# Read the Hugging Face API token from the .env file one directory above the
# notebook and build the embedding client used for indexing and querying.
dotenv_path = '../.env'
load_dotenv(dotenv_path)
HUGGINGFACEHUB_API_TOKEN = os.getenv("HF_TOKEN")
# os.getenv returns None when the variable is missing, and assigning None into
# os.environ fails with an unhelpful TypeError, so fail early with a clear message.
if not HUGGINGFACEHUB_API_TOKEN:
    raise RuntimeError(
        f"HF_TOKEN is not set; define it in the environment or in {dotenv_path!r}"
    )
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
)

In [7]:
# Embed the cleaned HTML documents and persist them in the on-disk Chroma store.
# Explicit, deterministic ids make the cell safe to re-run: without them each
# run gets fresh random ids and appends another full copy of the documents to
# the persisted collection.
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    ids=[f"html-{position}" for position in range(len(docs))],
    persist_directory='vectordb/chroma/'
)

In [8]:
# Sanity check: number of entries currently held by the underlying collection.
html_entry_count = vectordb._collection.count()
print(html_entry_count)

166


In [9]:
# Query text used for the similarity search in the following cell.
ques = "paradox"

In [10]:
# Retrieve the three entries closest to the query.
# NOTE(review): this rebinds `docs`, which previously held the loaded HTML
# documents; re-running the indexing cell afterwards would index these hits.
docs = vectordb.similarity_search(query=ques, k=3)

In [11]:
# Display the retrieved documents.
docs

[Document(page_content='Paradox\'22 Previous Next Sports Interactive Sessions Cultural Events Workshops Company Discussions Sponsors PARADOX\'22 Student Festival Organised by the students of IITM BS Degree Program at the IIT Madras Campus 20th, 21st & 22nd May 2022 1500+ Programming & Data Science students of IIT Madras attended the first on-campus three day festival, which was fully curated and organised by the students. This was a huge platform for interaction between fellow students, faculty, industry experts, and companies. Paradox \'22 also marked the formal distribution of provisional diploma certificates to 101 students, including 17 students from families of income less than ₹1 lakh per annum. Sports Previous Next As an icebreaker and warm-up to the three day festival, a whole series of games (chess, table tennis, box cricket, football 5s, etc.) were organised where students competed against each other. On day two, over 700 participants woke up early for a 5km run / walk explor

In [12]:
import fitz
from tqdm.auto import tqdm
import re
import spacy

# Location of the student handbook PDF that is read page by page below.
# NOTE(review): absolute, machine-specific path; it must be adjusted before the
# notebook can run anywhere else.
pdf_path = '/Users/hritik/Desktop/Grading Doc RAG/Document/IITM BS Degree Programme - Student Handbook - Latest.pdf'

def text_formatter(text: str) -> str:
    """Flatten ``text`` onto one line: newlines become spaces, then the ends are trimmed."""
    return " ".join(text.split("\n")).strip()

def open_and_read_pdf(pdf_path: str, page_offset: int = 3) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.
        page_offset (int): Value subtracted from the zero-based page index to
            produce the reported ``page_number``. Defaults to 3, the offset
            this function has always applied.

    Returns:
        list[dict]: One dictionary per page with the keys ``page_number``
        (zero-based index minus ``page_offset``), ``page_char_count``,
        ``page_word_count``, ``page_sentence_count_raw``, ``page_token_count``
        (estimated as characters / 4) and ``text`` (the formatted page text).
    """
    pages_and_texts = []
    # The context manager closes the PDF handle even if extraction fails part-way.
    with fitz.open(pdf_path) as doc:
        # Passing the page count lets tqdm render a real progress bar instead of a bare counter.
        for page_index, page in tqdm(enumerate(doc), total=doc.page_count):
            text = text_formatter(page.get_text())  # plain text of the page, flattened to one line
            pages_and_texts.append({"page_number": page_index - page_offset,
                                    "page_char_count": len(text),
                                    # Splitting on a single space: an empty page still counts as 1.
                                    "page_word_count": len(text.split(" ")),
                                    "page_sentence_count_raw": len(text.split(". ")),
                                    "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                    "text": text})
    return pages_and_texts

# Extract the handbook page by page and preview the first two page records.
pages_and_texts = open_and_read_pdf(pdf_path)
print(pages_and_texts[:2])
    

  from .autonotebook import tqdm as notebook_tqdm
42it [00:00, 380.09it/s]

[{'page_number': -3, 'page_char_count': 119, 'page_word_count': 16, 'page_sentence_count_raw': 1, 'page_token_count': 29.75, 'text': 'IITM BS Degree Office, 3rd Floor, ICSR Building, IIT Madras, Chennai - 600036 support@study.iitm.ac.in Student Handbook'}, {'page_number': -2, 'page_char_count': 1059, 'page_word_count': 173, 'page_sentence_count_raw': 11, 'page_token_count': 264.75, 'text': 'Please read this Dear student, This Student Handbook is intended to provide IIT Madras BS Degree students with the information and policies they should be aware of, which may help them make the most of the opportunities offered in this programme. It also gives you formal notification and explanation of the programme’s regulations, policies and procedures. It is essential, and your responsibility, to read it and familiarize yourself with the content. It should be kept handy and referred to during your time with us. This handbook may be revised time and again. While this student handbook was prepared 




In [13]:
# Splitter configured for chunks of up to 1000 characters (measured with len)
# with a 100-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)

# Store each page's chunks on its record under the "chunks" key.
for page_record in tqdm(pages_and_texts):
    page_record["chunks"] = text_splitter.split_text(page_record["text"])

100%|██████████| 42/42 [00:00<00:00, 2317.90it/s]


In [14]:
from langchain.schema import Document

# Embedding client for the handbook chunks (same model and token as before).
embeddings = HuggingFaceInferenceAPIEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",
    api_key=HUGGINGFACEHUB_API_TOKEN,
)

# One Document per chunk, tagged with the page number it came from.
documents = [
    Document(page_content=chunk, metadata={"page_number": item["page_number"]})
    for item in pages_and_texts
    for chunk in item["chunks"]
]

In [15]:
# Embed the handbook chunks and persist them in the on-disk Chroma store.
# Explicit, deterministic ids make the cell safe to re-run: without them each
# run gets fresh random ids and appends another full copy of the chunks to
# the persisted collection.
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    ids=[f"handbook-{position}" for position in range(len(documents))],
    persist_directory='vectordb/chroma/'
)

In [16]:
# Sanity check: total entries in the collection after adding the handbook chunks.
total_entry_count = vectordb._collection.count()
print(total_entry_count)

260


In [17]:
# Ask a fee question and show the text of the closest of the three hits.
ques = "what is the course fee for software engineering?"
docs = vectordb.similarity_search(query=ques, k=3)
best_match = docs[0]
best_match.page_content

'4.1 For students who are outside India: Facilitation fee for exams is in addition to the above for candidates writing exams overseas - as fixed up with the local exam partner there. These are subject to periodic changes depending on the local exam partner identified. Facilitation Fee Quiz1 [irrespective of num of courses] = Rs.2000/- Quiz2 [irrespective of num of courses] = Rs.2000/- End term [per session] = Rs.2000/- [one course it is Rs.2000/- & more than one course it is Rs. 4000/-] E.,g If you opt for 4 courses Quiz1 = 2000, Quiz2 = 2000, EndTerm = 4000 Totally 8000 to be paid as a facilitation fee *Facilitation fee may vary based on country. Eg: Kuwait & Bahrain it is Rs. 3000/- 5. Admission to the programme 5.1 Regular Entry into Foundation level To enter the Foundation level, applicants have to pass the Qualifier exam. The Foundation courses allow for interested applicants with a variety of educational backgrounds to be trained in the basics (Math, Statistics, Computational'