In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate
import re
import os

In [2]:
pdf_dir = "./study-materials"
md_dir  = "./loaded-materials"
os.makedirs(md_dir, exist_ok=True)
pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith(".pdf")]

In [3]:
def extract(pdf_list): #Load and parse the documents
    all_docs = []
    for pdf_path in pdf_list:
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()
        all_text  = ""
        for doc in docs:
            all_text += "\n" + doc.page_content.strip()
        all_text = re.sub(r'\n\s*\n', '\n\n', all_text)
        fn = os.path.splitext(os.path.basename(pdf_path))[0]
        out_md = os.path.join(md_dir, fn + ".md")
        with open(out_md, "w", encoding="utf-8") as f:
            f.write(all_text)
        all_docs.append(
            Document(
                page_content=all_text,
                metadata={"source": fn + ".md"}
            )
        )
    return all_docs

In [4]:
docs = extract(pdf_files)

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200, add_start_index = True) #Split into chunks

chunks = text_splitter.split_documents(docs)

In [6]:
len(chunks) #Total number of chunks

141

In [7]:
chunks[10].page_content

'• The celestial bodies produce tides having minimum\ntidal heights, which are called neap tides.\n• Neap tides are produced twice in a lunar month.\nDownloaded from Ktunotes.in\nORIGIN OF TIDES (CONT..)\nTIDAL VARIATIONIN A LUNAR MONTH\n• The tidal variation resulting in spring and neap tides in a lunar month is shown in Figure 10.5.\nDownloaded from Ktunotes.in\nDownloaded from Ktunotes.in\nDownloaded from Ktunotes.in\nDownloaded from Ktunotes.in\nTIDALENERGY\n• The source of tidal energy is the difference in water levels between high tides and low or ebb\ntides.\n• During one tidal cycle of 12 h and 25 min, the water level rises to maximum during high tides,\nthen it starts falling to a minimum level at low tides and finally it starts rising again to a\nmaximum level at high tides.\n• The difference in water levels between high tides and low tides is the potential energy of water\nwhich can be harnessed.\n• This harnessed energy is called the tidal energy.'

In [8]:
chunks[10].metadata #extra information attached to a document or chunk of text that helps with searching, filtering, and providing context

{'source': 'M2.md', 'start_index': 8008}

In [9]:
embeddings = OllamaEmbeddings(model ='nomic-embed-text')

In [10]:
embeddings.embed_query(chunks[10].page_content)

[0.047058195,
 0.019791763,
 -0.19020984,
 -0.036460686,
 0.025702802,
 0.062759094,
 -0.03229794,
 -0.046597857,
 0.031628072,
 -0.0260503,
 0.021496324,
 0.011153604,
 0.11815011,
 0.01759029,
 0.013566668,
 -0.028037013,
 -0.033610493,
 -0.045329046,
 0.054689653,
 0.040058274,
 -0.049809072,
 -0.03199032,
 -0.053220894,
 -0.025365777,
 0.03262385,
 0.048402183,
 -0.025943598,
 -0.016307263,
 0.0088678645,
 -0.016046504,
 0.08633033,
 -0.04802753,
 0.049329888,
 -0.013106844,
 -0.09175134,
 -0.041075863,
 0.025234427,
 0.0057956846,
 0.010243117,
 0.060715355,
 0.04360279,
 -0.04221057,
 0.019113388,
 -0.003505646,
 -0.012773135,
 -0.01888331,
 0.062484145,
 0.002529438,
 -0.010674088,
 -0.08215899,
 0.056376193,
 -0.009966292,
 -0.029170165,
 -0.042779572,
 0.030268336,
 0.04422044,
 0.017114904,
 -0.03160066,
 0.04168988,
 0.0032196448,
 0.11986659,
 0.045496024,
 0.0018575822,
 0.052463353,
 0.027122099,
 0.0024777695,
 -0.025822036,
 0.056055788,
 0.013726163,
 -0.0131377,
 0.02

In [11]:
persist_dir = "./chroma" #where vector database is stored
vectordb = Chroma.from_documents(
    documents = chunks,
    embedding = embeddings,
    persist_directory = persist_dir, #for persistent storage
    collection_name="study-docs"
)

In [12]:
vectordb.persist()

  vectordb.persist()


In [13]:
ids = vectordb.add_documents(documents = chunks) #assign ids to chunks for future adding and removal of chunks

In [14]:
results = vectordb.similarity_search(
    "Differentiate between flat plate collectors and solar concentrators.",
    k = 1 #returns the first closest chunk increase to get more similar chunks
)

In [15]:
results

[Document(metadata={'source': 'M1.md', 'start_index': 17476}, page_content='SOLAR COLLECTORS\nDownloaded from Ktunotes.in\nFlat plate collector\n• A flat plate collector is simple in construction and does not require sun tracking. Therefore, it can be\nproperly secured on a rigid platform and thus becomes mechanically stronger than those requiring\nflexibility for tracking purpose.\n• As the collector is installed outdoors and exposed to atmospheric disturbances (rain, storm, etc.), the\nflat plate type is more likely to withstand harsh outdoor conditions. Also because of simple\nstationary design, a flat plate collector requires little maintenance.\n• The principal disadvantageof flat plate collector is that because of absence of optical concentration,\nthe area from which heat is lost is large. Also, due to same reason high temperatures cannot be\nattained.\nComponents of the flat plate collector\n\uf0d8A flat plate collector consists of following essential components:\ni. Absorber p

In [16]:
results = vectordb.similarity_search_with_score(
    "What are the factors that affect biogas generation?",
    k = 1
)

In [17]:
doc, score = results[0]
print(f"Score: {score}\n")
print(doc)

Score: 0.506308376789093

page_content='Downloaded from Ktunotes.in
3) Temperature
• Temperature affects bacterial activity; methane formation is optimum in the 
35°-38°C. Biogas production decreases below 20°C and stops at 8°C. In 
cold regions canopy is built over the biogas plants to maintain the desired 
temperature. Thot regions, another micro-organism called "thermophilic' is 
utilized for anaerobic fermentation in the temperature range 550C-60°C. 
Gas production rises with the increase in average ambient air temperature. 
As the temperature increases, the total retention period decreases and vice-
versa. However, the total gas production remains practically the same.
4) Seeding
• Cattle dung contains both acid forming bacteria and methane forming 
bacteria. Acid forming bacteria multiply fast, while the methane forming 
bacteria grow slowly. To start and accelerate fermentation, seeding of 
methane forming bacteria is required) Accordingly, a small quantity of' metadata={'start_

In [18]:
question = "Discuss the effect of temperature and insolation on the characteristics of solar cell."

In [19]:
retrieved_docs = [doc for doc, _ in results]

In [20]:
context = "\n\n---\n\n".join(
    f"Source: {doc.metadata.get('source')}\n{doc.page_content.strip()}"
    for doc in retrieved_docs
)

In [21]:
context

'Source: M4.md\nDownloaded from Ktunotes.in\n3) Temperature\n• Temperature affects bacterial activity; methane formation is optimum in the \n35°-38°C. Biogas production decreases below 20°C and stops at 8°C. In \ncold regions canopy is built over the biogas plants to maintain the desired \ntemperature. Thot regions, another micro-organism called "thermophilic\' is \nutilized for anaerobic fermentation in the temperature range 550C-60°C. \nGas production rises with the increase in average ambient air temperature. \nAs the temperature increases, the total retention period decreases and vice-\nversa. However, the total gas production remains practically the same.\n4) Seeding\n• Cattle dung contains both acid forming bacteria and methane forming \nbacteria. Acid forming bacteria multiply fast, while the methane forming \nbacteria grow slowly. To start and accelerate fermentation, seeding of \nmethane forming bacteria is required) Accordingly, a small quantity of'

In [22]:
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a highly effective tutor for students studying topics.

You have access to the following **study materials**:

{context}

When answering, do the following:
1. Understand the question.
2. Use **only** the information in the context—do not hallucinate.
3. Whenever you refer to learned content, include the **source filename** in parentheses (e.g. “...as shown in M1.md”).
4. Provide a **step-by-step explanation** that helps students understand the reasoning.
5. **Explain** clearly (use bullet-points, examples, analogies).
6. **Plan** your answer briefly (e.g., “I'll explain X, then compare features.”).
7. **Conclude** with a summary.
8. If the answer is incomplete or unclear, say:
   *“I'm not sure — I'd need to review more materials.”*
---

**Question:**  
{question}

---

**Answer (as tutor):**
"""
)

In [23]:
llm = Ollama(model="llama3")
message = prompt.format_prompt(context=context, question=question).to_string()
response_text = llm.invoke([{"role": "user", "content": message}])
print(response_text)

  llm = Ollama(model="llama3")


Let's break down this question step by step!

**Plan:**

* Discuss the relationship between temperature and biogas production.
* Explore how temperature affects microbial activity.
* Briefly mention the connection to solar cells.

**Temperature Effect:**

According to M4.md, temperature plays a significant role in bacterial activity. Biogas production is optimum at temperatures ranging from 35°-38°C (M4.md). Below 20°C, biogas production decreases, and it stops altogether at 8°C. In cold regions, canopies are built over biogas plants to maintain the desired temperature.

In warm regions, thermophilic microorganisms are used for anaerobic fermentation within a temperature range of 55°-60°C (M4.md). As temperature increases, gas production rises, and the total retention period decreases. However, the total gas production remains relatively constant.

**Insolation Connection:**

While insolation is not directly mentioned in M4.md, we can draw an indirect connection to solar cells. Solar p