In [1]:
# from PyPDF2 import PdfReader

# def extract_text_from_pdf(pdf_path):
#     text = ''
#     with open(pdf_path, 'rb') as file:
#         reader = PdfReader(file)
#         for page in reader.pages:
#             text += page.extract_text()
#     return text

# def write_to_markdown(text, markdown_path):
#     with open(markdown_path, 'w', encoding='utf-8') as file:
#         file.write(text)


# pdf_file_path = 'Thermo_Materials_Science_and_Technology.pdf'  # Path to your PDF file
# markdown_file_path = 'Thermo_Materials_Science_and_Technology.md'  # Path to the output Markdown file

# extracted_text = extract_text_from_pdf(pdf_file_path)
# write_to_markdown(extracted_text, markdown_file_path)


In [2]:
from langchain.document_loaders import DirectoryLoader
# import unstructured

DATA_PATH = "thermo_md"

def load_documents(data_path=None, glob_pattern="*.md"):
    """Load every matching file under a directory as LangChain documents.

    Args:
        data_path: Directory to scan. When omitted (``None``), the
            module-level ``DATA_PATH`` is used, which keeps the behaviour
            of calling ``load_documents()`` with no arguments unchanged.
        glob_pattern: Pattern forwarded to ``DirectoryLoader`` as ``glob``
            to select files; defaults to Markdown files (``"*.md"``).

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the selected files.
    """
    # Resolve the default at call time so a later change to DATA_PATH is
    # still picked up, just as when the constant is read inside the body.
    if data_path is None:
        data_path = DATA_PATH
    loader = DirectoryLoader(data_path, glob=glob_pattern)
    return loader.load()

documents = load_documents()

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks. Sizes are measured with
# len(), i.e. in characters; add_start_index=True records each chunk's offset
# in its source document under the "start_index" metadata key.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)
chunks = text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

# Preview one chunk as a sanity check. The index is clamped to the last chunk
# so a smaller corpus does not raise IndexError, and an empty result is
# reported instead of crashing the cell.
SAMPLE_CHUNK_INDEX = 140
if chunks:
    document = chunks[min(SAMPLE_CHUNK_INDEX, len(chunks) - 1)]
    print(document.page_content)
    print(document.metadata)
else:
    print("No chunks were produced; check DATA_PATH and the glob pattern.")

Split 2 documents into 7645 chunks.
process, what is the temperature of the propane leaving the
valve, in /H11034C?
4.54 A large pipe carries steam as a two-phase liquid–vapor
mixture at 1.0 MPa. A small quantity is withdrawn through a
throttling calorimeter, where it undergoes a throttling process
{'source': 'thermo_md\\Fundamentals-of-Engineering-Thermodynamics-by-Michael-J.Moran-Howard-N.-Shapiro.md', 'start_index': 28406}


In [4]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
# Embedding model used to vectorise every chunk before it is stored.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
import os
import shutil
import warnings

CHROMA_PATH = "chroma"

# Clear out the database first so the store is rebuilt from scratch.
if os.path.exists(CHROMA_PATH):
    # Deletion stays best-effort (ignore_errors=True), but a failed delete is
    # no longer silent: if the directory survives, the new store may end up
    # mixed with data left over from a previous run, so tell the user.
    shutil.rmtree(CHROMA_PATH, ignore_errors=True)
    if os.path.exists(CHROMA_PATH):
        warnings.warn(
            f"Could not fully remove {CHROMA_PATH!r}; the new store may be "
            "mixed with data left over from a previous run."
        )

# Create a new DB from the documents.
db = Chroma.from_documents(
    chunks, embeddings, persist_directory=CHROMA_PATH
)
db.persist()
print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Saved 7645 chunks to chroma.


In [5]:
# Echo the vector store object to confirm it was created.
db

<langchain_community.vectorstores.chroma.Chroma at 0x2683942c5d0>

In [24]:
# PROMPT_TEMPLATE = """
# Answer the question based only on the following context:

# {context}

# ---

# Answer the question based on the above context: {question}
# """

In [5]:
# Ad-hoc retrieval check: fetch the stored chunks most similar to the query.
# NOTE(review): this question is about quantum computing, while the cells
# above index the "thermo_md" directory; it looks like a leftover from an
# earlier experiment. Confirm it is still wanted.
query = "What are the drawbacks of Optical photon quantum computer?"
matching_docs = db.similarity_search(query)

# Number of documents returned by the search.
len(matching_docs)

4

In [6]:
# Inspect the second returned document (its text and source metadata).
matching_docs[1]

Document(page_content='systems, with concomitant beneﬁts and drawbacks for society as a whole.\nQuantum computation and quantum information certainly offer challenges aplenty\nto physicists, but it is perhaps a little subtle what quantum computation and quantum', metadata={'source': 'pdf_to_markdown\\quantum-computation-and-quantum-information-nielsen-chuang.md', 'start_index': 92327})

In [6]:
from langchain_community.llms import Ollama

# LLM used by the chains below: the "llama2" model accessed through Ollama.
# NOTE(review): presumably requires a reachable Ollama server with this model
# available; confirm before running.
llm = Ollama(model="llama2")
# Smoke test: a trivial prompt to check that the model responds.
llm.invoke("Tell me a joke")

"\nWhy don't scientists trust atoms? Because they make up everything! 😂"

In [7]:
## Design ChatPrompt Template
from langchain_core.prompts import ChatPromptTemplate
# The template text is assembled line by line. The leading empty entry and the
# trailing spaces on the instruction lines are part of the prompt and are
# reproduced exactly.
_prompt_lines = [
    "",
    "Answer the following question based only on the provided context. ",
    "Think step by step before providing a detailed answer. ",
    "I will tip you $1000 if the user finds the answer helpful. ",
    "<context>",
    "{context}",
    "</context>",
    "Question: {input}",
]
# Placeholders: {context} for the retrieved documents, {input} for the question.
prompt = ChatPromptTemplate.from_template("\n".join(_prompt_lines))

In [8]:
## Chain Introduction
## Create Stuff Document Chain
# Combines the LLM and the prompt template into a single documents chain.

from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain=create_stuff_documents_chain(llm,prompt)

In [9]:
"""
Retrievers: A retriever is an interface that returns documents given
 an unstructured query. It is more general than a vector store.
 A retriever does not need to be able to store documents, only to 
 return (or retrieve) them. Vector stores can be used as the backbone
 of a retriever, but there are other types of retrievers as well. 
 https://python.langchain.com/docs/modules/data_connection/retrievers/   
"""

# Expose the Chroma vector store through the retriever interface, using the
# default retriever settings (no arguments are passed).
retriever=db.as_retriever()
# Echo the retriever to show how it is configured.
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002683942C5D0>)

In [10]:
"""
Retrieval chain:This chain takes in a user inquiry, which is then
passed to the retriever to fetch relevant documents. Those documents 
(and original inputs) are then passed to an LLM to generate a response
https://python.langchain.com/docs/modules/chains/
"""
from langchain.chains import create_retrieval_chain
# Wire the retriever and the document chain into one retrieval pipeline.
retrieval_chain=create_retrieval_chain(retriever,document_chain)

In [11]:
# Run the full retrieval chain on one sample multiple-choice question.
response=retrieval_chain.invoke({"input":"Which of the following techniques is NOT used to grow single crystals of semiconductors?  (A) Calendering (B) Czochralski (C) Float zone (D) Bridgman"})

In [11]:
# Show the whole result: the input, the retrieved context documents and the answer.
response

{'input': 'Which of the following techniques is NOT used to grow single crystals of semiconductors?  (A) Calendering (B) Czochralski (C) Float zone (D) Bridgman',
 'context': [Document(page_content='Langasite group single crystals have been grown by many growing methods such as \nCzochralski (Cz) technique, Bridgeman method , floating zone (FZ) method, micro-pulling \ndown (\uf06d-PD) technique, as these crystals grow easily because of a low melting point around', metadata={'source': 'thermo_md\\Thermo_Materials_Science_and_Technology.md', 'start_index': 45219}),
  Document(page_content='Materials Science and Technology \n 268 \nC). The crystal growth is then disturbed by th e impingement of crystallites with each other \n(D). Our schematic model is consistent with  the nucleation-driven crystallization process \n(Zaou et al., 2000).', metadata={'source': 'thermo_md\\Thermo_Materials_Science_and_Technology.md', 'start_index': 636242}),
  Document(page_content='in a crucible with gradie

In [12]:
# Show only the generated answer text.
response['answer']


'Based on the provided context, the answer to the question is (D) Bridgman. The context describes various techniques used for growing single crystals of semiconductors, including Czochralski (Cz), floating zone (FZ), and micro-pulling down (μPD) techniques. However, there is no mention of the Bridgman technique in the provided text. Therefore, option (D) is the correct answer.'

In [12]:
# Reading JSON file

import json

# Read JSON data from file.
# The encoding is given explicitly: JSON text is expected to be UTF-8, and
# without it open() falls back to the platform default, which can garble
# non-ASCII characters in the questions (e.g. "μ") on some systems.
with open('questions.json', 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Each top-level section keeps its list of questions under a "questions" key.
thermo_questions = json_data['Thermodynamics']['questions']
mat_char_questions = json_data['Material characterization']['questions']
print(thermo_questions)

['Which of the following techniques is NOT used to grow single crystals of semiconductors?  (A) Calendering (B) Czochralski (C) Float zone (D) Bridgman', 'Which of the following signals is produced due to the elastic scattering of electrons by a material?  (A) Secondary electron (B) Backscattered electron (C) Auger electron (D) Photoelectron', 'Of the following materials, which is the most suitable for an LED emitting at around 380 nm?  (A) Direct bandgap material with a small bandgap (B) Indirect bandgap material with a large bandgap (C) Direct bandgap material with a large bandgap (D) Indirect bandgap material with a small bandgap ', 'Which material has the lowest specific heat capacity at room temperature?  (A) Water (B) Mercury (C) Copper (D) Silver', 'The band gap of a semiconducting material used to make an LED is 1.43 eV. What will be the minimum wavelength ofthe radiation emitted by this LED, in μm?', 'At room temperature, the typical barrier potential for silicon p-n junction 

### Material Characterization Questions

In [13]:
# Run the retrieval chain on the first five material-characterization
# questions, printing the index, the raw response and the answer for each.
# enumerate() supplies the index, so there is no hand-maintained counter that
# could drift from the loop position.
for question_no, question in enumerate(mat_char_questions[:5]):
    # Each question is passed straight through as the chain's "input".
    response = retrieval_chain.invoke({"input": question})
    print(question_no)
    print(response)
    print("Answer: ", response['answer'])
    print("\n")

0
{'input': 'Microstrain can be measured by X-ray diffraction using peak  (A) Area and intensity (B) Position and area (C) Broadening and intensity (D) Position and broadening ', 'context': [Document(page_content='measurement with high repetition  rate using microbeams enabled us to obtain a diffraction \nintensity profile within an hour for one disk sample, and to get systematic data.', metadata={'source': 'thermo_md\\Thermo_Materials_Science_and_Technology.md', 'start_index': 629273}), Document(page_content='respectively. In addition, an X-ray microb eam technique with a phase zone plate was \napplied to achieve a few micron X-ray beam size, since the smaller beam gives a larger \nnumber of data for one disk. The laser beam diameter on the sample was adjusted to be', metadata={'source': 'thermo_md\\Thermo_Materials_Science_and_Technology.md', 'start_index': 628410}), Document(page_content='different techniques  described below. \n2.2.1 X-ray diffraction \nX-ray diffraction technique 

### Thermodynamics Questions

In [14]:
# Run the retrieval chain on every thermodynamics question, printing the
# index, the raw response and the answer for each.
# enumerate() supplies the index, so there is no hand-maintained counter that
# could drift from the loop position.
for question_no, question in enumerate(thermo_questions):
    # Each question is passed straight through as the chain's "input".
    response = retrieval_chain.invoke({"input": question})
    print(question_no)
    print(response)
    print("Answer: ", response['answer'])
    print("\n")

{'input': 'Which of the following techniques is NOT used to grow single crystals of semiconductors?  (A) Calendering (B) Czochralski (C) Float zone (D) Bridgman', 'context': [Document(page_content='Langasite group single crystals have been grown by many growing methods such as \nCzochralski (Cz) technique, Bridgeman method , floating zone (FZ) method, micro-pulling \ndown (\uf06d-PD) technique, as these crystals grow easily because of a low melting point around', metadata={'source': 'thermo_md\\Thermo_Materials_Science_and_Technology.md', 'start_index': 45219}), Document(page_content='Materials Science and Technology \n 268 \nC). The crystal growth is then disturbed by th e impingement of crystallites with each other \n(D). Our schematic model is consistent with  the nucleation-driven crystallization process \n(Zaou et al., 2000).', metadata={'source': 'thermo_md\\Thermo_Materials_Science_and_Technology.md', 'start_index': 636242}), Document(page_content='in a crucible with gradient te

In [14]:
# Continue with the thermodynamics questions from index 5 onward.
# The counter starts at the slice offset so the printed number is the
# question's position in `thermo_questions`, not its position in the slice
# (a counter restarting at 0 would label question 5 as "0").
FIRST_QUESTION = 5
for question_no, question in enumerate(
    thermo_questions[FIRST_QUESTION:], start=FIRST_QUESTION
):
    # Each question is passed straight through as the chain's "input".
    response = retrieval_chain.invoke({"input": question})
    print(question_no)
    print(response)
    print("Answer: ", response['answer'])
    print("\n")

0
{'input': 'At room temperature, the typical barrier potential for silicon p-n junction in Volt (V) is  (A) 0.7 * $10^(-23)$ (B) 0.07 (C) 0.70 (D) 7.0', 'context': [Document(page_content='Lundstroem, I.; Shiv araman, S.; Svensson, C. & Lu ndkvist, L. (1975). A hydrogen −sensitive \nMOS field −effect transistor. Applied Physics Letters , 26, 55-57. \nMa, G.P.; Yang, D.Z. & Nie, J. (2009). Preparat ion of porous ultrafine polyacrylonitrile', metadata={'source': 'thermo_md\\Thermo_Materials_Science_and_Technology.md', 'start_index': 510889}), Document(page_content='200 0.02221 770.3 848.0 2.433 0.01902 765.3 841.4 2.397# #Tables in SI Units 753\nTable A-19TABLE A-19 Properties of Selected Solids and Liquids: cp,/H9267, and /H9260\nSpecific Density, Thermal\nHeat, cp /H9267 Conductivity, /H9260\nSubstance (kJ/kg K) (kg/m3) (W/m K)\nSelected Solids, 300K', metadata={'source': 'thermo_md\\Fundamentals-of-Engineering-Thermodynamics-by-Michael-J.Moran-Howard-N.-Shapiro.md', 'start_index': 515

### Results based on thermo_md\Fundamentals-of-Engineering-Thermodynamics-by-Michael-J.Moran-Howard-N.-Shapiro.md only

In [20]:
# Run every thermodynamics question through the retrieval chain and print the
# raw response followed by the answer text.
for question in thermo_questions:
    # Each question is passed straight through as the chain's "input".
    response = retrieval_chain.invoke({"input": question})
    print(response)
    print(response['answer'])
    # Blank separator between questions. The escape needs a backslash:
    # "/n" would print the two literal characters instead of a newline.
    print("\n")

{'input': 'Which of the following techniques is NOT used to grow single crystals of semiconductors?  (A) Calendering (B) Czochralski (C) Float zone (D) Bridgman', 'context': [Document(page_content='C. The four processes of the\ncycle are\nProcess 1–2: The gas is compressed adiabatically to state 2, where the temperature is TH.\nProcess 2–3: The assembly is placed in contact with the reservoir at TH. The gas expands\nisothermally while receiving energy QHfrom the hot reservoir by heat transfer.', metadata={'source': 'thermo_md\\Fundamentals-of-Engineering-Thermodynamics-by-Michael-J.Moran-Howard-N.-Shapiro.md', 'start_index': 113615}), Document(page_content='3. Determine\n(a)the maximum and minimum temperatures for the cycle, inK.\n(b)the pressure and volume at the beginning of the isother-mal expansion in bar and m\n3, respectively.\n(c)the work and heat transfer for each of the four processes,in kJ.\n(d)Sketch the cycle on p–vcoordinates.', metadata={'source': 'thermo_md\\Fundamentals

KeyboardInterrupt: 