
### Importing the necessary library
First, we import the `PdfReader` and `PdfWriter` classes from the `pypdf` library.
```python

In [291]:
from pypdf import PdfReader,PdfWriter

In [292]:
f =  open('../Dr.X Files/The-Alchemist.pdf', 'rb') # opening the pdf as binery

In [293]:
pdf_reader = PdfReader(f)

In [294]:
num_page = len(pdf_reader.pages)
print(num_page)

136


In [295]:
page_one = pdf_reader.pages[3]
print(page_one)

{'/Type': '/Page', '/Annots': [IndirectObject(70, 0, 2234061945168), IndirectObject(71, 0, 2234061945168)], '/Contents': IndirectObject(72, 0, 2234061945168), '/MediaBox': [0, 0, 612, 792], '/Parent': IndirectObject(12, 0, 2234061945168), '/Resources': {'/ExtGState': {'/G3': IndirectObject(67, 0, 2234061945168)}, '/Font': {'/F4': IndirectObject(68, 0, 2234061945168), '/F5': IndirectObject(840, 0, 2234061945168)}, '/ProcSet': ['/PDF', '/Text', '/ImageB', '/ImageC', '/ImageI']}, '/StructParents': 0}


In [296]:
content  = page_one.extract_text()
content

'International Acclaim for Paulo Coelho’s\nTHE ALCHEMIST\n“The story has the comic charm, dramatic tension, and psychological\nintensity of a fairy tale, but it’s full of specific wisdom as well. . . . A\nsweetly exotic tale for young and old alike.”\n—Publishers Weekly\n“Beneath this novel’s compelling story and the shimmering elegance\nwith which it’s told lies a bedrock of wisdom about following one’s\nheart.”\n—Booklist\n“As memorable and meaningful as Saint-Exupéry’s The Little\nPrince.”\n—Austin American-Statesman\n“A touching, inspiring fable.”\n—Indianapolis Star\n“A little poke in the ribs from on high.”\n—Detroit Free Press\n“The Alchemist is a fabulous success.”\n—Der Spiegel (Germany)'

In [297]:
f.close()

## Extracting Odd Pages from a PDF using pypdf

This notebook demonstrates how to use the `pypdf` library in Python to extract the odd-numbered pages from a PDF file and save them as a new PDF.



In [298]:
odd_pages = []
try:
    pdf_writer = PdfWriter()
    with open ('../Dr.X Files/The-Alchemist.pdf','rb') as f:
        pdf_reader = PdfReader(f)
        
        for i in range(0, len(pdf_reader.pages)):
            if i%2 != 0:
                odd_pages.append(pdf_reader.pages[i])
        for page in odd_pages:
                pdf_writer.add_page(page)
except Exception as e:
    print(f"An error occurred: {e}")

In [299]:
# try:
#     with open('../Dr.X Files/odd_pages.pdf', 'wb') as odd_pages_output:
#         pdf_writer.write(odd_pages_output)
# except Exception as e:
#     print(f"An error occurred: {e}")

### Start of Extracting text

In [300]:
import docx
import pandas as pd

In [301]:
def text_extract_docx(fp):
    try:
        doc = docx.Document(fp)
        content = []
        for paragraph in doc.paragraphs:
            content.append(paragraph.text)
        return "\n".join(content)
    except Exception as e:
        print(f"An error occurred: {e}")
        
def text_extract_pdf(fp):
    try:
        content = []
        with open(fp,'rb') as f:
            pdf_reader = PdfReader(f)
            for page in pdf_reader.pages:
                content.append(page.extract_text())
        return "\n".join(content)
    except Exception as e:
        print(f"An error occurred: {e}")
def text_extract_csv(fp):
    try:
        df = pd.read_csv(fp)
        return df.to_string(index=False)
    except Exception as e:
        print(f'An error occurred: {e}')
def text_extract_excel(fp):
    try:
        df = pd.read_excel(fp)
        return df.to_string(index=False)
    except Exception as e:
        print(f'An error occurred: {e}')

In [302]:
def output_as_text(txt):
    with open('output_text.txt','w') as f:
        for t in txt:
            f.write(t)
            f.write('\n'+ '='*20+ '\n') 


In [303]:
# docx_text = text_extract_docx("../Dr.X Files/Dataset summaries and citations.docx")
# pdf_text = text_extract_pdf("../Dr.X Files/odd_pages.pdf")
# excel_text = text_extract_excel("../Dr.X Files/Loan amortisation schedule1.xlsx")


In [304]:
import os
files = os.listdir("../Dr.X Files")
full_text  = []
# count = 0
for file in files:
    # print(file)
    file_path = os.path.join("../Dr.X Files", file)
    if file.endswith('.docx'):        
        # count+=1
        text = text_extract_docx(file_path)
        full_text.append(text)
    elif file.endswith('.pdf'):      
        # count+=1
        text = text_extract_pdf(file_path)
        full_text.append(text)
    elif file.endswith('.csv'):        
        # count+=1
        text = text_extract_csv(file_path)
        full_text.append(text)
    elif file.endswith('.xlsx' or 'xls' or 'xlsm'):        
        # count+=1
        text = text_extract_excel(file_path)
        full_text.append(text)
print(len(full_text))


  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn(msg)
  warn(f"Print area cannot be set to Defined name: {defn.value}.")


10


In [305]:
import re

def all_table(txt):
    li = txt.splitlines()
    table = []

    for l in li:
        l = l.strip()
        if not l:
            continue
        cols = re.split(r'\s{2,}|\t+',l)
        s_col = []
        for col in cols:
            if col.strip():
                s_col.append(col)
        if len(s_col) >=2:
            table.append(l)
    return table




In [306]:

extract_table = all_table("\n".join(full_text))
# output_as_text(extract_table)



Tokenizing

In [307]:

import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")
def chunk_extraction(fp, max_tokens=500):
    chunks = []

    def chunk(text, src, meta={}):
        tokens = tokenizer.encode(text)
        for i in range(0, len(tokens), max_tokens):
            data = {
                "file": os.path.basename(fp),
                "source": src,
                **meta,
                "chunk": i // max_tokens,
                "text": tokenizer.decode(tokens[i:i + max_tokens])
            }
            chunks.append(data)

    if fp.endswith('.pdf'):
        for page_num, page in enumerate(PdfReader(fp).pages):
            text = page.extract_text()
            if text:
                chunk(text, "pdf", {"page": page_num + 1})
    elif fp.endswith('.docx'):
        text = text_extract_docx(fp)
        chunk(text, "docx")
    elif fp.endswith('.csv'):
        df = pd.read_csv(fp)
        
        for ind, r in df.iterrows():
            text = ""
            for i, c in enumerate(r):
                text += str(c)
                if i < len(r) - 1:
                    text += " | "
            chunk(text, "csv", {"r": ind})
    else:
        df = pd.read_excel(fp, sheet_name=None)
        
        df = pd.read_excel(fp, sheet_name=None, engine='openpyxl')  
        for sheet_name, sheet_df in df.items():
            for ind, r in sheet_df.iterrows():
                text = ""
                for i, c in enumerate(r):
                    text += str(c)
                    if i < len(r) - 1:
                        text += " | "
                    chunk(text, "excel", {"sheet": sheet_name, "row": ind})

    return chunks



In [308]:

file_folder = "../Dr.X Files"
all_chunks = []

for file in os.listdir(file_folder):
    fp = os.path.join(file_folder, file)
    file_chunks = chunk_extraction(fp)
    all_chunks.extend(file_chunks)

print(f"Total chunks: {len(all_chunks)}")



  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn(msg)
  warn("""Cannot parse header or footer so it will be ignored""")
  warn(msg)
  warn(f"Print area cannot be set to Defined name: {defn.value}.")
  warn(f"Print area cannot be set to Defined name: {defn.value}.")


Total chunks: 2644


In [309]:
import json
c_s = []
for chunk in all_chunks:
        c_s.append(json.dumps(chunk))
        "\n".join(c_s)
output_as_text(c_s)

### Database

In [310]:
from langchain_nomic import NomicEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

In [None]:

os.environ["NOMIC_API_KEY"] = "Generate Nomic Key and Put Here" 

In [312]:
txt = []
meta = []
for chunk in all_chunks:
    text = chunk["text"]
    data = {}
    for k in chunk:
        if k != "text":
            data[k] = chunk[k]
    txt.append(text)
    meta.append(data)


In [313]:
nomic = NomicEmbeddings(model="nomic-embed-text-v1.5")
embeddings = nomic.embed_documents(txt)

Exception: (400, '{"detail":"You have exceeded your 10000000 free tokens of Nomic Embedding API usage. Enter a payment method at https://atlas.nomic.ai to continue with usage-based billing."}')

In [None]:
docs = []
for i in range(len(txt)):
    doc = Document(page_content=txt[i], metadata=meta[i])
    docs.append(doc)


In [None]:
import faiss
db = FAISS.from_documents(docs, nomic)
db.save_local("vector_db")

Exception: (400, '{"detail":"You have exceeded your 10000000 free tokens of Nomic Embedding API usage. Enter a payment method at https://atlas.nomic.ai to continue with usage-based billing."}')

### RAG

In [None]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [None]:
from langchain.llms import LlamaCpp

llm = LlamaCpp(
    model_path="../models/llama-2-7b.Q2_K.gguf",
    temperature=0.7,
    max_tokens=300,
    top_p=1,
    n_ctx=2048,
    n_batch=64,
    verbose=True
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../models/llama-2-7b.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32             

In [None]:
from langchain.chains import RetrievalQA
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    You are a helpful assistant for answering questions from scientific publications.
    Use the following context to answer the question. If the answer is not in the context, say you don't know.

    Context:
    {context}

    Question:
    {question}

    Answer:
    """ 
)


combine_docs_chain = load_qa_with_sources_chain(
    llm,
    prompt,
    document_variable_name="context" 
)

qa_chain = RetrievalQA(
    retriever=retriever,
    combine_documents_chain=combine_docs_chain,
    return_source_documents=True
)


query = "Type Your question here"
result = qa_chain.invoke({"question": query})

print("\n🧠 Answer:\n", result["result"])
print("\n📚 Source Documents:")
for doc in result["source_documents"]:
    print(f"- File: {doc.metadata.get('file', 'Unknown')} | Chunk: {doc.metadata.get('chunk')}")

ValueError: Missing some input keys: {'query'}