<a href="https://colab.research.google.com/github/kanishk1906/Ml-projects/blob/main/PDFQ%26ABOT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
!pip install PyPDF2 langchain faiss-cpu transformers torch sentence-transformers
!pip install -U langchain-community




In [46]:
import os
import PyPDF2
from google.colab import files
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline

In [47]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        A single string with the extracted text of all pages in page order,
        joined without a separator. Pages that yield no text contribute an
        empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps a page with no extractable text (empty or None result)
        # from breaking the concatenation; join avoids repeated string copies.
        return "".join(page.extract_text() or "" for page in reader.pages)

def process_pdfs(pdf_files):
    """Extract the text of each PDF path in *pdf_files*.

    Args:
        pdf_files: Iterable of PDF file paths.

    Returns:
        A list with one extracted-text string per input path, in input order.
    """
    return [extract_text_from_pdf(path) for path in pdf_files]



In [48]:
def split_texts(texts):
    """Split every string in *texts* into overlapping chunks.

    Uses a RecursiveCharacterTextSplitter configured with chunk_size=1000 and
    chunk_overlap=200.

    Args:
        texts: Iterable of strings to split.

    Returns:
        A flat list containing the chunks of all texts, in input order.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return [piece for document in texts for piece in splitter.split_text(document)]



In [49]:
def create_vector_store(chunks):
    """Build a FAISS vector store from text *chunks*.

    The chunks are embedded with the
    "sentence-transformers/all-MiniLM-L6-v2" HuggingFace model.

    Args:
        chunks: List of text chunks to index.

    Returns:
        The FAISS vector store created from the chunks.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_texts(chunks, embedding_model)

In [50]:
def generate_questions(vector_store, num_questions=5):
    """Generate questions from chunks held in *vector_store*.

    A pool of candidate chunks is fetched once and a chunk is drawn at random
    for each question, so the questions are not all derived from the same
    chunk.

    Args:
        vector_store: Object exposing ``similarity_search(query, k=...)`` whose
            results have a ``page_content`` attribute.
        num_questions: Number of questions to generate.

    Returns:
        A list of generated question strings. The list is empty when
        ``num_questions`` is not positive or the store returns no chunks.
    """
    import random  # local import: `random` is not imported by this cell

    if num_questions <= 0:
        # Nothing to generate; skip loading the model altogether.
        return []

    # The empty query is only a way to pull a pool of candidate chunks from
    # the store; asking for k=1 on every iteration would always return the
    # same chunk.
    candidates = vector_store.similarity_search("", k=num_questions * 2)
    if not candidates:
        return []

    question_generator = pipeline("text2text-generation", model="google/flan-t5-base")

    if len(candidates) >= num_questions:
        # Enough chunks: use a distinct one for every question.
        selected = random.sample(candidates, num_questions)
    else:
        # Fewer chunks than questions: reuse chunks to reach the count.
        selected = random.choices(candidates, k=num_questions)

    questions = []
    for document in selected:
        prompt = f"Generate a question based on this text: {document.page_content}"
        question = question_generator(prompt, max_length=50, num_return_sequences=1)[0]['generated_text']
        questions.append(question)

    return questions


In [51]:
import os
import PyPDF2
from google.colab import files
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Returns:
        A single string with the extracted text of all pages in page order,
        joined without a separator. Pages that yield no text contribute an
        empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps a page with no extractable text (empty or None result)
        # from breaking the concatenation; join avoids repeated string copies.
        return "".join(page.extract_text() or "" for page in reader.pages)

def process_pdfs():
    """Interactively upload PDFs in Colab, extract their text and index it.

    The user is prompted to upload files through ``files.upload()`` until they
    type 'done' or ``max_pdfs`` files have been collected. Each accepted file
    is read with ``extract_text_from_pdf``, split into overlapping chunks, and
    all chunks are indexed in a single FAISS vector store.

    Returns:
        A ``(pdf_contents, vector_store)`` tuple, where ``pdf_contents`` maps
        each uploaded file name to its extracted text. ``(None, None)`` is
        returned when no PDF was uploaded or when no text could be extracted.
    """
    all_pdf_files = []
    max_pdfs = 10
    pdf_contents = {}
    all_chunks = []

    print(f"You can upload up to {max_pdfs} PDF files.")
    print(f"Please upload your PDF files one by one. Type 'done' when finished or when you've uploaded {max_pdfs} files.")

    first_attempt = True
    while len(all_pdf_files) < max_pdfs:
        # Offer the exit prompt on every round except the first, so that a
        # cancelled upload can still be followed by 'done' even when nothing
        # has been uploaded yet.
        if not first_attempt:
            print(f"\nCurrently uploaded {len(all_pdf_files)} file(s). You can upload {max_pdfs - len(all_pdf_files)} more.")
            user_input = input("Press Enter to upload another file, or type 'done' to proceed: ")
            if user_input.strip().lower() == 'done':
                break
        first_attempt = False

        print("\nPlease upload a PDF file.")
        uploaded = files.upload()

        if not uploaded:
            print("No file was uploaded. Please try again or type 'done' to proceed.")
            continue

        new_files = list(uploaded)

        # Only keep files that look like PDFs; anything else would fail later
        # during text extraction.
        skipped = [name for name in new_files if not name.lower().endswith('.pdf')]
        if skipped:
            print(f"Skipping non-PDF file(s): {', '.join(skipped)}")
        accepted = [name for name in new_files if name.lower().endswith('.pdf')]

        # A single upload can contain several files, so enforce the cap here
        # rather than relying on the loop condition alone.
        remaining = max_pdfs - len(all_pdf_files)
        overflow = accepted[remaining:]
        accepted = accepted[:remaining]
        if overflow:
            print(f"Upload limit of {max_pdfs} reached. Ignoring: {', '.join(overflow)}")

        if accepted:
            all_pdf_files.extend(accepted)
            print(f"Successfully uploaded: {', '.join(accepted)}")

    if not all_pdf_files:
        print("No PDF files were uploaded. Exiting.")
        return None, None

    print(f"\nProcessing {len(all_pdf_files)} PDF file(s)...")

    # Same configuration for every file, so build the splitter once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    for pdf_file in all_pdf_files:
        print(f"Processing {pdf_file}...")
        text = extract_text_from_pdf(pdf_file)
        pdf_contents[pdf_file] = text

        # Split text into chunks
        all_chunks.extend(text_splitter.split_text(text))

    if not all_chunks:
        # E.g. image-only PDFs: there is nothing to embed, and building an
        # index from zero texts would fail.
        print("No extractable text was found in the uploaded PDF(s). Exiting.")
        return None, None

    # Create vector store
    print("Creating vector store...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(all_chunks, embeddings)

    return pdf_contents, vector_store

# Upload and index the PDFs, then report the outcome.
pdf_contents, vector_store = process_pdfs()
if not (pdf_contents and vector_store):
    # Either nothing was uploaded or processing produced no usable result.
    print("No PDFs were processed.")
else:
    print(f"Processed {len(pdf_contents)} PDF(s). Vector store created.")

You can upload up to 10 PDF files.
Please upload your PDF files one by one. Type 'done' when finished or when you've uploaded 10 files.

Please upload a PDF file.


Saving jesc110.pdf to jesc110 (5).pdf
Successfully uploaded: jesc110 (5).pdf

Currently uploaded 1 file(s). You can upload 9 more.
Press Enter to upload another file, or type 'done' to proceed: 

Please upload a PDF file.


Saving jesc101.pdf to jesc101 (3).pdf
Successfully uploaded: jesc101 (3).pdf

Currently uploaded 2 file(s). You can upload 8 more.
Press Enter to upload another file, or type 'done' to proceed: 

Please upload a PDF file.


Saving jesc108.pdf to jesc108 (1).pdf
Successfully uploaded: jesc108 (1).pdf

Currently uploaded 3 file(s). You can upload 7 more.
Press Enter to upload another file, or type 'done' to proceed: 

Please upload a PDF file.


Saving jesc103.pdf to jesc103 (1).pdf
Successfully uploaded: jesc103 (1).pdf

Currently uploaded 4 file(s). You can upload 6 more.
Press Enter to upload another file, or type 'done' to proceed: 

Please upload a PDF file.


Saving jesc102.pdf to jesc102 (1).pdf
Successfully uploaded: jesc102 (1).pdf

Currently uploaded 5 file(s). You can upload 5 more.
Press Enter to upload another file, or type 'done' to proceed: done

Processing 5 PDF file(s)...
Processing jesc110 (5).pdf...
Processing jesc101 (3).pdf...
Processing jesc108 (1).pdf...
Processing jesc103 (1).pdf...
Processing jesc102 (1).pdf...
Creating vector store...
Processed 5 PDF(s). Vector store created.


In [52]:
from transformers import pipeline
import random

def _clean_question(raw_text):
    """Normalise generated text into a question; return '' if it is blank."""
    question = raw_text.strip()
    if not question:
        return ""
    # Upper-case only the first character: str.capitalize() would also
    # lower-case the rest and mangle acronyms and proper nouns.
    question = question[0].upper() + question[1:]
    if not question.endswith('?'):
        question += '?'
    return question


def generate_complex_questions(pdf_contents, vector_store, questions_per_pdf=5):
    """Generate up to *questions_per_pdf* distinct questions for each PDF.

    For every PDF, chunks are retrieved from *vector_store* using the PDF text
    as the query. A chunk and a prompt wording are then picked at random for
    each attempt until enough distinct questions exist or the attempt budget
    (three times ``questions_per_pdf``) is used up.

    Args:
        pdf_contents: Mapping of PDF name to its extracted text. May be
            ``None`` or empty, e.g. when PDF processing produced nothing.
        vector_store: Object exposing ``similarity_search(query, k=...)`` whose
            results have a ``page_content`` attribute.
        questions_per_pdf: Maximum number of questions per PDF.

    Returns:
        A dict mapping each PDF name to its list of questions, in the order
        they were generated. An empty dict is returned when there is nothing
        to process.
    """
    if not pdf_contents or vector_store is None:
        return {}
    if questions_per_pdf <= 0:
        # Nothing to generate; skip loading the model altogether.
        return {pdf_name: [] for pdf_name in pdf_contents}

    question_generator = pipeline("text2text-generation", model="google/flan-t5-base")

    # Alternative instruction wordings; the sampled chunk is appended to one.
    prompt_prefixes = (
        "Generate a complex analytical question based on this text: ",
        "Create a thought-provoking question that requires deep understanding of this text: ",
        "Formulate a question that challenges the reader's comprehension of this text: ",
        "Devise a question that requires synthesizing information from this text: ",
        "Construct a question that encourages critical thinking about this text: ",
    )

    all_questions = {}

    for pdf_name, content in pdf_contents.items():
        print(f"\nGenerating questions for {pdf_name}:")

        # Get multiple chunks from the PDF content.
        # NOTE(review): the search runs over the whole store, so the returned
        # chunks are not guaranteed to come from this particular PDF.
        chunks = vector_store.similarity_search(content, k=questions_per_pdf * 2)
        if not chunks:
            # random.choice would raise IndexError on an empty result.
            all_questions[pdf_name] = []
            continue

        # A dict de-duplicates like a set but keeps generation order, so the
        # returned list matches what was printed.
        pdf_questions = {}
        max_attempts = questions_per_pdf * 3
        attempts = 0
        while len(pdf_questions) < questions_per_pdf and attempts < max_attempts:
            attempts += 1

            # Randomly select a chunk and a prompt wording
            sample_chunk = random.choice(chunks).page_content
            prompt = random.choice(prompt_prefixes) + sample_chunk
            generated = question_generator(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']

            question = _clean_question(generated)

            # Keep the question only if it is non-empty and new
            if question and question not in pdf_questions:
                pdf_questions[question] = None
                print(f"- {question}")

        all_questions[pdf_name] = list(pdf_questions)

    return all_questions

# Generate questions for every processed PDF and keep them in
# `generated_questions` (the function's return value).
# Make sure you've run the PDF processing cell first: it defines the
# `pdf_contents` and `vector_store` used here.
generated_questions = generate_complex_questions(pdf_contents, vector_store)


Generating questions for jesc110 (5).pdf:
- What is the object's shape?
- What is the opposite of obliquely from one medium to another?
- What is the wave theory of light?
- What is the relationship between light and air?
- What is the difference between light and air?

Generating questions for jesc101 (3).pdf:
- What is the activity 1.1?
- What is the chemical property of metals?
- What is the best way to test the chemical properties of metals?
- What is the law of conservation of mass?
- What is the purpose of this chapter?

Generating questions for jesc108 (1).pdf:
- What is the most likely outcome of a cell's death?
- What is the process of preparing yeast for a re-production experiment?
- What is the most important factor in the reproduction of organisms?
- Reproduction is not necessary to maintain the life of an individual organism?
- What is the most obvious reason that organisms reproduce?

Generating questions for jesc103 (1).pdf:
- What is the difference between a metal and 