In [11]:
!pip install transformers sentencepiece




In [13]:
from transformers import pipeline

# Load BART summarizer
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")


Device set to use cpu


In [13]:
from google.colab import files
import fitz  # PyMuPDF

uploaded = files.upload()
file_name = list(uploaded.keys())[0]

def extract_text_from_pdf(file_path):
    text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

pdf_text = extract_text_from_pdf(file_name)
print("✅ PDF Loaded and Extracted")
print(pdf_text[:1000])


Saving NLP lecture 22-23.pdf to NLP lecture 22-23 (1).pdf
✅ PDF Loaded and Extracted
NLP Lecture - 22,23
Information Extraction (IE):
1. A subfield of Natural Language Processing (NLP).
2. Focuses on automatically extracting structured information from unstructured text.
Main goal: Identify and categorize key pieces of data, making it easier to analyze and use.
Relation Extraction (RE):
A core task in Information Extraction.
Goal: Identify semantic relationships between entities in a text.
💡 Basically, if IE is about finding the characters in a story, RE is figuring out how they’re
connected.
🔍 Example:
Sentence: "Elon Musk is the CEO of Tesla."
Entities Extracted:
5/27/25, 8:49 AM
Online Notepad
https://onlinenotepad.org/notepad
1/7
Elon Musk → Person
Tesla → Organization
Relation Extracted: Elon Musk is the CEO of Tesla → This identifies a "leadership" or
"employment" relation.
🧠 Types of Relations (in Relation Extraction – NLP Style)
1. Binary Relations – 🎯 Two entities, one link
Ex

In [4]:
# ✅ Define chunk_text() before using it
def chunk_text(text, max_chunk_size=900):
    sentences = text.split(". ")
    chunks, chunk = [], ""

    for sentence in sentences:
        if len(chunk) + len(sentence) <= max_chunk_size:
            chunk += sentence + ". "
        else:
            chunks.append(chunk.strip())
            chunk = sentence + ". "
    chunks.append(chunk.strip())
    return chunks


In [15]:
!pip install -U sentence-transformers faiss-cpu


Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from

In [6]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load lightweight embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')


In [7]:
def chunk_text(text, max_chunk_size=900):
    sentences = text.split(". ")
    chunks, chunk = [], ""

    for sentence in sentences:
        if len(chunk) + len(sentence) <= max_chunk_size:
            chunk += sentence + ". "
        else:
            chunks.append(chunk.strip())
            chunk = sentence + ". "
    chunks.append(chunk.strip())
    return chunks


In [9]:
from google.colab import files
import fitz  # PyMuPDF

# Upload your PDF
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

# Extract the text
def extract_text_from_pdf(file_path):
    text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

pdf_text = extract_text_from_pdf(file_name)
print("✅ PDF Loaded and Text Extracted")
print(pdf_text[:500])  # Optional preview


Saving NLP lecture 22-23.pdf to NLP lecture 22-23 (2).pdf
✅ PDF Loaded and Text Extracted
NLP Lecture - 22,23
Information Extraction (IE):
1. A subfield of Natural Language Processing (NLP).
2. Focuses on automatically extracting structured information from unstructured text.
Main goal: Identify and categorize key pieces of data, making it easier to analyze and use.
Relation Extraction (RE):
A core task in Information Extraction.
Goal: Identify semantic relationships between entities in a text.
💡 Basically, if IE is about finding the characters in a story, RE is figuring out how they


In [10]:
# Split the full text into manageable chunks
doc_chunks = chunk_text(pdf_text, max_chunk_size=600)

# Embed each chunk using the transformer model
doc_embeddings = embedder.encode(doc_chunks)

# Create FAISS index for fast similarity search
dimension = doc_embeddings[0].shape[0]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(doc_embeddings))


In [11]:
def ask_question(question):
    # Convert question to embedding
    q_embedding = embedder.encode([question])

    # Use FAISS to find closest matching chunk
    top_k = 1
    _, result_indices = index.search(np.array(q_embedding), top_k)

    # Extract the best matching chunk
    best_chunk = doc_chunks[result_indices[0][0]]

    # Summarize the answer based on the chunk
    prompt = f"Answer the question based on the context below:\n\n{best_chunk}\n\nQ: {question}"
    answer = summarizer(prompt, max_length=100, min_length=30, do_sample=False)[0]['summary_text']

    return answer


In [14]:
response = ask_question("What is the main goal of the internship?")
print("💬", response)


💬 Use the question based on the context below to answer the question. The question is about meaning, not about the content of the answer.


In [18]:
import random
import pandas as pd

# Dummy chunks – replace with real chunking logic
chunks = [
    "Artificial Intelligence aims to simulate human intelligence in machines to perform tasks such as learning, reasoning, and problem-solving.",
    "Supervised learning uses labeled data to train models, enabling them to predict outputs based on input data.",
    "Natural Language Processing enables computers to understand, interpret, and generate human language."
]

# Fake summarizer
def summarizer(text):
    return f"Summary: {text[:120]}..."

# Manual MCQ generation logic
def generate_mcq_from_summary(summary):
    if "labeled data" in summary.lower():
        return {
            "Question": "What does supervised learning use to train models?",
            "Options": ["Unlabeled data", "Raw files", "Labeled data", "No input data"],
            "Answer": "C"
        }
    elif "natural language processing" in summary.lower():
        return {
            "Question": "What does NLP help computers do?",
            "Options": ["Generate machine code", "Understand human language", "Compile code", "Read barcodes"],
            "Answer": "B"
        }
    elif "artificial intelligence" in summary.lower():
        return {
            "Question": "What is the goal of Artificial Intelligence?",
            "Options": ["Simulate human intelligence", "Create hardware", "Automate UI testing", "Scan documents"],
            "Answer": "A"
        }
    else:
        return {
            "Question": "What is this chunk about?",
            "Options": ["AI", "ML", "NLP", "None"],
            "Answer": "A"
        }

# 💾 History log
history_log = []

# Main function with logging
def challenge_me_mode():
    selected_chunks = random.sample(chunks, k=2)
    score = 0

    for i, chunk in enumerate(selected_chunks):
        print(f"\n📄 Chunk {i+1} Preview:", chunk[:100], "...")
        summary = summarizer(chunk)
        mcq = generate_mcq_from_summary(summary)

        print("\n🧠", mcq["Question"])
        options = mcq["Options"]
        print("A.", options[0])
        print("B.", options[1])
        print("C.", options[2])
        print("D.", options[3])
        user_ans = input("🎯 Your Answer (A/B/C/D): ").strip().upper()
        is_correct = (user_ans == mcq["Answer"])

        # Score update
        if is_correct:
            print("✅ Correct!")
            score += 1
        else:
            print(f"❌ Nope! Correct answer: {mcq['Answer']}")

        # Log entry
        history_log.append({
            "Question": mcq["Question"],
            "Option A": options[0],
            "Option B": options[1],
            "Option C": options[2],
            "Option D": options[3],
            "User Answer": user_ans,
            "Correct Answer": mcq["Answer"],
            "Result": "Correct" if is_correct else "Incorrect"
        })

    # 🧾 Show Summary
    print("\n🧾 Quiz Complete!")
    print(f"🎯 Your Score: {score}/{len(selected_chunks)}")

    # 🧠 Export to DataFrame
    df = pd.DataFrame(history_log)
    print("\n📊 Answer History Log:\n", df)

    # 💾 Save to CSV (Optional)
    df.to_csv("quiz_history.csv", index=False)
    print("📝 Saved quiz history to 'quiz_history.csv'")

# Run it
challenge_me_mode()



📄 Chunk 1 Preview: Natural Language Processing enables computers to understand, interpret, and generate human language. ...

🧠 What does NLP help computers do?
A. Generate machine code
B. Understand human language
C. Compile code
D. Read barcodes
🎯 Your Answer (A/B/C/D): B
✅ Correct!

📄 Chunk 2 Preview: Artificial Intelligence aims to simulate human intelligence in machines to perform tasks such as lea ...

🧠 What is the goal of Artificial Intelligence?
A. Simulate human intelligence
B. Create hardware
C. Automate UI testing
D. Scan documents
🎯 Your Answer (A/B/C/D): A
✅ Correct!

🧾 Quiz Complete!
🎯 Your Score: 2/2

📊 Answer History Log:
                                        Question                     Option A  \
0              What does NLP help computers do?        Generate machine code   
1  What is the goal of Artificial Intelligence?  Simulate human intelligence   

                    Option B             Option C        Option D User Answer  \
0  Understand human language  