**Required Library Installation**

In [None]:
!pip install python-docx sentence-transformers faiss-cpu transformers --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

**Document Loading and Paragraph Extraction**

In [None]:
from docx import Document

# Open the course handout and keep the stripped text of every non-blank paragraph
doc = Document('ML_course_content_1.docx')
paras = list(filter(None, (p.text.strip() for p in doc.paragraphs)))

# Merge the paragraphs into a single newline-separated string,
# wrapped in a one-element list of documents
handout_text = "\n".join(paras)
documents = [handout_text]


**Hybrid Chunking**

In [None]:
import nltk
# Download the 'punkt' and 'punkt_tab' resources used for sentence tokenization
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)
from nltk.tokenize import sent_tokenize

def hybrid_chunking(paras, min_words=40, max_words=200):
    """Group paragraphs into word-bounded chunks while preserving document order.

    Paragraphs with at most ``max_words`` words are accumulated until the
    running total reaches ``min_words`` and are then emitted as one chunk.
    Paragraphs with more than ``max_words`` words are split into sentences
    with ``sent_tokenize`` and the sentences are regrouped the same way.

    Short-paragraph words that are still pending when a long paragraph
    arrives (typically a heading) are placed at the start of that
    paragraph's first chunk. Without this, they would only be emitted after
    all of the long paragraph's chunks, i.e. out of document order and glued
    to unrelated later text.

    Note that ``max_words`` only decides which paragraphs are split by
    sentence; it is not an upper bound on the size of an emitted chunk.

    Args:
        paras: Iterable of paragraph strings; blank entries are skipped.
        min_words: Word count at which an accumulating chunk is emitted.
        max_words: Paragraphs with more words than this are split by sentence.

    Returns:
        A list of chunk strings whose words are joined by single spaces.
        The last chunk of a long paragraph and the final chunk of the
        document may contain fewer than ``min_words`` words.
    """
    chunks = []
    pending = []  # words from short paragraphs that have not been emitted yet

    for para in paras:
        para = para.strip()
        if not para:
            continue
        para_words = para.split()

        # Case 1: paragraph is too long, regroup it sentence by sentence
        if len(para_words) > max_words:
            # Start from the pending short-paragraph words so they stay in
            # front of the text that follows them in the document.
            group = pending
            pending = []
            for sent in sent_tokenize(para):
                group.extend(sent.split())
                if len(group) >= min_words:
                    chunks.append(' '.join(group))
                    group = []
            # Whatever is left of the paragraph becomes its own (short) chunk
            if group:
                chunks.append(' '.join(group))
            continue

        # Case 2: paragraph is short, merge it into the pending buffer
        pending.extend(para_words)
        if len(pending) >= min_words:
            chunks.append(' '.join(pending))
            pending = []

    # Flush whatever short-paragraph text is still pending at the end
    if pending:
        chunks.append(' '.join(pending))
    return chunks

# Usage: read the handout's non-blank paragraphs and run them through the hybrid chunker
from docx import Document

doc = Document('ML_course_content_1.docx')
paras = list(filter(None, (p.text.strip() for p in doc.paragraphs)))
chunk_texts = hybrid_chunking(paras, min_words=40, max_words=120)

# Report how many chunks were produced, then preview the first five with their word counts
print(f"Created {len(chunk_texts)} hybrid chunks.")
for i, ch in enumerate(chunk_texts[:5]):
    print(f"\nChunk {i+1} ({len(ch.split())} words):\n{ch}\n{'='*40}")

Created 143 hybrid chunks.

Chunk 1 (68 words):
1.1 Introduction to ML Machine Learning (ML) is a subfield of artificial intelligence that focuses on designing algorithms capable of learning from data and improving over time without being explicitly programmed for every possible task. Unlike traditional software, where rules are crafted by human programmers, ML systems identify and extract patterns from large volumes of data, allowing them to make predictions, detect anomalies, and even generate new content.

Chunk 2 (49 words):
<br> The applications of ML are vast: self-driving cars, language translation, recommendation engines, medical image analysis, fraud detection, and more. ML approaches include supervised learning (with labeled data), unsupervised learning (discovering structure in unlabeled data), semi-supervised learning (combining both), and reinforcement learning (learning via trial and error and rewards).

Chunk 3 (31 words):
<br> The success of modern ML stems from increas

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


**Compute Embeddings for Chunks and Build the FAISS Index**

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Encode every chunk with the all-MiniLM-L6-v2 sentence-transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(chunk_texts, show_progress_bar=True)

# Cast to float32 and L2-normalise in place, so that the inner-product
# index below scores unit vectors (i.e. cosine similarity)
embeddings = np.array(embeddings, dtype='float32')
faiss.normalize_L2(embeddings)

index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Peek at the embeddings: overall shape, start of the first vector, first three rows
print("Embedding shape:", embeddings.shape)
print("First chunk embedding (first 10 dims):", embeddings[0][:10])
print("First 3 embeddings:\n", embeddings[:3])

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Embedding shape: (143, 384)
First chunk embedding (first 10 dims): [-0.04891609 -0.03908867  0.04322965  0.00759483  0.02771054 -0.0615214
 -0.00089626 -0.06041696 -0.05594254 -0.01826064]
First 3 embeddings:
 [[-0.04891609 -0.03908867  0.04322965 ...  0.09220957  0.04523145
  -0.05161026]
 [-0.0522799  -0.04477102  0.013062   ...  0.00473073  0.01609313
  -0.04416708]
 [-0.00738526 -0.08043031  0.02125871 ... -0.05418754  0.03361555
  -0.03006891]]


**Embed the Query and Search the FAISS Index**

In [None]:
# Embed the query the same way as the chunks (float32, L2-normalised)
query = "which is the most widely used linear classification model?"
q_emb = model.encode([query]).astype('float32')
faiss.normalize_L2(q_emb)
top_k = 10  # Retrieve more chunks for context expansion!
D, I = index.search(q_emb, top_k)

# FAISS fills missing results with label -1 when the index holds fewer than
# top_k vectors; indexing chunk_texts with -1 would silently return the last
# chunk, so only keep the hits that refer to a real chunk.
valid = I[0] >= 0
retrieved_scores = D[0][valid]
retrieved_chunks = [chunk_texts[idx] for idx in I[0][valid]]

print("Top retrieved chunks for context expansion:\n")
for rank, chunk in enumerate(retrieved_chunks):
    print(f"Rank {rank+1}: (Score: {retrieved_scores[rank]:.4f})")
    print(chunk)
    print('-' * 80)

Top retrieved chunks for context expansion:

Rank 1: (Score: 0.6250)
Imagine you have a scatter plot of data points, and your goal is to draw a straight line (or a flat plane in higher dimensions) that perfectly divides these points into distinct categories. This simple yet powerful idea is the essence of linear models for classification. These models are fundamental to machine learning because they offer a straightforward, interpretable, and computationally efficient way to categorize data.
--------------------------------------------------------------------------------
Rank 2: (Score: 0.6003)
The primary advantages of linear models are their simplicity, making them easy to understand and implement; their interpretability, as you can see which features contribute most to the classification based on their weights; and their speed, making them suitable for large datasets. They serve as an excellent baseline for many real-world classification problems.
-----------------------------------

In [None]:
from transformers import pipeline

# BERT-based extractive QA pipeline (BERT SQuAD v1.1 model);
# the same checkpoint name is used for both the model and the tokenizer
QA_CHECKPOINT = "bert-large-uncased-whole-word-masking-finetuned-squad"

qa = pipeline("question-answering", model=QA_CHECKPOINT, tokenizer=QA_CHECKPOINT)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [None]:
import re
from collections import Counter

def get_best_sentence_from_topk(chunk_texts, question, I, topk=5, min_length=10):
    """Pick the sentence in the top-ranked chunks that shares the most keywords with the question.

    The question is reduced to its lower-cased word tokens minus a small
    stopword list. Each of the first ``topk`` chunks referenced by ``I[0]``
    is split into sentences on ``.``, ``!`` or ``?`` followed by spaces, and
    every sentence is scored by how many distinct question keywords it
    contains. The first sentence reaching the highest non-zero score wins.

    Args:
        chunk_texts: Sequence of chunk strings.
        question: The user question.
        I: Search result labels; ``I[0]`` holds chunk indices in rank order.
            Indices that do not refer to an entry of ``chunk_texts`` (for
            example the -1 padding FAISS returns when it has fewer results
            than requested) are skipped instead of being used as Python
            negative indices.
        topk: How many of the leading entries of ``I[0]`` to inspect.
        min_length: Minimum number of whitespace-separated words a sentence
            needs to be eligible.

    Returns:
        The best matching sentence (stripped), or ``""`` if no eligible
        sentence shares a keyword with the question.
    """
    stopwords = {
        'what', 'is', 'the', 'a', 'an', 'of', 'in', 'on', 'and', 'or', 'for', 'to', 'with', 'by', 'it', 'as', 'that',
        'this', 'from', 'at', 'which', 'are', 'be', 'was', 'were', 'has', 'have', 'had', 'but', 'not', 'all', 'so', 'should'
    }
    q_words = {w.lower() for w in re.findall(r'\w+', question)} - stopwords

    best_score = 0
    best_sentence = ""
    ranked = I[0] if len(I) > 0 else []
    for idx in ranked[:topk]:
        idx = int(idx)
        # Skip padding / invalid labels rather than wrapping around the list
        if not 0 <= idx < len(chunk_texts):
            continue
        for sent in re.split(r'(?<=[.!?]) +', chunk_texts[idx]):
            s_words = {w.lower() for w in re.findall(r'\w+', sent)}
            score = len(q_words & s_words)
            # Strict '>' keeps the earliest sentence among equally scored ones
            if score > best_score and len(sent.split()) >= min_length:
                best_score = score
                best_sentence = sent.strip()
    return best_sentence

def smart_context_qa_best_global(
    qa_pipeline, query, chunk_texts, I,
    min_length=10, min_score=0.2, expand_topk=5):
    """Answer ``query`` from the retrieved chunks, with keyword and first-sentence fallbacks.

    Steps, in order:
      1. Run ``qa_pipeline`` on the top-ranked chunk. If the answer has at
         least ``min_length`` words and a score of at least ``min_score``,
         return it.
      2. Otherwise return the best keyword-matching sentence found by
         ``get_best_sentence_from_topk`` in the first ``expand_topk`` chunks.
      3. Otherwise return the first sentence of the top-ranked chunk.

    Each step prints what it produced.

    Args:
        qa_pipeline: Callable invoked as ``qa_pipeline(question=..., context=...)``
            that returns a mapping with ``'answer'`` and ``'score'`` keys.
        query: The user question.
        chunk_texts: Sequence of chunk strings.
        I: Search result labels; ``I[0]`` holds chunk indices in rank order.
        min_length: Minimum answer / sentence length in words.
        min_score: Minimum QA score for the model answer to be accepted.
        expand_topk: Number of top chunks scanned by the keyword fallback.

    Returns:
        The selected answer string, or ``""`` when ``I`` contains no usable
        top hit (empty result, or a label such as the -1 padding FAISS
        returns that does not refer to an entry of ``chunk_texts``).
    """
    # Without this guard an empty result raises IndexError and a -1 label
    # silently uses the *last* chunk as QA context.
    ranked = I[0] if len(I) > 0 else []
    if len(ranked) == 0 or not 0 <= int(ranked[0]) < len(chunk_texts):
        print("No chunk was retrieved for this query.")
        return ""
    top_chunk = chunk_texts[int(ranked[0])]

    # Try QA model on top-1 chunk
    result = qa_pipeline(question=query, context=top_chunk)
    answer = result['answer'].strip()
    score = result['score']
    nwords = len(answer.split())
    print("Answer given by pipeline(Before applying any fallback conditions):\n", answer)

    # If QA answer is good, use it
    if nwords >= min_length and score >= min_score:
        print("Extracted answer from QA model:\n", answer)
        return answer

    # If not, run global keyword search on top-K chunks
    best_sentence = get_best_sentence_from_topk(chunk_texts, query, I, topk=expand_topk, min_length=min_length)
    if best_sentence:
        print("Answer by global keyword-matched sentence:\n", best_sentence)
        return best_sentence

    # Fallback strategy: first sentence of the top-ranked chunk
    fallback_sent = re.split(r'(?<=[.!?]) +', top_chunk)[0].strip()
    print("Fallback: First sentence from top chunk:\n", fallback_sent)
    return fallback_sent

# Usage: answer the query from the retrieved chunk ranking (the returned string is the cell output)
smart_context_qa_best_global(qa_pipeline=qa, query=query, chunk_texts=chunk_texts, I=I)



Answer given by pipeline(Before applying any fallback conditions):
 linear models for classification
Answer by global keyword-matched sentence:
 3.4 Logistic Regression Logistic Regression is one of the most fundamental and widely used linear classification algorithms in machine learning.


'3.4 Logistic Regression Logistic Regression is one of the most fundamental and widely used linear classification algorithms in machine learning.'