In [None]:
# Core imports and test data (reused from earlier notebooks)
from sample_data import SAMPLE_TEXT
import os
import nltk
nltk.download('punkt', quiet=True)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import chromadb
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import numpy as np
from groq import Groq  # replace with your LLM client if needed
import time
# Confirmation banner (emoji restored from a mis-decoded UTF-8 sequence).
print('✅ Setup imports complete')

## 1) Setup & Reusable Helpers — from previous parts

Re-creating minimal helpers (chunking, embeddings, Chroma vector store, hybrid retrieval) so this notebook runs standalone. We'll also set up prompt templates and token estimation. Ensure `GROQ_API_KEY` is set in your environment.

In [None]:
# --- embeddings + vector store (minimal demo) ---
# Load the sentence-embedding model once; it is reused by the vector-store
# build and by `hybrid_retrieval` for query encoding.
print('🔄 Loading embedding model...')
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print('✅ Embedding model ready')
# Chunk the sample document (simple sentence-based chunker reused)
def chunk_by_semantic_similarity(text: str, similarity_threshold: float = 0.5, overlap_sentences: int = 2, min_chunk_size: int = 2) -> list:
    """Split `text` into chunks at points where adjacent sentences diverge.

    Sentences are vectorised with TF-IDF and each sentence is compared with
    its successor via cosine similarity. A new chunk starts after sentence
    `i` when that similarity is below `similarity_threshold` and the current
    chunk already holds at least `min_chunk_size` sentences.

    Args:
        text: Document to split.
        similarity_threshold: Adjacent-sentence similarity below which a
            boundary may be placed.
        overlap_sentences: Number of sentences from the end of the previous
            chunk that are prepended to every chunk after the first.
        min_chunk_size: Minimum sentence count before a boundary is allowed.
            Texts with at most this many sentences are returned whole.

    Returns:
        A list of chunk strings (sentences joined by single spaces), or
        `[text]` when the text is too short or TF-IDF fitting raises
        `ValueError`.
    """
    sentences = nltk.sent_tokenize(text)
    total = len(sentences)
    if total <= min_chunk_size:
        return [text]

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf = vectorizer.fit_transform(sentences)
    except ValueError:
        # Vectoriser could not be fitted on these sentences; fall back to one chunk.
        return [text]

    # Similarity between each sentence and the one that follows it.
    adjacent_sims = []
    for pos in range(total - 1):
        pair = cosine_similarity(tfidf[pos:pos + 1], tfidf[pos + 1:pos + 2])
        adjacent_sims.append(pair[0][0])

    # Sentence indices at which a chunk begins, plus a final end sentinel.
    starts = [0]
    run_length = 1
    for pos, sim in enumerate(adjacent_sims):
        if sim < similarity_threshold and run_length >= min_chunk_size:
            starts.append(pos + 1)
            run_length = 1
        else:
            run_length += 1
    if starts[-1] != total:
        starts.append(total)

    chunks = []
    for n, (begin, end) in enumerate(zip(starts, starts[1:])):
        if n > 0 and overlap_sentences > 0:
            # Reach back into the previous chunk to provide overlap context.
            begin = max(0, begin - overlap_sentences)
        chunks.append(' '.join(sentences[begin:end]))
    return chunks
# Chunk the sample document with a low threshold so boundaries are only
# placed where adjacent sentences share very little vocabulary.
chunks = chunk_by_semantic_similarity(SAMPLE_TEXT, similarity_threshold=0.15, overlap_sentences=2)
print(f'✅ Created {len(chunks)} chunks from sample data')
# Build embeddings + Chroma collection (lightweight demo)
embeddings = embedding_model.encode(chunks, show_progress_bar=False)
client = chromadb.Client()
collection = client.get_or_create_collection(name='part5_generation_chunks', metadata={'hnsw:space':'cosine'})
# IDs encode the chunk position; `hybrid_retrieval` parses the index back out.
ids = [f'chunk_{i}' for i in range(len(chunks))]
# `upsert` instead of `add`: re-running this cell when the collection already
# exists overwrites entries with the same ID rather than keeping stale ones.
collection.upsert(
    ids=ids,
    embeddings=embeddings.tolist(),
    documents=chunks,
    metadatas=[
        {'chunk_index': i, 'source': 'SAMPLE_TEXT', 'length': len(chunk)}
        for i, chunk in enumerate(chunks)
    ],
)
print(f'✅ Vector store ready with {collection.count()} chunks')

### 1.1 Hybrid retrieval (dense + sparse) — reused from earlier parts

Balances BM25 (lexical) and semantic similarity for robust multi-faceted retrieval.

In [None]:
def hybrid_retrieval(query: str, top_k: int = 3, alpha: float = 0.7):
    """Retrieve chunks by blending dense similarity with BM25 lexical scores.

    The `top_k` nearest chunks are fetched from the Chroma `collection`;
    those candidates are then re-scored as
    `alpha * vector_score + (1 - alpha) * normalised_bm25`.

    Args:
        query: Natural-language query.
        top_k: Number of candidates requested from the vector store.
        alpha: Weight of the vector score; `1 - alpha` weights BM25.

    Returns:
        A list of dicts sorted by descending `hybrid_score`, each with keys
        `chunk_idx`, `hybrid_score`, `vector_score`, `bm25_score` (min-max
        normalised over the candidates), `content` and `metadata`. Returns
        an empty list when the vector store yields no candidates.
    """
    query_embedding = embedding_model.encode([query])[0]
    qresults = collection.query(query_embeddings=[query_embedding.tolist()], n_results=top_k)

    hit_ids = qresults['ids'][0]
    if not hit_ids:
        # Nothing to re-rank; min()/max() below would fail on an empty list.
        return []

    # Convert the returned distance into a similarity (1 - distance).
    vector_scores_dict = {}
    for chunk_id, distance in zip(hit_ids, qresults['distances'][0]):
        chunk_idx = int(chunk_id.split('_')[1])
        vector_scores_dict[chunk_idx] = 1 - distance

    # The BM25 index is rebuilt per call over all chunks (fine for a demo corpus).
    tokenized_chunks = [c.lower().split() for c in chunks]
    bm25 = BM25Okapi(tokenized_chunks)
    # Score the query once; previously this was recomputed for every candidate.
    all_bm25_scores = bm25.get_scores(query.lower().split())
    bm25_scores_dict = {idx: all_bm25_scores[idx] for idx in vector_scores_dict}

    # Min-max normalise BM25 over the candidates; epsilon avoids 0/0 when all scores tie.
    bm25_min = min(bm25_scores_dict.values())
    bm25_max = max(bm25_scores_dict.values())
    bm25_norm = {k: (v - bm25_min) / (bm25_max - bm25_min + 1e-10) for k, v in bm25_scores_dict.items()}

    hybrid_scores = {k: alpha * vector_scores_dict[k] + (1-alpha)*bm25_norm[k] for k in vector_scores_dict}
    sorted_results = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)

    # Fetch metadata for all candidates in one call and key it by ID, so the
    # lookup does not depend on the order in which the store returns rows.
    fetched = collection.get(ids=[f'chunk_{idx}' for idx, _ in sorted_results])
    meta_by_id = dict(zip(fetched['ids'], fetched['metadatas']))

    results = []
    for idx, score in sorted_results:
        results.append({
            'chunk_idx': idx,
            'hybrid_score': score,
            'vector_score': vector_scores_dict[idx],
            'bm25_score': bm25_norm[idx],
            'content': chunks[idx],
            'metadata': meta_by_id[f'chunk_{idx}'],
        })
    return results
print('✅ Hybrid retrieval helper ready')

### 1.2 Prompt Templates & LLM Generation

Defines the basic QA prompt template (`BASIC_QA_TEMPLATE`) used by the demos below, a simple token-estimation helper, and the Groq client setup. The `generate_answer()` helper, which calls Groq and returns token/timing metrics, follows in section 1.2.1.

In [None]:
# Prompt template for grounded question answering.
# Placeholders: {context} = retrieved chunks, {query} = the user question.
BASIC_QA_TEMPLATE = """Context:
{context}

Question: {query}

Answer:"""

print('✅ Generation templates ready')

In [None]:
# Token estimation util (reuse the same heuristic used earlier)
def estimate_tokens(text: str):
    """Roughly estimate the token count of `text`.

    Uses the heuristic of one token per four characters (integer division)
    and never reports fewer than one token, even for an empty string.
    """
    approx = len(text) // 4
    return approx if approx > 1 else 1

# LLM client (Groq used in earlier notebooks) — ensure API key is set in the environment.
# The key is read from GROQ_API_KEY only; when it is absent `groq_client` is None
# so later cells can detect that no real LLM is available.
groq_client = Groq(api_key=os.environ.get('GROQ_API_KEY')) if os.environ.get('GROQ_API_KEY') else None
if groq_client:
    print('✅ LLM client ready (GROQ)')
else:
    print('⚠️ GROQ_API_KEY missing — generation calls will be mocked unless you set the environment variable')

#### 1.2.1 generate_answer helper

Wraps the Groq LLM call with timing and token metrics.

In [None]:
def generate_answer(prompt_text: str, model='llama-3.1-8b-instant', temperature=0.2, max_output_tokens=400, 
                   top_p=1.0, top_k=None, frequency_penalty=0.0, presence_penalty=0.0, stop=None):
    """Send `prompt_text` to the Groq chat API and return the answer with metrics.

    Args:
        prompt_text: Full prompt, sent as a single user message.
        model: Model identifier passed to the API.
        temperature: Sampling temperature.
        max_output_tokens: Sent to the API as `max_tokens`.
        top_p: Nucleus-sampling threshold.
        top_k: Forwarded only when not None.
        frequency_penalty: Frequency penalty passed to the API.
        presence_penalty: Presence penalty passed to the API.
        stop: Stop sequence(s); forwarded only when not None.

    Returns:
        A dict with keys `text` (stripped completion), `tokens` (character-based
        estimate from `estimate_tokens`, not the provider's usage count),
        `elapsed` (seconds) and `raw` (the API response, or None when mocked).
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes.
    start = time.perf_counter()

    if groq_client is None:
        # The client-setup cell announces that calls are mocked when
        # GROQ_API_KEY is missing; return a placeholder instead of failing
        # with AttributeError on `None.chat`.
        out = '[MOCK] GROQ_API_KEY is not set; no LLM call was made.'
        return {'text': out, 'tokens': estimate_tokens(out), 'elapsed': time.perf_counter() - start, 'raw': None}

    # Build API parameters dynamically
    api_params = {
        'model': model,
        'messages': [{'role':'user','content': prompt_text}],
        'temperature': temperature,
        'max_tokens': max_output_tokens,
        'top_p': top_p,
        'frequency_penalty': frequency_penalty,
        'presence_penalty': presence_penalty,
    }

    # Optional parameters are only sent when the caller sets them.
    if top_k is not None:
        # NOTE(review): confirm the Groq chat-completions endpoint/SDK accepts
        # `top_k`; if it does not, passing a value here will raise.
        api_params['top_k'] = top_k

    if stop is not None:
        api_params['stop'] = stop

    response = groq_client.chat.completions.create(**api_params)
    # Guard against a missing message body so `.strip()` cannot fail on None.
    out = (response.choices[0].message.content or '').strip()
    elapsed = time.perf_counter() - start

    return {'text': out, 'tokens': estimate_tokens(out), 'elapsed': elapsed, 'raw': response}

print('✅ generate_answer helper ready')

## 2) LLM Decoding Parameters: Complete Reference

### 2.1 **TEMPERATURE** (0.0 to 2.0+) | Default: 1.0
Controls randomness/creativity of next token selection.

**Low (0.0-0.3):**
- ✓ Near-deterministic and focused (fully greedy only at 0.0)
- ✓ Best for factual Q&A, code generation
- ✓ Highly repeatable outputs (ideal for testing)
- ✗ May be boring or repetitive
- Example: `temperature=0.1` for grounded RAG answers

**Medium (0.7-1.0):**
- ✓ Balanced: coherent + somewhat creative
- ✓ Good default for most tasks
- ✓ Natural-sounding responses
- Example: `temperature=0.7` for general QA

**High (1.2+):**
- ✓ Very creative, explores more diverse tokens
- ✗ Increased hallucination risk
- ✗ May become incoherent
- ⚠️ Use sparingly for brainstorming only
- Example: `temperature=1.5` for creative writing

🧠 **Effects on LLM Output:**
- Changes how sharp or flat the probability distribution is
- Low → deterministic, safe, repetitive
- High → creative, varied, risky
- Affects style, not length

#### 🔬 Demo: Temperature in Action

In [None]:
# Demo: How temperature affects output creativity
# The same retrieved context and prompt are sent at three temperatures so the
# only thing that varies between runs is the sampling randomness.
test_query = "What are the main requirements of remote work for employees and the organization? What are the benefits ?"
ctx = '\n\n'.join([f"[chunk_{r['chunk_idx']}]\n{r['content']}" for r in hybrid_retrieval(test_query, top_k=2)])
prompt = BASIC_QA_TEMPLATE.format(context=ctx, query=test_query)

print("=" * 80)
print("🔬 TEMPERATURE DEMO: Same prompt, different temperature values")
print("=" * 80)

for temp_value in [0.01, 1.0, 2.0]:
    print(f"\n{'─' * 80}")
    print(f"🌡️ Temperature = {temp_value}")
    print(f"{'─' * 80}")
    
    result = generate_answer(
        prompt_text=prompt, 
        #prompt_text="What does AI mean for a 10 year old?",
        temperature=temp_value,
        max_output_tokens=500
    )
    
    # Print the model output verbatim (no extra trailing punctuation).
    print(f"Output: {result['text']}")
    print(f"Tokens: {result['tokens']} | Time: {result['elapsed']:.2f}s")

---

### 2.2 **TOP_P (NUCLEUS SAMPLING)** (0.0 to 1.0) | Default: 1.0 (disabled)
Keep the smallest set of tokens whose cumulative probability reaches or exceeds `top_p`; discard the rest.

**How it works:**
1. LLM ranks next tokens by probability: [0.4, 0.3, 0.15, 0.1, 0.05]
2. Accumulate until cumsum ≥ top_p. With top_p=0.9: 0.4+0.3+0.15=0.85 (<0.9) → add 0.1 → 0.95 (stop)
3. Tokens after the cutoff (0.05 here) are excluded; the set always crosses the threshold, never stops below it
4. Result: More coherent, fewer low-probability odd tokens

**Example with words:**
- Sentence prefix: "Today I went to the"
- Candidate next words with probs: store(0.46), park(0.22), office(0.12), beach(0.09), moon(0.06), volcano(0.05)
- `top_p=0.8` nucleus: store + park + office = 0.80 (LLM samples only from these 3)
- `top_p=0.9` nucleus: store + park + office + beach + moon = 0.95 (needs to cross 0.9; moon is added)
- `top_p=1.0`: keeps all options, including moon/volcano (most diverse)

**Recommended values:**
- `top_p=0.9`: Recommended; filters out very unlikely tokens
- `top_p=1.0`: Disabled (use all tokens)
- `top_p=0.5`: Very restrictive; safer but may limit diversity

🧠 **Effects on LLM Output:**
- Limits which tokens are allowed based on probability mass
- Low → only very likely tokens
- High → more linguistic variety
- Improves coherence vs. creativity balance

#### 2.2.1 Temperature vs. top_p — concise and practical


#### 🧪 Concrete example

##### Original probabilities
```
A: 0.50
B: 0.30
C: 0.15
D: 0.05
```

---

##### Case 1: Low temperature (0.2)
```
A: 0.85
B: 0.10
C: 0.04
D: 0.01
```
Now with `top_p = 0.9`:

* Allowed: A + B
* Very deterministic

---

##### Case 2: High temperature (1.2)
```
A: 0.35
B: 0.30
C: 0.20
D: 0.15
```
Now with `top_p = 0.9`:

* Allowed: A + B + C + D
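* Much more variation

👉 Same top_p, different result
Because temperature changed the distribution before top_p was applied. (The probabilities shown in both cases are rounded, illustrative values chosen to make the effect easy to see; they are not an exact temperature rescaling of the original distribution.)

Practical defaults:
- For RAG stability: top_p ≈ 0.9, temperature ≈ 0.2–0.4.
- For more creativity: increase temperature; adjust top_p cautiously to avoid drift/hallucinations.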


---

### 2.3 **TOP_K** (1 to vocab_size, typically 50-100) | Default: Not set (disabled)
Keep only top-k highest probability tokens; ignore all others.

**How it works:**
1. LLM ranks tokens by probability: [0.4, 0.3, 0.15, 0.1, 0.05]
2. top_k=3 keeps only: [0.4, 0.3, 0.15]
3. Discards low-probability tokens entirely
4. Result: Simpler but less flexible than top_p

**Example with words:**
- Sentence prefix: "Today I went to the"
- Candidate next words with probs: store(0.46), park(0.22), office(0.12), beach(0.09), moon(0.06), volcano(0.05)
- `top_k=3`: keeps only store, park, office (exactly top 3 by probability)
- `top_k=5`: keeps store, park, office, beach, moon (top 5; excludes volcano)
- `top_k=6`: keeps all options (no restriction; moon and volcano both allowed)

**Recommended values:**
- `top_k=50`: Common; balances diversity and safety
- `top_k=10`: Very restrictive; more predictable (only `top_k=1` is fully deterministic, i.e. greedy decoding)
- `top_k=100`: More permissive; allows more variety

**Note:** Less popular than top_p; top_p is generally preferred for nuanced control

🧠 **Effects on LLM Output:**
- Limits the number of possible tokens
- Low → strict, predictable
- High → diverse, sometimes noisy
- Less adaptive than top_p

---

### 2.4 **MAX_TOKENS** (1 to model_max, e.g., 8192 for Llama 3.1-8b) | Default: Model-dependent
Maximum length of the generated response (hard limit).

**Impact:**
- **Cost:** A higher max_tokens allows longer outputs → higher potential price (you pay per generated output token)
- **Latency:** Longer sequences take more time to generate
- **Truncation:** If output hits limit, it may be cut off mid-sentence

**Guidance:** Set to ~1.5× your expected answer length to avoid abrupt cutoff

🧠 **Effects on LLM Output:**
- Limits response length
- Controls cost and latency
- Does not change creativity or meaning

---

### 2.5 **FREQUENCY_PENALTY** (-2.0 to 2.0; positive values discourage repetition) | Default: 0.0 (disabled)
Reduces the probability of tokens that have already appeared in the generated text, in proportion to how often they have appeared.
In simple terms: “Don’t keep repeating the same words.”

- Higher values reduce repetition.

**Example:**
- Generated so far: "The research shows that the method is effective. The results indicate that the"
- `frequency_penalty=0.0`: word "the" has same probability as usual
- `frequency_penalty=0.5`: word "the" gets penalty; LLM tries alternatives like "results" or "findings"
- `frequency_penalty=1.0`: strong penalty on "the"; LLM strongly avoids this word (avoids repetition)

**Use case:** `frequency_penalty=0.5` to `1.0` reduces repetition in longer outputs

🧠 **Effects on LLM Output:**
- Reduces repetition of frequently used tokens
- Encourages lexical diversity
- Too high → unnatural wording

---

### 2.6 **PRESENCE_PENALTY** (-2.0 to 2.0; positive values discourage reuse) | Default: 0.0 (disabled)
Binary penalty: if a token appears 1 or more times in output, apply penalty (regardless of frequency).

**How it works:**
1. Tracks which tokens have appeared at least once
2. If token reappears: penalty applies equally (not scaled by count)
3. Useful for avoiding specific repeated phrases/tokens
4. Different from frequency_penalty which scales by occurrence count

**Example:**
- Generated so far: "In conclusion, I believe this approach is sound."
- `presence_penalty=0.0`: "conclusion" can reappear with normal probability
- `presence_penalty=0.6`: "conclusion" gets penalized if used again (penalty same whether 1 or 5 times)
- `presence_penalty=1.0`: strong penalty; "In conclusion" phrase likely avoided entirely

**Recommended values:**
- `presence_penalty=0.0`: Disabled; no penalty for reusing tokens
- `presence_penalty=0.6`: Moderate penalty; discourages repeating any phrase
- `presence_penalty=1.0+`: Strong penalty; strongly avoids reused tokens

**Use case:** `presence_penalty=0.6` avoids repeating phrases like "in conclusion" or "furthermore" multiple times

🧠 **Effects on LLM Output:**
- Discourages reuse of already used concepts
- Pushes model toward new ideas/topics
- Can cause topic drift if high

---

#### 2.6.1 Frequency Penalty vs. Presence Penalty — Key Differences

🧠 **The key difference**

With frequency penalty, the penalty grows each time a token is repeated. With presence penalty, a fixed penalty is applied once a token has appeared at all, no matter how many more times it is used.

- **Frequency penalty** says: "You're using this too often — slow down."
- **Presence penalty** says: "You already used this — try something new."

---

🧪 **Simple side-by-side example, Topic Shift**

**Prompt:** "Explain DNS briefly."

**No penalties:**

> DNS resolves domain names into IP addresses. 
> DNS helps clients locate servers. 
> DNS also caches addresses for faster lookup. 

**Frequency penalty only:**

> DNS resolves domain names into IP addresses. 
> This system helps clients locate servers.
> DNS also caches addresses for faster lookup.

> ➡ Same topic, better wording

**Presence penalty only:**

> DNS resolves domain names into IP addresses.
> This system enables clients to find servers efficiently.
> Additionally, it maintains a cache to speed up requests.

> → Notice: The model uses new words and phrases, slightly shifting focus to efficiency, caching, and functionality. This can feel like a subtle topic expansion or shift.


---

### 2.7 **STOP** (List of strings or None) | Default: None (disabled)
Stop sequences: generation halts when any of these strings is produced.

**How it works:**
1. Specify one or more stop sequences (e.g., `["\n\n", "User:", "###"]`)
2. LLM generates tokens until one of these sequences is produced
3. The stop sequence itself is NOT included in the output
4. Useful for controlling output structure and preventing over-generation

**Example:**
- Prompt: "Write a short bio for Jane. Stop when done."
- `stop=None`: LLM keeps generating (may write extra paragraphs)
- `stop=["\n\n"]`: Output = "Jane is a developer with 10 years of experience." (stops at double newline)
- `stop=["Sources:", "References:"]`: For Q&A, stops before LLM tries to fabricate citations

**Recommended values:**
- `stop=None`: Disabled; generation continues until the model emits its end-of-sequence token or max_tokens is reached
- `stop=["\n\n"]`: Common; stops after double newline (paragraph break)
- `stop=["User:", "Assistant:"]`: Useful for chat turn-taking; stops at next speaker label
- `stop=["###", "---"]`: Good for structured outputs; stops at section break

**Use case:** `stop=["\n\n"]` prevents multi-paragraph answers when single-paragraph response is desired; RAG: `stop=["Sources:", "References:"]` to control citation placement

## 3) Practical Demo: Decoding Parameter Trade-offs

Now that you understand each parameter, let's see how they affect token count, latency, and quality. We'll compare three presets (conservative, balanced, creative) that vary temperature, max_tokens, top_p, and frequency_penalty on a single model.

In [None]:
def compare_decoding_parameters(query: str, model: str = 'llama-3.1-8b-instant', preview_chars: int = 300):
    """Compare outputs and metrics across different parameter settings.

    Retrieves context for `query` once, builds a single prompt from
    `BASIC_QA_TEMPLATE`, and sends it through `generate_answer` under three
    presets (conservative, balanced, creative), printing metrics and a
    preview of each answer.

    Args:
        query: Question to answer.
        model: Model identifier forwarded to `generate_answer`.
        preview_chars: Maximum number of answer characters printed per preset.

    Returns:
        A dict mapping each preset name to the dict returned by
        `generate_answer` for that preset.
    """
    
    # Retrieve context (reused for all attempts)
    ctx = '\n\n'.join([f"[chunk_{r['chunk_idx']}]\n{r['content']}" for r in hybrid_retrieval(query, top_k=2)])
    base_prompt = BASIC_QA_TEMPLATE.format(context=ctx, query=query)
    
    # Test different parameter configurations
    configs = [
        {
            'name': 'Conservative (Factual RAG)',
            'temperature': 0.1,
            'max_tokens': 200,
            'top_p': 0.9,
            'frequency_penalty': 0.5,
            'stop': None,
            'desc': 'Deterministic; best for Q&A grounded in context'
        },
        {
            'name': 'Balanced (Natural Conversation)',
            'temperature': 0.7,
            'max_tokens': 300,
            'top_p': 0.95,
            'frequency_penalty': 0.0,
            'stop': None,
            'desc': 'Moderate creativity; natural but still coherent'
        },
        {
            'name': 'Creative (Brainstorming)',
            'temperature': 1.5,
            'max_tokens': 300,
            'top_p': 1.0,
            'frequency_penalty': 0.2,
            'stop': None,
            'desc': 'High randomness; generates diverse ideas'
        },
    ]
    
    results = {}
    print('\n' + '='*80)
    print('DECODING PARAMETER COMPARISON DEMO')
    print('='*80)
    print(f'Query: "{query}"\n')
    
    for config in configs:
        print(f"\n📊 {config['name']}")
        print(f"   Settings: temp={config['temperature']}, max_tokens={config['max_tokens']}, top_p={config['top_p']}")
        print(f"   Purpose: {config['desc']}")
        print('-' * 80)
        
        # Keys a preset omits (e.g. presence_penalty) fall back to neutral defaults.
        response = generate_answer(
            base_prompt,
            model=model,
            temperature=config['temperature'],
            max_output_tokens=config['max_tokens'],
            top_p=config.get('top_p', 1.0),
            frequency_penalty=config.get('frequency_penalty', 0.0),
            presence_penalty=config.get('presence_penalty', 0.0),
            stop=config.get('stop', None)
        )
        
        results[config['name']] = response
        
        # Only mark the preview as truncated when text was actually cut off.
        text = response['text']
        preview = text[:preview_chars] + ('...' if len(text) > preview_chars else '')
        print(f"   Tokens: {response['tokens']} | Time: {response['elapsed']:.3f}s")
        print(f"   Output (first {preview_chars} chars):\n   {preview}")
    
    print('\n' + '='*80)
    print('KEY INSIGHTS')
    print('='*80)
    print('''
    1. Conservative (temp=0.1): Predictable, repetitive, great for grounded answers
    2. Balanced (temp=0.7): Natural variation while staying on-topic
    3. Creative (temp=1.5): More diverse ideas but higher hallucination risk
    
    → For RAG: Use Conservative settings (temperature ≤ 0.3)
    → For conversation: Use Balanced settings (temperature ≈ 0.7)
    → For creative tasks only: Use Creative (temperature > 1.0)
    ''')
    print('='*80)
    
    return results

# Run comparison demo (uses the `test_query` defined in the temperature demo cell).
param_results = compare_decoding_parameters(test_query)

# Alternative query to try (assign it before the call above to use it):
# test_query = 'What are the main eligibility criteria and performance standards for remote work at our organization?'