# Prompts Module

> Few-shot prompt construction and kNN example selection.

This module handles:
- kNN-based selection of semantically similar training examples
- Few-shot prompt construction with chain-of-thought reasoning
- Prompt templates for data and code classification
- Parsing of LLM responses into a category, a confidence score, and the reasoning text

**Research Background**: Few-shot learning with semantic similarity-based example selection substantially improves LLM classification accuracy over random sampling (Brown et al., 2020; Liu et al., 2022).

In [None]:
#| default_exp prompts

In [None]:
#| export
from __future__ import annotations
import numpy as np
from typing import List, Optional

from openness_classifier.core import OpennessCategory, ClassificationType
from openness_classifier.data import TrainingExample, EmbeddingModel

## kNN Example Selection

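Select the k training examples that are most semantically similar to a given statement. By default the selection also favours label diversity, so the few-shot prompt shows more than one category whenever the pool allows it.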

In [None]:
#| export
def select_knn_examples(
    statement: str,
    training_examples: List[TrainingExample],
    embedding_model: EmbeddingModel,
    k: int = 5,
    diversify: bool = True
) -> List[TrainingExample]:
    """Select up to k training examples most similar to a statement.

    Ranks the pool by cosine similarity between the statement's embedding
    and each example's embedding, then returns the best-ranked examples
    for use in a few-shot prompt.

    Side effect: any example whose ``embedding`` is ``None`` gets its
    embedding computed with ``embedding_model`` and stored back on the
    example, so later calls can reuse it.

    Args:
        statement: The statement to classify
        training_examples: Pool of training examples (embeddings optional)
        embedding_model: Model used to embed the statement and any example
            that has no embedding yet
        k: Maximum number of examples to select; values <= 0 select none
        diversify: If True, prefer one example per distinct label before
            filling the remaining slots by similarity

    Returns:
        At most k training examples (fewer when the pool is smaller than k).
        With ``diversify=False`` they are ordered from most to least similar.
    """
    # Guard before doing any encoding work. Without it a negative k would
    # slice from the end (similarities[:-1]) and return almost everything.
    if not training_examples or k <= 0:
        return []

    # Ensure all examples have embeddings (cached on the example itself)
    for ex in training_examples:
        if ex.embedding is None:
            ex.embedding = embedding_model.encode(ex.statement_text)

    # Compute embedding for input statement
    statement_embedding = embedding_model.encode(statement)

    # Score every example against the statement
    similarities = []
    for ex in training_examples:
        sim = _cosine_similarity(statement_embedding, ex.embedding)
        if np.isnan(sim):
            # NaN compares false with everything and would make the sort
            # order arbitrary; rank undefined similarities last instead.
            sim = float("-inf")
        similarities.append((ex, sim))

    # Sort by similarity (descending); the sort is stable, so ties keep
    # their original pool order.
    similarities.sort(key=lambda pair: pair[1], reverse=True)

    if diversify:
        # Select examples ensuring label diversity
        return _select_diverse_examples(similarities, k)

    # Just take top k
    return [ex for ex, _ in similarities[:k]]


def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Compute cosine similarity between two vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


def _select_diverse_examples(
    sorted_examples: List[tuple],
    k: int
) -> List[TrainingExample]:
    """Select examples ensuring label diversity.
    
    Tries to include at least one example from each category
    while still prioritizing similarity.
    """
    selected = []
    seen_labels = set()
    
    # First pass: get one example per unique label from top candidates
    for ex, sim in sorted_examples:
        if ex.ground_truth not in seen_labels:
            selected.append(ex)
            seen_labels.add(ex.ground_truth)
            if len(selected) >= k:
                return selected
    
    # Second pass: fill remaining slots with most similar
    for ex, sim in sorted_examples:
        if ex not in selected:
            selected.append(ex)
            if len(selected) >= k:
                break
    
    return selected

## Prompt Templates

Chain-of-thought prompt templates for classification.

In [None]:
#| export
# System-level instructions shared by data and code classification.
# Defines the four category names (open, mostly_open, mostly_closed, closed);
# parse_classification_response matches these exact spellings, so keep the
# two in sync when editing the taxonomy.
SYSTEM_PROMPT = """You are an expert research analyst specializing in evaluating data and code availability statements in scholarly publications. Your task is to classify the openness of availability statements using a 4-category taxonomy.

Classification Categories (from most open to least open):

1. **open**: Fully accessible with no restrictions
   - Data/code in public repository (Zenodo, Figshare, GitHub public)
   - No registration, login, or approval required
   - Open license (CC-BY, MIT, etc.)

2. **mostly_open**: Largely accessible with minor restrictions
   - Public repository but requires free registration
   - Institutional access (freely available to affiliated researchers)
   - Minor conditions (e.g., cite the source)

3. **mostly_closed**: Largely restricted with limited access
   - Data use agreements required
   - Partial availability (some data/code withheld)
   - Significant restrictions on use or redistribution
   - Available only through specific collaborations

4. **closed**: Not accessible
   - "Available upon request" (regardless of how polite)
   - Confidential, proprietary, or restricted
   - No statement provided
   - Contact author for access

IMPORTANT: "Available upon request" or "contact the authors" is ALWAYS classified as **closed**."""


# User-prompt template for DATA availability statements.
# Filled via str.format with two fields: {few_shot_examples} (pre-rendered
# example block, may be empty) and {statement} (the text to classify).
# The trailing "Classification / Confidence / Reasoning" lines define the
# answer format that parse_classification_response looks for.
DATA_CLASSIFICATION_TEMPLATE = """Classify the following DATA availability statement.

{few_shot_examples}

Now classify this statement:

Statement: {statement}

Think step-by-step:
1. What repository or location is mentioned (if any)?
2. What access restrictions are described?
3. Is there any "upon request" or "contact author" language?

Based on your analysis, provide:
- Classification: [open/mostly_open/mostly_closed/closed]
- Confidence: [0.0-1.0]
- Reasoning: [brief explanation]"""


# User-prompt template for CODE availability statements.
# Same two str.format fields as the data template ({few_shot_examples},
# {statement}) and the same answer format; only the step-by-step questions
# differ.
CODE_CLASSIFICATION_TEMPLATE = """Classify the following CODE availability statement.

{few_shot_examples}

Now classify this statement:

Statement: {statement}

Think step-by-step:
1. What repository or platform is mentioned (if any)?
2. Is the code publicly accessible?
3. Are there any restrictions on access or use?

Based on your analysis, provide:
- Classification: [open/mostly_open/mostly_closed/closed]
- Confidence: [0.0-1.0]
- Reasoning: [brief explanation]"""

## Prompt Construction

In [None]:
#| export
def build_few_shot_prompt(
    statement: str,
    statement_type: ClassificationType,
    examples: List[TrainingExample],
    include_reasoning: bool = True
) -> str:
    """Build a few-shot classification prompt.

    Renders the selected examples as numbered "Statement / Classification"
    pairs and inserts them, together with the statement to classify, into
    the data or code template.

    Args:
        statement: The statement to classify
        statement_type: ``ClassificationType.DATA`` selects the data
            template; any other value selects the code template
        examples: Selected few-shot examples (may be empty, in which case
            no example section is rendered)
        include_reasoning: Whether to include the "Think step-by-step"
            chain-of-thought questions. When False those questions are
            removed; the Classification/Confidence/Reasoning answer format
            is always kept so the response can still be parsed.

    Returns:
        Complete prompt string
    """
    # Format examples
    example_strs = [
        f"Example {i}:\n"
        f"Statement: {ex.statement_text}\n"
        f"Classification: {ex.ground_truth.value}"
        for i, ex in enumerate(examples, 1)
    ]

    few_shot_block = "\n\n".join(example_strs)

    if few_shot_block:
        few_shot_block = f"Here are some examples:\n\n{few_shot_block}\n"

    # Select template
    if statement_type == ClassificationType.DATA:
        template = DATA_CLASSIFICATION_TEMPLATE
    else:
        template = CODE_CLASSIFICATION_TEMPLATE

    if not include_reasoning:
        # Cut the chain-of-thought questions out of the template *before*
        # formatting, so text inside the statement or examples can never be
        # mistaken for the section markers.
        head, cot_marker, rest = template.partition("Think step-by-step:")
        _steps, answer_marker, tail = rest.partition(
            "Based on your analysis, provide:"
        )
        if cot_marker and answer_marker:
            template = f"{head}Provide:{tail}"

    # str.format only interprets braces in the template itself, so braces in
    # the statement or examples are inserted verbatim.
    return template.format(
        few_shot_examples=few_shot_block,
        statement=statement
    )

In [None]:
#| export
def parse_classification_response(response: str) -> tuple:
    """Parse LLM response to extract classification, confidence, and reasoning.

    Looks for the ``Classification:``, ``Confidence:`` and ``Reasoning:``
    fields requested by the prompt templates. Markdown decoration around
    the values (e.g. ``**Classification:** open``) is tolerated.

    Args:
        response: Raw LLM response text

    Returns:
        Tuple of (OpennessCategory, confidence_score, reasoning).
        - If no explicit ``Classification:`` field is found, the first
          category name (checked in the order open, mostly_open,
          mostly_closed, closed) that appears as a whole word is used.
        - If no category can be found at all, the result is
          ``OpennessCategory.CLOSED`` with confidence 0.3 and a reasoning
          string that quotes the start of the response.
        - Confidence defaults to 0.8 when the response does not state one;
          stated values are clamped to [0, 1], and a value followed by
          ``%`` is read as a percentage.
        - Reasoning defaults to the full response text.
    """
    import re

    # Default values
    category = None
    confidence = 0.8
    reasoning = response

    # Try to extract classification. \W* skips whitespace and decoration
    # such as "**", "[" or quotes between the colon and the label.
    class_match = re.search(
        r'Classification:\W*(open|mostly_open|mostly_closed|closed)',
        response,
        re.IGNORECASE
    )
    if class_match:
        category = OpennessCategory.from_string(class_match.group(1))
    else:
        # Fall back to a keyword scan. Whole-word matching is essential:
        # a plain substring test for 'open' also fires on 'mostly_open'
        # (and on words like 'opening'), which made 'mostly_open'
        # unreachable. '_' counts as a word character, so \bopen\b does not
        # match inside 'mostly_open'.
        lowered = response.lower()
        for cat in ('open', 'mostly_open', 'mostly_closed', 'closed'):
            if re.search(rf'\b{cat}\b', lowered):
                category = OpennessCategory.from_string(cat)
                break

    # Try to extract confidence. The number pattern admits exactly one
    # optional decimal point, so a sentence-ending period ("0.9.") is not
    # captured and float() cannot fail.
    conf_match = re.search(
        r'Confidence:[\s*]*([0-9]*\.?[0-9]+)\s*(%)?',
        response,
        re.IGNORECASE
    )
    if conf_match:
        confidence = float(conf_match.group(1))
        if conf_match.group(2):
            confidence /= 100.0  # "95%" means 0.95, not 95
        confidence = max(0.0, min(1.0, confidence))  # Clamp to [0, 1]

    # Try to extract reasoning: everything after "Reasoning:" up to the end
    # of the response or the next field label, whichever comes first.
    reason_match = re.search(
        r'Reasoning:[\s*]*(.+?)(?=$|Classification:|Confidence:)',
        response,
        re.IGNORECASE | re.DOTALL
    )
    if reason_match:
        reasoning = reason_match.group(1).strip()

    if category is None:
        # Default to closed if we can't parse
        category = OpennessCategory.CLOSED
        confidence = 0.3  # Low confidence for unparseable response
        reasoning = f"Could not parse response: {response[:200]}..."

    return category, confidence, reasoning

In [None]:
# Test prompt construction
from openness_classifier.core import OpennessCategory, ClassificationType

# Create mock examples
class MockExample:
    """Minimal stand-in exposing the two attributes the prompt builder reads."""

    def __init__(self, text, label):
        self.statement_text = text
        self.ground_truth = OpennessCategory.from_string(label)

examples = [
    MockExample("Data available at https://zenodo.org/record/12345", "open"),
    MockExample("Data available upon request from the authors", "closed"),
]

target_statement = "Data are deposited in Figshare at doi:10.6084/m9.figshare.12345"

prompt = build_few_shot_prompt(
    target_statement,
    ClassificationType.DATA,
    examples
)

# Actually verify the prompt instead of only printing it: the right
# template was chosen, every example was rendered, and the target
# statement was inserted.
assert "DATA availability statement" in prompt
assert "Example 1:" in prompt and "Example 2:" in prompt
for ex in examples:
    assert f"Statement: {ex.statement_text}" in prompt
    assert f"Classification: {ex.ground_truth.value}" in prompt
assert f"Statement: {target_statement}" in prompt

print(prompt[:500])
print("...")
print("Prompt construction test passed!")

In [None]:
# Test response parsing
test_response = """Let me analyze this statement step by step.

1. The data is stored in Zenodo, which is a public repository.
2. No access restrictions are mentioned.
3. No "upon request" language.

Classification: open
Confidence: 0.95
Reasoning: Data is deposited in a public repository (Zenodo) with a DOI, indicating full open access."""

category, confidence, reasoning = parse_classification_response(test_response)

# Check the parsed fields rather than only printing them.
assert category == OpennessCategory.from_string("open")
assert confidence == 0.95
assert reasoning.startswith("Data is deposited in a public repository")
assert reasoning.endswith("indicating full open access.")

print(f"Category: {category.value}")
print(f"Confidence: {confidence}")
print(f"Reasoning: {reasoning}")
print("Response parsing test passed!")

In [None]:
#| hide
# Run nbdev's export step; presumably this writes the `#| export` cells above
# into the module named by `#| default_exp prompts` — confirm against the
# project's nbdev settings.
import nbdev; nbdev.nbdev_export()