In [1]:
import json
import os
import re
import sys
from pathlib import Path

import anthropic

# Import from the rl module
sys.path.insert(0, str(Path.cwd()))

from rl.prompts import build_writing_assistant_prompt, RUBRIC_SCORING_PROMPT

In [2]:
# --- Input files (paths are relative to the notebook's working directory) ---
RUBRIC_FILE = "rl/rubrics/example_rubric.json"
PROMPTS_FILE = "rl/prompts/research_intro_prompts.txt"

# --- Model identifier passed to every Claude API call in this notebook ---
CLAUDE_MODEL = "claude-sonnet-4-5"

# --- Sampling settings used when generating drafts ---
TEMPERATURE = 0.7
MAX_TOKENS = 2048

In [3]:
def load_rubric(rubric_file: str):
    """Load a rubric from a JSON file.

    The file may hold either a single JSON value or a list; when it is a
    list, only its first element is used. If the selected value is a dict
    with a "rubric" key, the value under that key is returned; otherwise the
    selected value is returned unchanged.

    Args:
        rubric_file: Path to the rubric JSON file.

    Returns:
        The rubric as decoded from JSON (its type depends on the file).

    Raises:
        ValueError: If the file contains an empty top-level list.
        json.JSONDecodeError: If the file is not valid JSON.
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly: JSON text is UTF-8 and the platform default
    # encoding is locale-dependent, which breaks non-ASCII rubric text.
    with open(rubric_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Handle different rubric formats
    if isinstance(data, list):
        if not data:
            # Without this check the failure is a bare "list index out of range"
            raise ValueError(f"Rubric file {rubric_file!r} contains an empty list")
        rubric_data = data[0]
    else:
        rubric_data = data

    # Only unwrap dicts: on a str or list, `in` would be a substring or
    # membership test and the subscript below would raise TypeError.
    if isinstance(rubric_data, dict) and "rubric" in rubric_data:
        return rubric_data["rubric"]
    return rubric_data

def load_prompts(prompts_file: str) -> list:
    """Load writing prompts from a text file, one prompt per line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped.

    Args:
        prompts_file: Path to the prompts text file.

    Returns:
        List of prompt strings in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale-dependent default encoding.
    with open(prompts_file, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]

# Read the rubric and the writing prompts from the configured files
rubric = load_rubric(RUBRIC_FILE)
prompts = load_prompts(PROMPTS_FILE)

# Report how much was loaded
print(f"Loaded rubric with {len(rubric)} criteria")
print(f"Loaded {len(prompts)} writing prompts")

# List each criterion, numbered from 1, with its weight
print("\nRubric criteria:")
for i, criterion in enumerate(rubric, start=1):
    print(f"{i}. {criterion['name']} (weight: {criterion['weight']}%)")

Loaded rubric with 5 criteria
Loaded 80 writing prompts

Rubric criteria:
1. Academic Tone & Register (weight: 25%)
2. Logical Argument Structure (weight: 25%)
3. Precision & Concision (weight: 20%)
4. Thematic Coherence (weight: 20%)
5. Clarity of Technical Content (weight: 10%)


In [4]:
# Build system instruction WITH rubric - co-writing assistant mode
system_instruction = build_writing_assistant_prompt(rubric=rubric)

print("System Instruction (CO-WRITING ASSISTANT MODE):")
print("=" * 80)
# The instruction is printed in full, so no "..." suffix is appended:
# an ellipsis would wrongly suggest that the text had been truncated.
print(system_instruction)
print("=" * 80)

System Instruction (CO-WRITING ASSISTANT MODE):
You are an AI co-writer designed to collaborate with human users to improve and develop their written pieces.

RUBRIC (Follow these criteria when writing):
[
  {
    "name": "Academic Tone & Register",
    "category": "Style",
    "description": "Uses formal, objective language appropriate for research papers; avoids subjective adjectives, metaphorical expressions, and emotional appeals; maintains consistent scholarly distance and register",
    "exemplary": "Consistently formal and objective throughout; no subjective adjectives (like 'hard-won') or metaphorical language (like 'fragmented landscape'); maintains scholarly register in all sentences; uses precise, neutral descriptors",
    "proficient": "Generally formal with rare lapses into less formal expressions; maintains objectivity in key claims; occasional subjective language that doesn't undermine scholarly credibility",
    "developing": "Mix of formal and informal language; some s

In [5]:
# Create the Anthropic client from the API key held in the environment
api_key = os.environ.get("ANTHROPIC_API_KEY")
if api_key:
    client = anthropic.Anthropic(api_key=api_key)
    print("Claude API client initialized")
else:
    # Stop here when the variable is unset or empty
    raise ValueError("Please set ANTHROPIC_API_KEY environment variable")

Claude API client initialized


In [6]:
def extract_draft(response: str) -> str:
    """Extract just the draft text from a structured response.

    Resolution order:
      1. The stripped content of the first ``<draft>...</draft>`` pair, if
         that content is non-empty.
      2. If there is an opening ``<draft>`` but no closed pair at all (e.g.
         the reply was cut off before ``</draft>``), the text after the
         opening tag with the known structured sections removed.
      3. The whole response with the known structured sections
         (analysis, questions, feedback, rubric_assessment) removed.
      4. If that leaves nothing, the stripped original response.

    Tag matching is case-insensitive and spans newlines.

    Args:
        response: Full text returned by the model.

    Returns:
        The extracted draft text, stripped of surrounding whitespace.
    """
    flags = re.DOTALL | re.IGNORECASE

    def strip_known_sections(text: str) -> str:
        # Remove every closed structured section that is not the draft
        for tag in ("analysis", "questions", "feedback", "rubric_assessment"):
            text = re.sub(rf'<{tag}>.*?</{tag}>', '', text, flags=flags)
        return text.strip()

    draft_match = re.search(r'<draft>(.*?)</draft>', response, flags)
    if draft_match:
        draft_content = draft_match.group(1).strip()
        # Only accept the tagged draft when it holds actual content
        if draft_content:
            return draft_content
    else:
        # No closed pair. An opening tag on its own means the draft was
        # truncated; take what follows it instead of leaking the literal
        # "<draft>" tag into the text that gets evaluated.
        open_match = re.search(r'<draft>', response, re.IGNORECASE)
        if open_match:
            truncated_draft = strip_known_sections(response[open_match.end():])
            if truncated_draft:
                return truncated_draft

    # Fallback: no usable <draft> section, so drop the other known sections
    cleaned = strip_known_sections(response)

    # If nothing is left, return the original text rather than an empty string
    return cleaned if cleaned else response.strip()

def generate_with_claude(prompt: str, system_instruction: str) -> dict:
    """Send a writing prompt to Claude and return its reply plus the draft.

    The request uses the notebook-level CLAUDE_MODEL, MAX_TOKENS and
    TEMPERATURE settings, with `system_instruction` as the system prompt and
    `prompt` as the only user message.

    Returns dict with:
        - 'full_response': Text of the first content block of the reply
        - 'draft': Result of extract_draft() applied to that text
    """
    user_turn = {"role": "user", "content": prompt}

    # One-shot request: a single user message, no prior conversation
    reply = client.messages.create(
        messages=[user_turn],
        system=system_instruction,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        model=CLAUDE_MODEL,
    )

    # The reply text is read from the first content block
    reply_text = reply.content[0].text

    return {
        "full_response": reply_text,
        "draft": extract_draft(reply_text),
    }

In [7]:
def _parse_evaluation_json(response_text: str):
    """Decode the JSON payload from the evaluator's reply text.

    Tries, in order: a ```json fenced block, the whole text, and finally the
    span from the first "{" to the last "}" (covers JSON surrounded by prose).
    Raises json.JSONDecodeError when none of these decode.
    """
    fenced = re.search(r'```json\s*({.*?})\s*```', response_text, re.DOTALL)
    if fenced:
        return json.loads(fenced.group(1))

    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        start = response_text.find("{")
        end = response_text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(response_text[start:end + 1])


def evaluate_with_claude(draft: str, rubric: list, prompt: str) -> dict:
    """Score a draft against the rubric by asking Claude to act as evaluator.

    Args:
        draft: The draft text to evaluate.
        rubric: Rubric criteria; serialized to JSON and embedded in the request.
        prompt: The writing task the draft was produced for.

    Returns:
        On success: {"evaluation": <decoded JSON>, "score": <float or None>},
        where score is the decoded "overall_score" value if present.
        On a parsing failure: {"error": <message>, "raw_response": <reply
        text>, "score": None}.
    """
    rubric_json = json.dumps(rubric, ensure_ascii=False, indent=2)

    # Assemble the message line by line. An indented triple-quoted f-string
    # would send every line with four leading spaces (markdown code-block
    # indentation) and leave the multi-line rubric JSON unevenly indented.
    evaluation_prompt = (
        "**RUBRIC:**\n"
        "```json\n"
        f"{rubric_json}\n"
        "```\n"
        "\n"
        "**WRITING TASK:**\n"
        f"{prompt}\n"
        "\n"
        "**DRAFT TO EVALUATE:**\n"
        f"{draft}\n"
    )

    # Call Claude API
    response = client.messages.create(
        model=CLAUDE_MODEL,
        max_tokens=4096,
        system=RUBRIC_SCORING_PROMPT,
        messages=[
            {"role": "user", "content": evaluation_prompt}
        ]
    )

    # The reply text is read from the first content block
    response_text = response.content[0].text

    # Parsing is best-effort on purpose: any failure is reported and returned
    # together with the raw reply so a batch run is not aborted by one bad reply.
    try:
        evaluation_data = _parse_evaluation_json(response_text)

        # Extract overall score if present
        if "overall_score" in evaluation_data:
            overall_score = float(evaluation_data["overall_score"])
        else:
            overall_score = None

        return {"evaluation": evaluation_data, "score": overall_score}
    except Exception as e:
        print(f"Error parsing evaluation response: {e}")
        return {"error": str(e), "raw_response": response_text, "score": None}

In [8]:
# Position in `prompts` of the prompt used for the single-sample walkthrough
PROMPT_INDEX = 0

writing_prompt = prompts[PROMPT_INDEX]
print(f"Selected prompt #{PROMPT_INDEX}:")
print(writing_prompt)

# Blank line, separator rule, blank line
print()
print("=" * 80)
print()

Selected prompt #0:
Write a research paper introduction about a machine learning system that improves accessibility for screen reader users.




In [9]:
# Generate one response for the selected prompt; the system instruction carries the rubric
print("Generating with Claude 4.5 (co-writing assistant mode)...\n")
response = generate_with_claude(writing_prompt, system_instruction)

# Show the complete reply between two separator rules
print(
    "FULL RESPONSE (with analysis, questions, feedback):",
    "=" * 80,
    response["full_response"],
    "=" * 80,
    sep="\n",
)
print(f"\nFull response length: {len(response['full_response'])} characters")

# Show only the part that will be sent to the evaluator
print(
    "\n" + "=" * 80,
    "EXTRACTED DRAFT (for evaluation):",
    "=" * 80,
    response["draft"],
    "=" * 80,
    sep="\n",
)
print(f"\nDraft length: {len(response['draft'])} characters")

Generating with Claude 4.5 (co-writing assistant mode)...

FULL RESPONSE (with analysis, questions, feedback):
<analysis>
This prompt requires a research paper introduction about an ML system for screen reader accessibility. Key considerations:

**Audience:** Academic researchers in HCI, accessibility, and/or ML
**Purpose:** Establish the problem, motivate the solution, and preview the contribution
**Tone:** Formal, objective, scholarly (per rubric criteria #1)
**Structure needs:** Problem→solution arc with explicit logical connections (rubric #2)
**Content requirements:** Technical clarity about the ML system while remaining accessible (rubric #5)

**Key decisions I'm making:**
- Focusing on web content accessibility as a concrete domain
- Framing the problem around semantic understanding challenges for screen readers
- Proposing an ML system that generates structural metadata to improve navigation
- Maintaining formal academic tone throughout (avoiding subjective adjectives)
- Buildi

In [10]:
# Score the extracted draft only, not the full structured reply
print("\nEvaluating draft with Claude Sonnet 4.5...\n")
result = evaluate_with_claude(response["draft"], rubric, writing_prompt)

print("EVALUATION RESULTS:")
print("=" * 80)
if "evaluation" not in result:
    # No parsed evaluation: dump the whole result dict (error and raw reply)
    print(json.dumps(result, indent=2))
else:
    print(json.dumps(result["evaluation"], indent=2))
    if result["score"] is not None:
        print(
            f"\n{'=' * 80}",
            f"OVERALL SCORE: {result['score']:.2f}/100",
            "=" * 80,
            sep="\n",
        )
print("=" * 80)


Evaluating draft with Claude Sonnet 4.5...

EVALUATION RESULTS:
{
  "overall_score": 90.0,
  "score_interpretation": "Exceptional - This draft strongly aligns with the user's values and would likely require only minor revisions",
  "criteria_scores": [
    {
      "name": "Academic Tone & Register",
      "weight": 25,
      "achievement_level": "Exemplary",
      "level_percentage": 100,
      "weighted_score": 25.0,
      "evidence_summary": "Maintains consistently formal, objective language throughout with no subjective adjectives, metaphors, or emotional appeals; scholarly register never wavers."
    },
    {
      "name": "Logical Argument Structure",
      "weight": 25,
      "achievement_level": "Exemplary",
      "level_percentage": 100,
      "weighted_score": 25.0,
      "evidence_summary": "Problem\u2192solution arc is exceptionally clear with explicit transitions making connections visible; each paragraph builds transparently on previous points."
    },
    {
      "name":

In [13]:
# Evaluate multiple prompts
NUM_SAMPLES = 10  # Change this to evaluate more prompts

results = []

# Clamp once so the loop bound and the progress denominator always agree,
# even when NUM_SAMPLES exceeds the number of available prompts.
num_to_process = min(NUM_SAMPLES, len(prompts))

for i in range(num_to_process):
    print(f"\n{'='*80}")
    print(f"Processing prompt {i+1}/{num_to_process}")
    print(f"{'='*80}\n")

    writing_prompt = prompts[i]
    print(f"Prompt: {writing_prompt}\n")

    # Generate
    response = generate_with_claude(writing_prompt, system_instruction)
    print(f"Full response length: {len(response['full_response'])} characters")
    print(f"Draft length: {len(response['draft'])} characters\n")

    # Evaluate ONLY the draft section
    result = evaluate_with_claude(response["draft"], rubric, writing_prompt)
    score = result.get("score")

    # Keep every sample, including ones whose evaluation failed to parse
    # (those get an empty evaluation and a score of None).
    results.append({
        "prompt": writing_prompt,
        "full_response": response["full_response"],
        "draft": response["draft"],
        "evaluation": result.get("evaluation", {}),
        "score": score
    })

    # Compare against None: a legitimate score of 0.0 is falsy and must
    # still be printed as a number rather than "N/A".
    print(f"Score: {score:.2f}/100" if score is not None else "Score: N/A")

print(f"\n{'='*80}")
print("CLAUDE 4.5 BASELINE PERFORMANCE SUMMARY")
print(f"{'='*80}\n")

# Summarise only the samples that produced a numeric score
scores = [r["score"] for r in results if r["score"] is not None]
if scores:
    # Compute the mean once; it is reused for the average and the std dev
    mean_score = sum(scores) / len(scores)

    print(f"Number of samples: {len(scores)}")
    print(f"Average score: {mean_score:.2f}/100")
    print(f"Min score: {min(scores):.2f}/100")
    print(f"Max score: {max(scores):.2f}/100")
    if len(scores) > 1:
        # Population standard deviation (divides by N, not N - 1)
        std = (sum((x - mean_score) ** 2 for x in scores) / len(scores)) ** 0.5
        print(f"Std dev: {std:.2f}")
else:
    print("No scores available")


Processing prompt 1/10

Prompt: Write a research paper introduction about a machine learning system that improves accessibility for screen reader users.

Full response length: 6861 characters
Draft length: 4065 characters

Score: 90.00/100

Processing prompt 2/10

Prompt: Write a research paper introduction about a conversational AI interface that helps users manage their personal data privacy.

Full response length: 7665 characters
Draft length: 4687 characters

Score: 78.75/100

Processing prompt 3/10

Prompt: Write a research paper introduction about a visualization tool that makes algorithm decision-making more transparent.

Full response length: 7798 characters
Draft length: 4984 characters

Score: 92.50/100

Processing prompt 4/10

Prompt: Write a research paper introduction about an interactive system that supports collaborative writing for distributed teams.

Full response length: 6530 characters
Draft length: 3809 characters

Score: 78.75/100

Processing prompt 5/10

Prompt: 