# negotiating the past

We'll use Ollama and Llama 3 7B to run some tests.


We install the necessary libraries.

In [None]:
# Print the version of the `python` executable found on the shell PATH.
!python --version
# Install the notebook's dependencies with the %pip magic.
# NOTE(review): umap-learn, gensim and nltk are not imported in any cell visible
# in this notebook — confirm they are still needed.
%pip install umap-learn
%pip install gensim
%pip install nltk
%pip install pandas
%pip install anthropic
%pip install ipywidgets
# NOTE(review): tqdm, matplotlib and seaborn are imported by later cells but are
# not installed here — presumably they are expected to be present already.


## Creating a Historical Prompt Dataset with Claude

## Import libraries and setup

In [None]:
# Import libraries and setup
import anthropic
import numpy as np
import pandas as pd
import time
import logging
import os
from datetime import datetime
from tqdm.notebook import tqdm  # Note the use of tqdm.notebook for better display in Jupyter
from concurrent.futures import ThreadPoolExecutor
import json

# Setup logging: every record goes both to a timestamped file in the working
# directory and to the notebook output.
log_file_name = f"past_reference_detection_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
log_handlers = [logging.FileHandler(log_file_name), logging.StreamHandler()]
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=log_handlers,
)
logger = logging.getLogger("past_reference_detector")

# Create results directory (no error if it already exists)
os.makedirs("data/results_claude", exist_ok=True)

## Load API key

In [None]:
# Load API key securely from a separate JSON file (recommended)
def load_api_key():
    """Return the Claude API key, or None when the user still has to type it in.

    Lookup order:
      1. the 'CLAUDE_API_KEY' entry of ``api_config.json`` in the working directory,
      2. the ``CLAUDE_API_KEY`` environment variable,
      3. an interactive password widget (notebook environment only).

    A config file that is missing, is not valid JSON, or has no non-empty
    'CLAUDE_API_KEY' entry is skipped, so the environment variable is still tried.

    Returns:
        The key found in the file or the environment, or None when neither
        source provides one. In the None case a password widget is displayed
        and published as the module-level name ``password`` so that a later
        cell can read ``password.value``.
    """
    # Published at module level: the widget must outlive this call, otherwise
    # `password.value` is unreachable from the following cells.
    global password

    try:
        # Try to load from config file
        with open('api_config.json') as f:
            config = json.load(f)
    except FileNotFoundError:
        config = {}
    except json.JSONDecodeError as e:
        print(f"⚠️ api_config.json is not valid JSON ({e}); ignoring it")
        config = {}

    api_key = config.get('CLAUDE_API_KEY') if isinstance(config, dict) else None
    if api_key:
        return api_key

    # No usable key in the file: check environment variables
    api_key = os.environ.get("CLAUDE_API_KEY")
    if api_key:
        return api_key

    # If not found, prompt user (only in notebook environment)
    from IPython.display import display
    from ipywidgets import widgets

    password = widgets.Password(
        description='Claude API Key:',
        style={'description_width': 'initial'},
        layout={'width': '50%'}
    )
    display(password)

    # The key is typed after this function has returned, so there is nothing
    # to return yet; read it from password.value in a subsequent cell.
    return None

# Create a JSON file for the API key that you won't commit to GitHub
# (Run this cell ONCE, then delete or comment it out)
"""
import json
with open('api_config.json', 'w') as f:
    json.dump({'CLAUDE_API_KEY': 'your_api_key_here'}, f)
    
print("Created api_config.json - add this file to .gitignore!")

# Create .gitignore if it doesn't exist
if not os.path.exists('.gitignore'):
    with open('.gitignore', 'w') as f:
        f.write("api_config.json\n.ipynb_checkpoints/\n__pycache__/\n*.pyc\nresults/\nlogs/\n")
    print("Created .gitignore file")
"""

# Load the API key and, when it looks like a real key, build the Claude client
api_key = load_api_key()
key_is_usable = bool(api_key) and api_key.startswith("sk-")

if not key_is_usable:
    print("❌ API key not found or not valid format")
    # `client` is NOT created on this path.
    # If using the widget approach, you'll need to get the value in the next cell
    # client = anthropic.Anthropic(api_key=password.value)
else:
    print("✅ API key loaded successfully")
    # Initialize Claude client
    client = anthropic.Anthropic(api_key=api_key)

## Define your prompt

In [None]:
# System prompt sent with every classification request: analyze_prompt passes it
# as the `system=` argument of client.messages.create.
# NOTE(review): the text asks for individual analyses and a debate, yet also
# demands a bare 'yes'/'no', and analyze_prompt caps the reply at max_tokens=10 —
# so none of the requested deliberation can appear in the output.

system_prompt = """As a panel of three historians with different specializations (international history, global history, and European history), analyze the following prompt to determine if it contains an implicit or explicit reference to the past.

Each historian should consider:

1. EXPLICIT REFERENCES: Clear temporal markers (yesterday, last week, previously), historical events, periods, or figures, or mentions of things that happened in the past.

2. IMPLICIT REFERENCES: Subtle indications of past time frames, comparative language suggesting change over time, or references to completed actions or states that are no longer current.

3. CONTEXTUAL CLUES: Words implying memory, reflection, or nostalgia; verbs in past tense that indicate historical events rather than hypotheticals.

4. DOMAIN-SPECIFIC PERSPECTIVES:
   - International historian: Look for references to international relations, treaties, wars, or cross-border interactions that occurred in the past
   - Global historian: Consider references to world systems, long-term global trends, or cross-cultural historical developments
   - European historian: Note references to European historical periods, events, or figures

After your individual analyses, debate any disagreements and reach a consensus.

Your final answer must be ONLY 'yes' or 'no' - nothing else.
"""


## Define functions

In [None]:

def analyze_prompt(prompt, client, retries=3, backoff_factor=2, system=None):
    """Classify a single prompt with the Claude API.

    Args:
        prompt: Text of the user prompt to classify.
        client: Object exposing ``messages.create(...)`` (an
            ``anthropic.Anthropic`` client in this notebook).
        retries: Maximum number of API attempts.
        backoff_factor: Base of the exponential wait between attempts; after
            failed attempt k (0-based) the function sleeps
            ``backoff_factor ** k`` seconds.
        system: System prompt to send. Defaults to the module-level
            ``system_prompt``.

    Returns:
        "yes" or "no" according to the first whole-word yes/no in the reply,
        or "error" when the prompt is unusable, the reply contains neither
        word, or every attempt raised.
    """
    # A missing value (e.g. NaN coming from pandas) or blank text cannot be
    # classified; fail fast instead of spending every retry and its sleeps.
    if not isinstance(prompt, str) or not prompt.strip():
        logger.warning(f"Skipping unusable prompt: {prompt!r}")
        return "error"

    if system is None:
        system = system_prompt

    for attempt in range(retries):
        try:
            response = client.messages.create(
                model="claude-3-7-sonnet-20250219",
                max_tokens=10,
                temperature=0,
                system=system,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            # Extract the raw answer; an empty content list raises here and is
            # retried like any other API failure.
            classification = response.content[0].text.strip().lower()
        except Exception as e:
            if attempt < retries - 1:
                sleep_time = backoff_factor ** attempt
                logger.warning(f"Error on prompt: {e}. Retrying in {sleep_time}s...")
                time.sleep(sleep_time)
                continue
            logger.error(f"Failed after {retries} attempts: {e}")
            return "error"

        # Match whole words only: a plain substring test would read the "yes"
        # inside "yesterday" or "eyes" as a positive verdict.
        words = "".join(ch if ch.isalpha() else " " for ch in classification).split()
        for word in words:
            if word in ("yes", "no"):
                return word

        # Neither word present: do not silently count it as a negative.
        logger.warning(f"Unrecognised answer {classification!r}; recording as error")
        return "error"

    # Only reached when retries <= 0 (no attempt was made).
    return "error"

def process_batch(batch_prompts, client, batch_id):
    """Classify every prompt of one batch and persist the batch to CSV.

    Each prompt is sent through analyze_prompt; the resulting rows are written
    to data/results_claude/batch_<batch_id>_results.csv and also returned.

    Returns:
        list of {"prompt": ..., "references_past": ...} dicts, one per prompt,
        in input order.
    """
    progress = tqdm(batch_prompts, desc=f"Batch {batch_id}")
    results = [
        {"prompt": text, "references_past": analyze_prompt(text, client)}
        for text in progress
    ]

    # Save batch results
    output_path = f"data/results_claude/batch_{batch_id}_results.csv"
    pd.DataFrame(results).to_csv(output_path, index=False)

    logger.info(f"Completed batch {batch_id} with {len(results)} prompts")
    return results

## Load your dataset

In [None]:
# Load the dataset: draw a reproducible random sample of prompts

# Method 1: load a sample

# Target size of the sample
sample_size = 50000
# Fixed seed so that re-running the cell draws the same sample; the resume
# logic further down identifies batches by index and therefore needs
# prompts_list to be identical between runs.
sample_seed = 42

# Determine the total number of rows in the dataset.
# This counts physical lines, so prompts containing embedded newlines make it
# an over-estimate; the sample size is approximate in any case because each
# row is kept independently.
with open('data/prompts.csv') as f:
    total_rows = sum(1 for _ in f) - 1  # -1 for header

# Probability of skipping each data row (0 when the file is smaller than the
# requested sample or has no data rows)
skip_prob = max(0.0, 1 - sample_size / total_rows) if total_rows > 0 else 0.0

# skiprows accepts a callable evaluated on each row index: row 0 (the header)
# is always kept, every other row is skipped with probability skip_prob
rng = np.random.default_rng(sample_seed)
df = pd.read_csv('data/prompts.csv',
                 usecols=[0],  # First column only
                 skiprows=lambda x: x > 0 and rng.random() < skip_prob)

# To load the whole file instead -- careful, about 10 million rows:
# df = pd.read_csv('data/prompts.csv', usecols=[0])

# Method 2: your dataset is small, you do not need a sample
# df = pd.read_csv("data/prompts.csv")

# List of prompt strings consumed by the processing cells below
# (missing values are dropped so they are never sent to the API)
prompts_list = df.iloc[:, 0].dropna().astype(str).tolist()

# Check what was loaded
print(f"Loaded {len(prompts_list)} prompts (DataFrame shape: {df.shape})")
df.head(10)



## Process a small test batch

In [None]:
# Process a small test batch before committing to the full run
test_batch = prompts_list[:5]  # Just 5 prompts for testing
# Note: this writes data/results_claude/batch_test_results.csv
test_results = process_batch(test_batch, client, "test")

# Display test results
test_df = pd.DataFrame(test_results)
display(test_df)

# Check if everything is working correctly.
# process_batch always returns one row per prompt, so comparing lengths can
# never fail; the meaningful signal is whether any prompt came back as "error".
failed_count = sum(1 for row in test_results if row["references_past"] == "error")
if test_results and failed_count == 0:
    print("✅ Test batch processed successfully!")
else:
    print("❌ Some issues with the test batch")
    print(f"   {failed_count} of {len(test_results)} prompts returned 'error'")

## Configure and start main processing with optimized parameters

In [None]:
# Configure and start main processing with optimized parameters
# Derive worker count and batch size from the Claude API limits assumed below
# Rate limits:
# - 50 requests per minute
# - 20,000 input tokens per minute
# - 8,000 output tokens per minute
# Pricing:
# - Input: $3 per million tokens
# - Output: $15 per million tokens

# Estimated token budget of one request (system prompt + user prompt + answer)
system_prompt_tokens = 350  # Our complex historian prompt is ~350 tokens
avg_prompt_tokens = 50      # Average user prompt length in tokens
output_tokens = 5           # Just "yes" or "no" plus a tiny bit of overhead

input_tokens_per_request = system_prompt_tokens + avg_prompt_tokens
tokens_per_request = input_tokens_per_request + output_tokens

# Requests per minute allowed by each limit taken on its own
requests_per_min_rate_limit = 50  # Hard limit
requests_per_min_input_tokens = 20000 / input_tokens_per_request
requests_per_min_output_tokens = 8000 / output_tokens

# The tightest of the three limits caps the throughput
max_requests_per_min = min(
    requests_per_min_rate_limit,
    requests_per_min_input_tokens,
    requests_per_min_output_tokens,
)

print("\n".join([
    "Limiting factors for requests per minute:",
    f"- Rate limit: {requests_per_min_rate_limit:.1f} requests/min",
    f"- Input token limit: {requests_per_min_input_tokens:.1f} requests/min",
    f"- Output token limit: {requests_per_min_output_tokens:.1f} requests/min",
    f"→ Maximum throughput: {max_requests_per_min:.1f} requests/min",
]))

# Worker count: 90% of the per-minute request budget, at least 1.
# NOTE(review): this turns a requests-per-minute figure into a number of
# concurrent threads. Nothing in the notebook throttles the threads, so that
# many parallel workers can issue far more requests per minute than the limit
# above — consider a rate limiter or a much smaller worker count.
worker_estimate = int(0.9 * max_requests_per_min)
max_workers = worker_estimate if worker_estimate >= 1 else 1

# Batch size: two prompts per worker, clamped to the range 5..100.
# Smaller batches parallelise better; larger ones mean fewer result files.
batch_size = min(100, 2 * max_workers)
if batch_size < 5:
    batch_size = 5

# Split dataset into consecutive batches of batch_size prompts
batch_starts = range(0, len(prompts_list), batch_size)
batches = [prompts_list[start:start + batch_size] for start in batch_starts]

print("\n".join([
    "\nOptimized parameters:",
    f"- max_workers: {max_workers} (parallel requests)",
    f"- batch_size: {batch_size} prompts per batch",
    f"- Total batches: {len(batches)}",
]))

# Check for existing results to resume.
# A batch counts as done when data/results_claude contains
# batch_<integer>_results.csv. This is only valid if prompts_list and
# batch_size are the same as in the run that produced those files.
existing_batches = set()
for filename in os.listdir("data/results_claude"):
    if not (filename.startswith("batch_") and filename.endswith("_results.csv")):
        continue
    try:
        existing_batches.add(int(filename.split("_")[1]))
    except ValueError:
        # Non-numeric id such as batch_test_results.csv: not a numbered batch.
        # Only ValueError is caught so that real problems (and Ctrl-C) surface.
        continue

# Get batches to process
remaining_batches = [(i, batch) for i, batch in enumerate(batches) if i not in existing_batches]
print(f"\nResume status: {len(batches)-len(remaining_batches)} batches already completed, {len(remaining_batches)} remaining")

# Cost and duration estimate for the whole prompt list
# (prices in USD per million tokens, as listed at the top of this cell)
usd_per_million_input = 3
usd_per_million_output = 15

total_prompts = len(prompts_list)
total_input_tokens = total_prompts * (system_prompt_tokens + avg_prompt_tokens)
total_output_tokens = total_prompts * output_tokens

input_cost = (total_input_tokens / 1_000_000) * usd_per_million_input
output_cost = (total_output_tokens / 1_000_000) * usd_per_million_output
total_cost = input_cost + output_cost

# Time estimate assumes the maximum request rate is sustained throughout
processing_time_minutes = total_prompts / max_requests_per_min
processing_time_hours = processing_time_minutes / 60

report_lines = [
    "\nEstimated costs:",
    f"- Input tokens: {total_input_tokens:,} tokens (${input_cost:.2f})",
    f"- Output tokens: {total_output_tokens:,} tokens (${output_cost:.2f})",
    f"- Total estimated cost: ${total_cost:.2f}",
    f"\nEstimated processing time: {processing_time_minutes:.1f} minutes ({processing_time_hours:.2f} hours)",
    f"Processing {len(remaining_batches)} batches with {batch_size} prompts each using {max_workers} workers",
]
for line in report_lines:
    print(line)

# Ask for confirmation before proceeding with the full run
from IPython.display import display
from ipywidgets import widgets

# Start / cancel buttons
confirm = widgets.Button(
    tooltip='Click to start processing all batches',
    button_style='success',
    description='Start Processing',
)
cancel = widgets.Button(
    tooltip='Cancel processing',
    button_style='danger',
    description='Cancel',
)

# Sliders to adjust the computed parameters
worker_slider = widgets.IntSlider(
    description='Workers:',
    value=max_workers,
    min=1,
    max=min(50, max_workers*2),
    style={'description_width': 'initial'},
)
batch_slider = widgets.IntSlider(
    description='Batch size:',
    value=batch_size,
    min=5,
    max=200,
    style={'description_width': 'initial'},
)

# Area that receives the messages printed by the button callbacks
output = widgets.Output()

button_row = widgets.HBox([confirm, cancel])
controls = widgets.VBox([
    widgets.HTML("<b>Adjust processing parameters (if needed):</b>"),
    worker_slider,
    batch_slider,
    button_row,
])
display(controls, output)


def on_confirm(b):
    """Print the chosen settings; this callback does not start any processing itself."""
    with output:
        print(f"Starting processing with {worker_slider.value} workers and batch size of {batch_slider.value}...")
        # The actual processing code will be in the next cell, and will use these values


def on_cancel(b):
    """Print a cancellation notice; nothing else is changed."""
    with output:
        print("Processing cancelled.")


confirm.on_click(on_confirm)
cancel.on_click(on_cancel)

# Store the slider widgets (not their current values) for the next cell;
# read the chosen numbers through .value
processing_config = {
    'max_workers': worker_slider,
    'batch_size': batch_slider
}

## Execute processing (run after confirmation)

In [None]:
# Execute processing (run after confirmation)
# Process batches with multiple threads
all_results = []

# Use the worker count chosen with the slider in the configuration cell.
# The batch-size slider is not applied here: `batches` was already split with
# batch_size, and re-splitting would invalidate the index-based resume files.
n_workers = max(1, int(processing_config['max_workers'].value))

# Use tqdm to track overall progress
progress_bar = tqdm(total=len(remaining_batches), desc="Overall progress")

def update_progress(future):
    """Advance the overall bar when a batch finishes, whether it succeeded or failed."""
    # Deliberately does not call future.result(): that would re-raise a failed
    # batch's exception inside the callback; failures are handled in the loop below.
    progress_bar.update(1)

with ThreadPoolExecutor(max_workers=n_workers) as executor:
    futures = []
    batch_id_of = {}
    for i, batch in remaining_batches:
        future = executor.submit(process_batch, batch, client, i)
        future.add_done_callback(update_progress)
        futures.append(future)
        batch_id_of[future] = i

    # Collect results in submission order
    for future in futures:
        try:
            batch_results = future.result()
            all_results.extend(batch_results)
        except Exception as e:
            logger.error(f"Error processing batch {batch_id_of[future]}: {e}")

progress_bar.close()

## Combine results and generate report

In [None]:
# Combine results and generate report
# Batch files are written by process_batch into this directory
results_dir = "data/results_claude"

# Collect the numbered batch files only. batch_test_results.csv (written by
# the test cell) is skipped: its prompts are the first prompts of the dataset
# and would otherwise be counted twice.
numbered_files = []
for filename in os.listdir(results_dir):
    if filename.startswith("batch_") and filename.endswith("_results.csv"):
        try:
            numbered_files.append((int(filename.split("_")[1]), filename))
        except ValueError:
            continue

# Read all batch results in batch order so the combined file is deterministic
all_batches = []
for _, filename in sorted(numbered_files):
    try:
        batch_df = pd.read_csv(os.path.join(results_dir, filename))
        all_batches.append(batch_df)
    except Exception as e:
        logger.error(f"Error reading {filename}: {e}")

# Combine all results
if all_batches:
    final_df = pd.concat(all_batches, ignore_index=True)
    final_df.to_csv(os.path.join(results_dir, "past_references_complete_results.csv"), index=False)
    print(f"Combined results from {len(all_batches)} batches, total {len(final_df)} prompts")

    # Generate summary statistics
    total_count = len(final_df)
    past_references_count = int((final_df['references_past'] == 'yes').sum())
    error_count = int((final_df['references_past'] == 'error').sum())

    def _percent(count):
        """Share of total_count in percent; 0.0 when there are no rows."""
        return count / total_count * 100 if total_count else 0.0

    print("\n--- SUMMARY STATISTICS ---")
    print(f"Total prompts processed: {total_count}")
    print(f"Prompts with past references: {past_references_count} ({_percent(past_references_count):.2f}%)")
    print(f"Prompts with errors: {error_count} ({_percent(error_count):.2f}%)")

    # Display a sample of prompts with past references
    print("\n--- SAMPLE PROMPTS WITH PAST REFERENCES ---")
    past_refs = final_df[final_df['references_past'] == 'yes'].sample(min(5, past_references_count))
    display(past_refs)
else:
    print("No batch results found")

## Visualization and analysis

In [None]:
# Visualization and analysis
import matplotlib.pyplot as plt
import seaborn as sns

# Set a nicer style
sns.set(style="whitegrid")

# Pie chart: share of each label in the references_past column
counts = final_df['references_past'].value_counts()
pie_fig, pie_ax = plt.subplots(figsize=(10, 6))
pie_ax.pie(
    counts,
    labels=counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette("Set2"),
)
pie_ax.axis('equal')  # keep the pie circular
pie_ax.set_title('Distribution of Prompts with Past References')
plt.show()

# Optionally, if your dataset has other columns (e.g., prompt length, category, etc.)
# You can do more analysis to look for patterns

# Add a prompt length column (number of characters); this modifies final_df in place
final_df['prompt_length'] = final_df['prompt'].str.len()

# Stacked histogram of prompt length, split by label
hist_fig, hist_ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    data=final_df,
    x='prompt_length',
    hue='references_past',
    multiple='stack',
    bins=30,
    ax=hist_ax,
)
hist_ax.set_title('Prompt Length Distribution by Reference Type')
hist_ax.set_xlabel('Prompt Length (characters)')
hist_ax.set_ylabel('Count')
plt.show()

# Any additional analysis you might want to perform