# Process Life Narratives with Gemini Pro 2.5

This notebook processes text files containing life histories and scores them on:
- **A**: Enjoyment of consumption and leisure
- **B**: Making a difference and contributing

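Each result is appended to a CSV file as soon as a narrative is scored, so an interrupted run can be restarted and will skip the files that are already in the CSV.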

In [None]:
# Install required packages
# These are the third-party libraries imported further down: the Gemini client
# (google-generativeai), pandas, seaborn and matplotlib. `-q` keeps pip quiet.
!pip install -q google-generativeai pandas seaborn matplotlib

In [None]:
# Mount Google Drive
# The narrative directory, results CSV and error log configured in this
# notebook all live under /content/drive/MyDrive, so Drive must be mounted
# at /content/drive before any of them can be read or written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ============================================================
# CONFIGURATION - ADJUST PARAMETERS HERE
# ============================================================

# How many files to process. None means all files; an integer N keeps only the
# first N files of the sorted file list (e.g., 10, 50, 100).
SAMPLE_SIZE = None

# Directory containing the text files
STORIES_DIR = '/content/drive/MyDrive/all-narratives_cleanedtxtfiles'

# Output CSV file (starts with . to appear at top of directory).
# Derived from STORIES_DIR so the two cannot drift apart when the directory is
# changed; assign a different path here to store results elsewhere.
OUTPUT_CSV = f'{STORIES_DIR}/.results_narrative_scores.csv'

# Error log file (kept next to the results CSV, one line per failed file)
ERROR_LOG = f'{STORIES_DIR}/.error_log.txt'

# Rate limiting (requests per minute) - set to None for no limit
RATE_LIMIT = None

# Max concurrent requests
MAX_CONCURRENT = 10

# Gemini model to use ('gemini-pro' is an alternative depending on availability).
# NOTE(review): the notebook title mentions "Gemini Pro 2.5" but this default
# selects gemini-2.0-flash-exp - confirm which model is intended.
MODEL_NAME = 'gemini-2.0-flash-exp'

In [None]:
# Import libraries
import google.generativeai as genai
from google.colab import userdata
import asyncio
import pandas as pd
import os
import re
from pathlib import Path
from datetime import datetime
import time
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Optional, Dict, List
import json

# Configure Gemini API
# The key is fetched through google.colab's `userdata.get` under the name
# 'GOOGLE_API_KEY' and passed to `genai.configure` before any model is created.
# NOTE(review): presumably `userdata.get` fails when that secret is not set
# for this notebook - confirm, and add the secret before running.
api_key = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=api_key)

print("Setup complete!")

In [None]:
# Define the prompt template
# `{story_text}` is the only str.format placeholder; process_file() fills it
# with the full text of one narrative. The POINTS-A / POINTS-B / REASONING-A /
# REASONING-B labels requested here are exactly what parse_gemini_response()
# searches for, so the two must be kept in sync when either is edited.
PROMPT_TEMPLATE = """You are an expert historian. Read this life history. Consider two possible sources of satisfaction and meaning:

a. Enjoying consumption and leisure -- making the most of what life has to offer?
b. How much of a difference do you make, how much can you create and contribute (both materially and spiritually)?

You have 10 points to award, overall, max. If the person derives no satisfaction from a, give 0 points. If they get the same satisfaction from a+b, give 5 points each. No need to use up all points.

Explain your reasoning, by category (overall points, points for a, for b).

IMPORTANT: Format your response exactly as follows:
POINTS-A: [number]
POINTS-B: [number]
REASONING-A: [your explanation for category A]
REASONING-B: [your explanation for category B]

Life history:
{story_text}"""


def parse_gemini_response(response_text: str) -> Dict[str, object]:
    """Parse the Gemini response to extract points and reasoning.

    Args:
        response_text: Raw text returned by the model. Labels are matched
            case-insensitively.

    Returns:
        A dict with four keys:
        ``points_a`` / ``points_b`` - the score as a float, or ``None`` when no
        number follows the corresponding ``POINTS-x:`` label;
        ``reasoning_a`` / ``reasoning_b`` - the stripped explanation text, or
        ``''`` when the corresponding ``REASONING-x:`` label is absent.
    """

    def extract_points(label: str) -> Optional[float]:
        # Between the label and the number we skip whitespace, '[' and '*':
        # the prompt shows the format as "POINTS-A: [number]", so the model
        # may echo the brackets ("POINTS-A: [3]") or wrap the label in
        # Markdown bold ("**POINTS-A:** 3").
        # The number itself must be a well-formed decimal ("7", "7.5", "7.",
        # ".5") so that stray dots such as "..." can never reach float().
        found = re.search(
            rf'POINTS-{label}:[\s*\[]*(\d+(?:\.\d*)?|\.\d+)',
            response_text,
            re.IGNORECASE,
        )
        return float(found.group(1)) if found else None

    result = {
        'points_a': extract_points('A'),
        'points_b': extract_points('B'),
        'reasoning_a': '',
        'reasoning_b': ''
    }

    # Reasoning for A runs from its label up to the REASONING-B label, or to
    # the end of the text when B's label is missing.
    match_reasoning_a = re.search(
        r'REASONING-A:\s*(.+?)(?=REASONING-B:|$)',
        response_text,
        re.IGNORECASE | re.DOTALL,
    )
    if match_reasoning_a:
        result['reasoning_a'] = match_reasoning_a.group(1).strip()

    # Reasoning for B is everything after its label (DOTALL lets it span
    # several paragraphs).
    match_reasoning_b = re.search(
        r'REASONING-B:\s*(.+?)$',
        response_text,
        re.IGNORECASE | re.DOTALL,
    )
    if match_reasoning_b:
        result['reasoning_b'] = match_reasoning_b.group(1).strip()

    return result


def log_error(filename: str, error: str):
    """Append one "timestamp | filename | error" line to the ERROR_LOG file.

    The timestamp is the local time formatted as YYYY-MM-DD HH:MM:SS. The log
    is opened in append mode, so earlier entries are kept.
    """
    entry = f"{datetime.now():%Y-%m-%d %H:%M:%S} | {filename} | {error}\n"
    with open(ERROR_LOG, mode='a', encoding='utf-8') as log_file:
        log_file.write(entry)


def get_processed_files(csv_path: Optional[str] = None) -> set:
    """Return the set of filenames already recorded in the results CSV.

    Args:
        csv_path: CSV to inspect. Defaults to the module-level OUTPUT_CSV, so
            existing zero-argument calls behave as before.

    Returns:
        The values of the CSV's ``filename`` column as strings. An empty set
        is returned when the file does not exist or cannot be read (in which
        case the reason is printed).
    """
    path = OUTPUT_CSV if csv_path is None else csv_path
    if not os.path.exists(path):
        return set()

    try:
        # Only the filename column is needed; skipping the reasoning columns
        # avoids holding every explanation in memory. Forcing str keeps
        # numeric-looking names comparable with Path.name.
        names = pd.read_csv(
            path, usecols=['filename'], dtype={'filename': str}
        )['filename']
    except Exception as e:
        # Best effort: an unreadable CSV means "nothing processed yet".
        print(f"Could not read existing CSV: {e}")
        return set()

    return set(names.dropna())


def save_result(filename: str, points_a: float, points_b: float, reasoning_a: str, reasoning_b: str,
                output_csv: Optional[str] = None):
    """Append a single result row to the results CSV.

    Args:
        filename: Name of the narrative file that was scored.
        points_a: Score for category A.
        points_b: Score for category B.
        reasoning_a: Explanation for category A.
        reasoning_b: Explanation for category B.
        output_csv: CSV to append to. Defaults to the module-level OUTPUT_CSV,
            so existing five-argument calls behave as before.

    The header row is written when the target file is missing or empty, so a
    zero-byte file left behind by an interrupted run still ends up with column
    names.
    """
    path = OUTPUT_CSV if output_csv is None else output_csv

    row = pd.DataFrame([{
        'filename': filename,
        'points-a': points_a,
        'points-b': points_b,
        'reasoning-a': reasoning_a,
        'reasoning-b': reasoning_b
    }])

    # A header is needed only for the very first row in the file.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    row.to_csv(path, mode='a', header=needs_header, index=False)


async def process_file(model, filepath: Path, semaphore: asyncio.Semaphore) -> bool:
    """Score one narrative file with Gemini and append the result to the CSV.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose return value
            has a ``.text`` attribute (created in process_all_files()).
        filepath: Path of the UTF-8 text file to score.
        semaphore: Limits how many files are being worked on at once; it is
            held for the whole read / request / save sequence.

    Returns:
        True when the response was parsed and saved. False when anything
        failed; the error is then written to the error log and printed, and no
        exception propagates to the caller.
    """
    async with semaphore:
        filename = filepath.name

        try:
            # Read the story
            with open(filepath, 'r', encoding='utf-8') as f:
                story_text = f.read()

            # Create prompt
            prompt = PROMPT_TEMPLATE.format(story_text=story_text)

            # generate_content is a synchronous call, so it is run on the
            # loop's default executor to keep the event loop free for other
            # files. get_running_loop() is the supported way to obtain the
            # loop from inside a coroutine.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(None, model.generate_content, prompt)

            # Parse response
            response_text = response.text
            parsed = parse_gemini_response(response_text)

            # Both scores are required; a reply without them counts as a failure
            if parsed['points_a'] is None or parsed['points_b'] is None:
                raise ValueError(f"Could not extract points from response: {response_text[:200]}")

            # Save result immediately so progress survives an interruption
            save_result(
                filename=filename,
                points_a=parsed['points_a'],
                points_b=parsed['points_b'],
                reasoning_a=parsed['reasoning_a'],
                reasoning_b=parsed['reasoning_b']
            )

            print(f"✓ Processed: {filename} (A={parsed['points_a']}, B={parsed['points_b']})")
            return True

        except Exception as e:
            # Per-file boundary: record the failure and let the other files continue.
            error_msg = str(e)
            log_error(filename, error_msg)
            print(f"✗ Error processing {filename}: {error_msg}")
            return False


async def process_all_files():
    """Score every pending narrative in STORIES_DIR and print a summary.

    Steps:
      1. List the ``*.txt`` files in STORIES_DIR in sorted order, leaving out
         hidden (dot-prefixed) files and the error log.
      2. Keep only the first SAMPLE_SIZE files when SAMPLE_SIZE is set.
      3. Drop files whose names are already present in the results CSV.
      4. Run process_file() for the rest, at most MAX_CONCURRENT at a time
         and, when RATE_LIMIT is set, started no faster than RATE_LIMIT per
         minute on average.

    Returns:
        None. Progress and the final summary are printed.
    """
    # Initialize model
    model = genai.GenerativeModel(MODEL_NAME)

    # Path.glob('*.txt') also matches names that start with a dot, and the
    # error log is a dot-prefixed .txt file stored in this same directory.
    # Without this filter it would be submitted as a narrative on any run
    # that follows a logged error.
    stories_path = Path(STORIES_DIR)
    error_log_path = Path(ERROR_LOG)
    all_files = sorted(
        p for p in stories_path.glob('*.txt')
        if not p.name.startswith('.') and p != error_log_path
    )

    if not all_files:
        # Distinguish "nothing to read" from "everything already done".
        print(f"No .txt files found in {STORIES_DIR}")
        return

    # Apply sample size if specified
    if SAMPLE_SIZE is not None:
        all_files = all_files[:SAMPLE_SIZE]
        print(f"Processing {len(all_files)} files (sample)")
    else:
        print(f"Processing all {len(all_files)} files")

    # Skip files that already have a row in the results CSV
    processed_files = get_processed_files()
    files_to_process = [f for f in all_files if f.name not in processed_files]

    # Count only the selected files that were skipped, not every CSV row
    already_done = len(all_files) - len(files_to_process)
    if already_done > 0:
        print(f"Resuming: {already_done} already processed, {len(files_to_process)} remaining")

    if not files_to_process:
        print("All files already processed!")
        return

    # The semaphore caps concurrency; RATE_LIMIT (requests per minute) is
    # applied separately by spacing out when each file is released.
    semaphore = asyncio.Semaphore(MAX_CONCURRENT)
    launch_interval = 60.0 / RATE_LIMIT if RATE_LIMIT and RATE_LIMIT > 0 else 0.0

    async def _launch(position: int, filepath: Path) -> bool:
        # File number `position` is not released before position * interval
        # seconds have passed. This bounds the average start rate; short
        # bursts are still possible when several concurrency slots free up
        # at the same moment.
        if launch_interval:
            await asyncio.sleep(position * launch_interval)
        return await process_file(model, filepath, semaphore)

    # Process files
    start_time = time.time()
    tasks = [_launch(i, filepath) for i, filepath in enumerate(files_to_process)]
    results = await asyncio.gather(*tasks)

    # Summary (process_file returns True/False, so sum() counts successes)
    elapsed = time.time() - start_time
    success_count = sum(results)
    failure_count = len(files_to_process) - success_count
    print("\n" + "="*60)
    print("Processing complete!")
    print(f"Total files: {len(files_to_process)}")
    print(f"Successful: {success_count}")
    print(f"Failed: {failure_count}")
    print(f"Time elapsed: {elapsed:.2f} seconds")
    print(f"Results saved to: {OUTPUT_CSV}")
    if failure_count > 0:
        print(f"Errors logged to: {ERROR_LOG}")
    print("="*60)


# Shown once this cell has finished defining the prompt and helper functions.
print("Functions defined. Ready to process files.")

In [None]:
# Run the processing
# This relies on the notebook's support for `await` at the top level of a cell.
# NOTE(review): in a plain Python script this line would presumably need to be
# asyncio.run(process_all_files()) instead - confirm before reusing elsewhere.
await process_all_files()

## Visualizations

Plot the distribution of each score (Points-A, Points-B and their total) and a scatterplot of Points-A against Points-B.

In [None]:
# Read back every row written to the results CSV so far
df = pd.read_csv(OUTPUT_CSV)
score_columns = ['points-a', 'points-b']

print(f"Loaded {len(df)} results")
print("\nSummary statistics:")
print(df[score_columns].describe())

# Preview: the bare expression on the last line is rendered by the notebook
print("\nFirst 5 results:")
df.head()

In [None]:
# Seaborn look shared by all four panels
sns.set_style("whitegrid")
sns.set_palette("husl")


def _draw_score_histogram(ax, scores, title, xlabel, color):
    """Draw a 20-bin histogram with KDE for `scores` and mark its mean in red.

    The legend is left to the caller so extra reference lines can be added
    to the panel first.
    """
    sns.histplot(scores, bins=20, kde=True, ax=ax, color=color)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    mean_value = scores.mean()
    ax.axvline(mean_value, color='red', linestyle='--', label=f"Mean: {mean_value:.2f}")


# 2x2 grid: the two score histograms on top, scatter and total underneath
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Life Narrative Satisfaction Scores', fontsize=16, fontweight='bold')
ax_a, ax_b = axes[0, 0], axes[0, 1]
ax_scatter, ax_total = axes[1, 0], axes[1, 1]

# Top-left: Points-A (Consumption/Leisure)
_draw_score_histogram(
    ax_a, df['points-a'],
    title='Distribution of Points-A\n(Consumption & Leisure)',
    xlabel='Points-A',
    color='skyblue',
)
ax_a.legend()

# Top-right: Points-B (Contribution/Difference)
_draw_score_histogram(
    ax_b, df['points-b'],
    title='Distribution of Points-B\n(Contribution & Difference)',
    xlabel='Points-B',
    color='lightcoral',
)
ax_b.legend()

# Bottom-left: one dot per narrative, with the A == B diagonal for reference
sns.scatterplot(data=df, x='points-a', y='points-b', ax=ax_scatter, alpha=0.6, s=50)
ax_scatter.set_title('Points-A vs Points-B', fontweight='bold')
ax_scatter.set_xlabel('Points-A (Consumption & Leisure)')
ax_scatter.set_ylabel('Points-B (Contribution & Difference)')
ax_scatter.plot([0, 10], [0, 10], 'r--', alpha=0.3, label='Equal points line')
ax_scatter.legend()

# Bottom-right: A + B per narrative, with the 10-point budget marked
df['total_points'] = df['points-a'] + df['points-b']
_draw_score_histogram(
    ax_total, df['total_points'],
    title='Distribution of Total Points\n(A + B)',
    xlabel='Total Points',
    color='lightgreen',
)
ax_total.axvline(10, color='orange', linestyle='--', alpha=0.5, label='Max (10)')
ax_total.legend()

plt.tight_layout()
plt.show()

score_correlation = df['points-a'].corr(df['points-b'])
print(f"\nCorrelation between Points-A and Points-B: {score_correlation:.3f}")

In [None]:
# Additional analysis: for each category, print the three highest-scoring
# narratives together with the explanation given for that category
example_views = (
    ("Examples of high Points-A (Consumption/Leisure):", 'points-a', 'reasoning-a'),
    ("\nExamples of high Points-B (Contribution/Difference):", 'points-b', 'reasoning-b'),
)
for heading, score_column, reasoning_column in example_views:
    print(heading)
    top_rows = df.nlargest(3, score_column)
    print(top_rows[['filename', 'points-a', 'points-b', reasoning_column]])