In [None]:
# Environment Setup (Run this first on Colab/Binder)
import sys
import os

# Check if we're in Colab
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("🔧 Setting up Google Colab environment...")
    # Clone the repository if not already present
    if not os.path.exists('mlschool-text'):
        !git clone https://github.com/jobschepens/mlschool-text.git
        os.chdir('mlschool-text')
    else:
        os.chdir('mlschool-text')
    
    # Install requirements
    !pip install -q -r requirements_colab.txt
    print("✅ Colab setup complete!")

elif 'BINDER_LAUNCH_HOST' in os.environ:
    print("🔧 Binder environment detected - dependencies should already be installed")
    print("✅ Binder setup complete!")

elif 'CODESPACES' in os.environ:
    print("🚀 GitHub Codespaces environment detected")
    print("Dependencies should be installed automatically via devcontainer.json")
    print("✅ Codespaces setup complete!")

else:
    print("💻 Local environment detected")
    print("Make sure you've run: pip install -r requirements.txt")

# Set working directory for consistent paths
if os.path.exists('mlschool-text') and not os.getcwd().endswith('mlschool-text'):
    os.chdir('mlschool-text')
elif os.getcwd().endswith('notebooks'):
    # If we're in the notebooks directory, go up one level
    os.chdir('..')

print(f"📁 Working directory: {os.getcwd()}")

# Verify key files are accessible
key_files = ['models.json', 'data/', 'scripts/', 'output/']
missing_files = []
for file_path in key_files:
    if not os.path.exists(file_path):
        missing_files.append(file_path)

if missing_files:
    print(f"⚠️ Warning: Cannot find {missing_files}")
    print("💡 Make sure you're in the correct directory")
else:
    print("✅ All key project files accessible")

print("🎯 Ready to start! You can now run the rest of the notebook.")

# Part 1b: Data Integration and Predictor Engineering

## From Raw Corpus to Powerful Predictors

**Learning Objectives:**
- **Process Raw Text**: Take a large, unstructured text corpus and turn it into a clean, tokenized list of words.
- **Calculate Frequency**: Compute raw word frequency counts from the tokenized text.
- **Integrate External Data**: Merge the LLM-derived frequencies with established psycholinguistic datasets (ECP, SUBTLEX) to create a rich, comparative dataset.
- **Apply Transformations**: Convert raw frequencies into meaningful, psycholinguistically-validated scales (Schepens, Zipf).
- **Export for Analysis**: Save the final, merged data into a single file, ready for statistical analysis in Notebook 2.

---

💡 **Research Context:** A raw text file isn't useful for statistical modeling. We need to "engineer" predictors from it. This notebook automates the critical pipeline from text to data. We will calculate our own `llm_frequency` and then place it alongside well-known, human-validated measures. This allows us to directly compare our LLM-based predictor with the "gold standards" in the field.

# Prepare Predictors from Corpus

This notebook processes the large corpus text file to extract word frequencies and prepare predictor data for analysis. The output will be used by `notebook2_corpus_analysis.ipynb` to compare different frequency measures.

## 1. Setup and Configuration

First, we'll import the necessary libraries and define the file paths for our input (the raw text corpus) and our output (the final CSV file with all predictors).

In [50]:
# Environment Setup
import pandas as pd
import numpy as np
import os
import sys

# Add parent directory to Python path for imports
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath('.')))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Import path utilities for cross-platform compatibility
try:
    from path_utils import get_project_path, get_output_path
    print("✅ Path utilities loaded successfully")
except ImportError:
    print("⚠️ Path utilities not found, using fallback functions")
    
    def get_project_path(relative_path):
        """Fallback path utility function"""
        if os.path.exists(relative_path):
            return relative_path
        parent_path = os.path.join('..', relative_path)
        if os.path.exists(parent_path):
            return parent_path
        return relative_path
    
    def get_output_path(filename):
        """Fallback output path utility function"""
        output_dir = get_project_path('output')
        os.makedirs(output_dir, exist_ok=True)
        return os.path.join(output_dir, filename)

# File paths using path utilities
corpus_file_path = get_output_path('large_corpus.txt')
output_csv_path = get_output_path('generated_corpus_with_predictors.csv')

print("Starting corpus processing...")
print(f"Corpus file: {corpus_file_path}")
print(f"Output file: {output_csv_path}")

⚠️ Path utilities not found, using fallback functions
Starting corpus processing...
Corpus file: ..\output\large_corpus.txt
Output file: ..\output\generated_corpus_with_predictors.csv


## 2. Corpus Selection

Choose which corpus file you want to process for your analysis. The notebook will:

1. **Detect Available Corpus Files**: Automatically find all `.txt` corpus files in the output directory
2. **Let You Choose**: Select which specific corpus file to process
3. **Generate Standard Variables**: Create the standard `llm_frequency_raw` and transformation variables

This simplified approach creates clean, focused analysis with a single chosen corpus for easier interpretation and comparison.

In [51]:
# Import additional libraries for corpus processing
import re
from collections import Counter
import glob

# Discover available corpus files
output_dir = get_output_path('')  # Get the output directory path
corpus_pattern = os.path.join(output_dir, "*.txt")
available_corpus_files = glob.glob(corpus_pattern)

# Filter to only include actual corpus files (exclude metadata files, etc.)
corpus_files = []
for file_path in available_corpus_files:
    filename = os.path.basename(file_path)
    # Include files that look like corpus files
    if (filename.startswith('large_corpus') or 
        'corpus' in filename.lower() and not filename.endswith('_metadata.txt')):
        corpus_files.append(file_path)

print("🔍 Available corpus files in output directory:")
for i, file_path in enumerate(corpus_files, 1):
    filename = os.path.basename(file_path)
    try:
        # Get file size for reference
        size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f"   {i}. {filename} ({size_mb:.1f} MB)")
    except:
        print(f"   {i}. {filename}")

if not corpus_files:
    print("❌ No corpus files found in the output directory.")
    print("Please ensure you have generated corpus files first using the generation scripts.")
    selected_file = None
else:
    print(f"\n📊 Found {len(corpus_files)} corpus file(s)")
    
    # Simple selection: choose which file to process
    choice = "1"  # Default choice
    
    try:
        # Try to get user input
        import sys
        if hasattr(sys, 'ps1') and len(corpus_files) > 1:  # Interactive session with multiple files
            choice = input(f"\nChoose corpus file to process (1-{len(corpus_files)}) [default=1]: ").strip()
        else:
            print(f"\n⚡ Auto-selecting first file for processing")
            choice = "1"
    except (EOFError, KeyboardInterrupt):
        print(f"\n⚡ No input received, using first file")
        choice = "1"
    except Exception as e:
        print(f"\n⚡ Input error ({e}), using first file")
        choice = "1"
    
    # Fallback to "1" if choice is empty
    if not choice:
        choice = "1"
        print("⚡ Empty input, using first file")
    
    try:
        file_index = int(choice) - 1
        if 0 <= file_index < len(corpus_files):
            selected_file = corpus_files[file_index]
        else:
            print(f"❌ Invalid selection. Using first file as fallback.")
            selected_file = corpus_files[0]
    except ValueError:
        print(f"❌ Invalid input. Using first file as fallback.")
        selected_file = corpus_files[0]

# Display final selection
if selected_file:
    print(f"\n🎯 Selected corpus file: {os.path.basename(selected_file)}")
else:
    print("\n❌ No corpus file selected for processing")

🔍 Available corpus files in output directory:
   1. large_corpus_2m_llama_20250908_002336.txt (3.9 MB)
   2. large_corpus_2m_qwen_dynamic_20250908_002429.txt (15.1 MB)
   3. large_corpus_2m_qwen_seeds.txt (16.0 MB)
   4. large_corpus_gpt-oss-20b_dynamic_20250908_002239.txt (0.8 MB)

📊 Found 4 corpus file(s)

🎯 Selected corpus file: large_corpus_2m_qwen_dynamic_20250908_002429.txt

🎯 Selected corpus file: large_corpus_2m_qwen_dynamic_20250908_002429.txt


## 3. Text Processing and Tokenization

Process the selected corpus file to extract word frequencies:

**Key Steps:**
1. **Load and Clean**: Read the corpus file and remove any metadata comments
2. **Tokenize**: Extract words using regex pattern matching
3. **Calculate Frequencies**: Count word frequencies for the corpus

**Generated Variables:**
- `llm_frequency_raw`: Raw frequency count from the selected corpus
- Word length and other derived variables for analysis

In [52]:
# Single Corpus Processing
print("🔄 Processing selected corpus file...")

if not selected_file:
    print("❌ No corpus file selected for processing")
    df_words = pd.DataFrame()
else:
    try:
        filename = os.path.basename(selected_file)
        print(f"\n📖 Processing: {filename}")
        
        # Load corpus text
        with open(selected_file, 'r', encoding='utf-8') as file:
            corpus_text = file.read()
        
        print(f"   ✅ Loaded {len(corpus_text):,} characters")
        
        # Clean text (remove metadata comments)
        cleaned_text = re.sub(r'<!-- Story Metadata:.*?-->', '', corpus_text, flags=re.DOTALL)
        
        # Tokenize (extract words)
        words = re.findall(r"\b[a-z]+(?:'[a-z]+)?\b", cleaned_text.lower())
        
        # Calculate word frequencies
        word_counts = Counter(words)
        total_words = len(words)
        unique_words = len(word_counts)
        
        print(f"   📊 Total tokens: {total_words:,}")
        print(f"   📚 Unique words: {unique_words:,}")
        print(f"   🏆 Top 5 words: {word_counts.most_common(5)}")
        
        # Create DataFrame with standard column names
        vocabulary = sorted(word_counts.keys())
        df_words = pd.DataFrame({'word': vocabulary})
        df_words['word_length'] = df_words['word'].apply(len)
        df_words['llm_frequency_raw'] = df_words['word'].map(word_counts)
        
        # Sort by frequency
        df_words = df_words.sort_values('llm_frequency_raw', ascending=False).reset_index(drop=True)
        
        print(f"\n✅ DataFrame created:")
        print(f"   📚 Vocabulary size: {len(df_words):,} words")
        print(f"   📊 Frequency range: {df_words['llm_frequency_raw'].min()}-{df_words['llm_frequency_raw'].max()}")
        
        # Display preview
        print(f"\n📋 Preview of frequency data:")
        print(df_words[['word', 'word_length', 'llm_frequency_raw']].head(10))
        
    except Exception as e:
        print(f"   ❌ Error processing {filename}: {e}")
        df_words = pd.DataFrame()

🔄 Processing selected corpus file...

📖 Processing: large_corpus_2m_qwen_dynamic_20250908_002429.txt
   ✅ Loaded 15,517,265 characters
   📊 Total tokens: 2,105,954
   📚 Unique words: 27,939
   🏆 Top 5 words: [('the', 108943), ('a', 79787), ('of', 53158), ('and', 49441), ('it', 40292)]

✅ DataFrame created:
   📚 Vocabulary size: 27,939 words
   📊 Frequency range: 1-108943

📋 Preview of frequency data:
   word  word_length  llm_frequency_raw
0   the            3             108943
1     a            1              79787
2    of            2              53158
3   and            3              49441
4    it            2              40292
5    to            2              35514
6    in            2              35280
7     s            1              31289
8  that            4              25366
9     i            1              23607
   📊 Total tokens: 2,105,954
   📚 Unique words: 27,939
   🏆 Top 5 words: [('the', 108943), ('a', 79787), ('of', 53158), ('and', 49441), ('it', 40292)]

✅ Da

## 4. Display Corpus Statistics

Let's examine the characteristics of our corpus dataset to understand the vocabulary and frequency distribution.

In [53]:
# Display Corpus Statistics
print("📈 Corpus Dataset Summary")
print("=" * 40)

if df_words.empty:
    print("❌ No data available for analysis.")
else:
    # Overall statistics
    print(f"📚 Total unique words: {len(df_words):,}")
    print(f"📏 Average word length: {df_words['word_length'].mean():.1f} characters")
    print(f"📊 Word length range: {df_words['word_length'].min()} - {df_words['word_length'].max()} characters")
    print(f"📊 Total tokens in corpus: {df_words['llm_frequency_raw'].sum():,}")
    print(f"📊 Frequency range: {df_words['llm_frequency_raw'].min()} - {df_words['llm_frequency_raw'].max()}")
    
    # Frequency distribution
    print(f"\n📊 Frequency Distribution:")
    freq_stats = df_words['llm_frequency_raw'].describe()
    print(f"   Median frequency: {freq_stats['50%']:.0f}")
    print(f"   75th percentile: {freq_stats['75%']:.0f}")
    print(f"   95th percentile: {df_words['llm_frequency_raw'].quantile(0.95):.0f}")
    
    # High frequency words
    print(f"\n🏆 Top 10 most frequent words:")
    top_words = df_words.head(10)[['word', 'llm_frequency_raw']]
    for _, row in top_words.iterrows():
        print(f"   {row['word']}: {row['llm_frequency_raw']:,}")
    
    print(f"\n✅ Corpus dataset ready for integration!")

📈 Corpus Dataset Summary
📚 Total unique words: 27,939
📏 Average word length: 7.7 characters
📊 Word length range: 1 - 23 characters
📊 Total tokens in corpus: 2,105,954
📊 Frequency range: 1 - 108943

📊 Frequency Distribution:
   Median frequency: 4
   75th percentile: 20
   95th percentile: 181

🏆 Top 10 most frequent words:
   the: 108,943
   a: 79,787
   of: 53,158
   and: 49,441
   it: 40,292
   to: 35,514
   in: 35,280
   s: 31,289
   that: 25,366
   i: 23,607

✅ Corpus dataset ready for integration!


## 5. Data Integration with External Datasets

Merge our LLM-derived frequencies with established psycholinguistic datasets:

1. **LLM frequency**: Our corpus-derived frequency measures
2. **Reference measures**: ECP, SUBTLEX, Multilex, GPT familiarity ratings  
3. **Behavioral data**: Human reading times for validation

This creates a comprehensive dataset for comparing LLM-derived predictors against established measures.

In [54]:
# Data Integration with Reference Datasets
print("🔗 Integrating corpus data with reference datasets...")

try:
    # Load ECP data using path utilities
    ecp_path = get_project_path('data/lexicaldecision/ecp/English Crowdsourcing Project All Native Speakers.csv')
    ecp_df = pd.read_csv(ecp_path)
    print(f"✅ Loaded {len(ecp_df)} records from ECP dataset")
    
    # Load SUBTLEX-US data to get raw frequency counts
    print("Loading SUBTLEX-US data for raw frequency counts...")
    subtlex_us_path = get_project_path('data/frequency/subtlex-us/SUBTLEXus74286wordstextversion.txt')
    subtlex_df = pd.read_csv(subtlex_us_path, sep='\t')
    print(f"✅ Loaded {len(subtlex_df)} records from SUBTLEX-US dataset")
    
    # Rename columns for consistency
    subtlex_df = subtlex_df.rename(columns={
        'Word': 'word',
        'FREQcount': 'subtlex_freq_raw'
    })
    
    # Merge ECP data with SUBTLEX-US data to get raw frequency counts
    word_col = 'spelling' if 'spelling' in ecp_df.columns else 'Word'
    ecp_with_subtlex = pd.merge(ecp_df, subtlex_df[['word', 'subtlex_freq_raw']], 
                               left_on=word_col, 
                               right_on='word', how='left')
    
    # Define reference columns to merge (using actual ECP column names!)
    ref_cols = ['SUBTLEX', 'subtlex_freq_raw', 'Multilex', 'GPT', 'rt_correct_mean', 'accuracy', 'prevalence']
    cols_to_merge = [word_col] + [col for col in ref_cols if col in ecp_with_subtlex.columns]
    
    # Check what reading time column is available
    rt_cols = [col for col in ecp_with_subtlex.columns if 'rt' in col.lower() or 'reaction' in col.lower()]
    if rt_cols:
        print(f"✅ Found reading time column(s): {rt_cols}")
        cols_to_merge.extend([col for col in rt_cols if col not in cols_to_merge])
    else:
        print("⚠️ No reading time column found in ECP data")
        print(f"   Available ECP columns: {list(ecp_with_subtlex.columns)[:10]}...")
    
    # Merge corpus data with reference data
    merged_df = pd.merge(df_words, ecp_with_subtlex[cols_to_merge], 
                        left_on='word', right_on=word_col, how='left')
    
    # Rename reference columns for clarity (using actual ECP column names)
    column_renames = {
        'SUBTLEX': 'subtlex_zipf',
        'subtlex_freq_raw': 'subtlex_freq_raw',
        'Multilex': 'multilex_zipf',
        'GPT': 'gpt_familiarity',
        'rt_correct_mean': 'rt',  # Rename ECP reading time to standard 'rt'
        'spelling': 'word'  # Ensure word column is consistently named
    }
    
    # Apply column renaming
    merged_df = merged_df.rename(columns=column_renames)
    print(f"✅ Reading time column 'rt_correct_mean' renamed to 'rt'")
    
    # Remove duplicate word column if present
    if word_col != 'word' and word_col in merged_df.columns:
        merged_df = merged_df.drop(columns=[word_col])
        
    print("✅ Successfully merged corpus data with reference measures")
    
    # Show integration statistics
    ref_columns = ['subtlex_zipf', 'subtlex_freq_raw', 'multilex_zipf', 'gpt_familiarity', 'rt', 'accuracy', 'prevalence']
    
    print(f"\n📊 Integration Statistics:")
    print(f"   📚 Reference measures available:")
    for col in ref_columns:
        if col in merged_df.columns:
            coverage = merged_df[col].notna().sum()
            print(f"      • {col}: {coverage:,} words ({coverage/len(merged_df)*100:.1f}%)")
        else:
            print(f"      • {col}: NOT AVAILABLE")
            
    # Special check for reading time data
    if 'rt' not in merged_df.columns:
        print(f"\n⚠️ CRITICAL: No reading time (rt) column found!")
        print(f"   This will cause issues in notebook2_corpus_analysis.ipynb")
        print(f"   Please check the ECP data file for reading time columns.")
    else:
        print(f"\n✅ Reading time data successfully integrated!")
        print(f"   Column 'rt' contains mean correct reaction times from ECP")
    
except FileNotFoundError as e:
    print(f"⚠️ Reference data file not found: {e}")
    print("   Proceeding with LLM data only.")
    merged_df = df_words.copy()
except Exception as e:
    print(f"⚠️ Error during data integration: {e}")
    print("   Proceeding with LLM data only.")
    merged_df = df_words.copy()

print(f"\n✅ Final dataset: {len(merged_df)} words with {len(merged_df.columns)} columns")

🔗 Integrating corpus data with reference datasets...
✅ Loaded 61851 records from ECP dataset
Loading SUBTLEX-US data for raw frequency counts...
✅ Loaded 61851 records from ECP dataset
Loading SUBTLEX-US data for raw frequency counts...
✅ Loaded 74286 records from SUBTLEX-US dataset
✅ Found reading time column(s): ['rt_correct_mean', 'rt_correct_std']
✅ Reading time column 'rt_correct_mean' renamed to 'rt'
✅ Successfully merged corpus data with reference measures

📊 Integration Statistics:
   📚 Reference measures available:
      • subtlex_zipf: 15,859 words (56.8%)
      • subtlex_freq_raw: 14,229 words (50.9%)
      • multilex_zipf: 15,814 words (56.6%)
      • gpt_familiarity: 15,859 words (56.8%)
      • rt: 15,859 words (56.8%)
      • accuracy: 15,859 words (56.8%)
      • prevalence: 15,859 words (56.8%)

✅ Reading time data successfully integrated!
   Column 'rt' contains mean correct reaction times from ECP

✅ Final dataset: 27939 words with 12 columns
✅ Loaded 74286 records f

## 6. Data Integration with External Datasets

**Integration Strategy:** We merge our single LLM-derived corpus frequencies with established psycholinguistic datasets to create a comprehensive comparative dataset:

1. **LLM frequency measures**: Raw frequency counts from our selected corpus
2. **Reference measures**: ECP, SUBTLEX, Multilex, GPT familiarity ratings
3. **Behavioral data**: Human reading times for validation against real reading behavior

**Generated Variables:**
- `llm_frequency_raw`: Raw frequency count from the selected corpus
- `subtlex_zipf`: SUBTLEX frequency in Zipf scale
- `multilex_zipf`: Multilex frequency in Zipf scale  
- `gpt_familiarity`: GPT-based familiarity estimates
- `rt`: Human reading times from ECP dataset

This creates a focused comparison between:
- LLM-derived frequency vs. traditional corpus frequency (SUBTLEX)
- LLM-derived frequency vs. LLM-based familiarity (GPT estimates)
- All frequency measures vs. actual human reading behavior

In [55]:
# --- Enhanced Multi-Corpus Data Integration ---
print("🔗 Integrating multi-corpus data with reference datasets...")

try:
    # Load ECP data using path utilities
    ecp_path = get_project_path('data/lexicaldecision/ecp/English Crowdsourcing Project All Native Speakers.csv')
    ecp_df = pd.read_csv(ecp_path)
    print(f"✅ Loaded {len(ecp_df)} records from ECP dataset")
    
    # Load SUBTLEX-US data to get raw frequency counts
    print("Loading SUBTLEX-US data for raw frequency counts...")
    subtlex_us_path = get_project_path('data/frequency/subtlex-us/SUBTLEXus74286wordstextversion.txt')
    subtlex_df = pd.read_csv(subtlex_us_path, sep='\t')
    print(f"✅ Loaded {len(subtlex_df)} records from SUBTLEX-US dataset")
    
    # Rename columns for consistency
    subtlex_df = subtlex_df.rename(columns={
        'Word': 'word',
        'FREQcount': 'subtlex_freq_raw'
    })
    
    # Merge ECP data with SUBTLEX-US data to get raw frequency counts
    word_col = 'spelling' if 'spelling' in ecp_df.columns else 'Word'
    ecp_with_subtlex = pd.merge(ecp_df, subtlex_df[['word', 'subtlex_freq_raw']], 
                               left_on=word_col, 
                               right_on='word', how='left')
    
    # Define reference columns to merge (using actual ECP column names!)
    ref_cols = ['SUBTLEX', 'subtlex_freq_raw', 'Multilex', 'GPT', 'rt_correct_mean', 'accuracy', 'prevalence']
    cols_to_merge = [word_col] + [col for col in ref_cols if col in ecp_with_subtlex.columns]
    
    # Check what reading time column is available
    rt_cols = [col for col in ecp_with_subtlex.columns if 'rt' in col.lower() or 'reaction' in col.lower()]
    if rt_cols:
        print(f"✅ Found reading time column(s): {rt_cols}")
        cols_to_merge.extend([col for col in rt_cols if col not in cols_to_merge])
    else:
        print("⚠️ No reading time column found in ECP data")
        print(f"   Available ECP columns: {list(ecp_with_subtlex.columns)[:10]}...")
    
    # Merge multi-corpus data with reference data
    merged_df = pd.merge(df_words, ecp_with_subtlex[cols_to_merge], 
                        left_on='word', right_on=word_col, how='left')
    
    # Rename reference columns for clarity (using actual ECP column names)
    column_renames = {
        'SUBTLEX': 'subtlex_zipf',
        'subtlex_freq_raw': 'subtlex_freq_raw',
        'Multilex': 'multilex_zipf',
        'GPT': 'gpt_familiarity',
        'rt_correct_mean': 'rt',  # Rename ECP reading time to standard 'rt'
        'spelling': 'word'  # Ensure word column is consistently named
    }
    
    # Apply column renaming
    merged_df = merged_df.rename(columns=column_renames)
    print(f"✅ Reading time column 'rt_correct_mean' renamed to 'rt'")
    
    # Remove duplicate word column if present
    if word_col != 'word' and word_col in merged_df.columns:
        merged_df = merged_df.drop(columns=[word_col])
        
    print("✅ Successfully merged multi-corpus data with reference measures")
    
    # Show integration statistics
    ref_columns = ['subtlex_zipf', 'subtlex_freq_raw', 'multilex_zipf', 'gpt_familiarity', 'rt', 'accuracy', 'prevalence']
    
    print(f"\n📊 Integration Statistics:")
    print(f"   📚 Reference measures available:")
    for col in ref_columns:
        if col in merged_df.columns:
            coverage = merged_df[col].notna().sum()
            print(f"      • {col}: {coverage:,} words ({coverage/len(merged_df)*100:.1f}%)")
        else:
            print(f"      • {col}: NOT AVAILABLE")
            
    # Special check for reading time data
    if 'rt' not in merged_df.columns:
        print(f"\n⚠️ CRITICAL: No reading time (rt) column found!")
        print(f"   This will cause issues in notebook2_corpus_analysis.ipynb")
        print(f"   Please check the ECP data file for reading time columns.")
    else:
        print(f"\n✅ Reading time data successfully integrated!")
        print(f"   Column 'rt' contains mean correct reaction times from ECP")
    
except FileNotFoundError as e:
    print(f"⚠️ Reference data file not found: {e}")
    print("   Proceeding with LLM data only.")
    merged_df = df_words.copy()
except Exception as e:
    print(f"⚠️ Error during data integration: {e}")
    print("   Proceeding with LLM data only.")
    merged_df = df_words.copy()

print(f"\n✅ Final dataset: {len(merged_df)} words with {len(merged_df.columns)} columns")

🔗 Integrating multi-corpus data with reference datasets...
✅ Loaded 61851 records from ECP dataset
Loading SUBTLEX-US data for raw frequency counts...
✅ Loaded 74286 records from SUBTLEX-US dataset
✅ Loaded 61851 records from ECP dataset
Loading SUBTLEX-US data for raw frequency counts...
✅ Loaded 74286 records from SUBTLEX-US dataset
✅ Found reading time column(s): ['rt_correct_mean', 'rt_correct_std']
✅ Reading time column 'rt_correct_mean' renamed to 'rt'
✅ Successfully merged multi-corpus data with reference measures

📊 Integration Statistics:
   📚 Reference measures available:
      • subtlex_zipf: 15,859 words (56.8%)
      • subtlex_freq_raw: 14,229 words (50.9%)
      • multilex_zipf: 15,814 words (56.6%)
      • gpt_familiarity: 15,859 words (56.8%)
      • rt: 15,859 words (56.8%)
      • accuracy: 15,859 words (56.8%)
      • prevalence: 15,859 words (56.8%)

✅ Reading time data successfully integrated!
   Column 'rt' contains mean correct reaction times from ECP

✅ Final da

## 7. Frequency Transformations

**Transformation Strategy:** We apply logarithmic transformations to prepare the frequency data for regression analysis.

**For the corpus, we create:**
1. **Schepens Transformation**: `log10(frequency + 1)` - Simple log transformation used in Schepens et al. paper
2. **Zipf Transformation**: `log10((frequency / total_tokens) * 1e9)` - Standard Zipf-scale frequency

**Resulting variables:**
- `llm_frequency_schepens` - Log-transformed frequency following Schepens method
- `llm_frequency_zipf` - Zipf-scale frequency for comparison with reference measures

In [56]:
# Frequency Transformations for Regression Analysis
print("🔢 Computing frequency transformations...")

# Calculate Schepens-style log-transformed frequency
if 'llm_frequency_raw' in merged_df.columns:
    # Ensure positive values and add small constant to avoid log(0)
    merged_df['llm_frequency_schepens'] = np.log10(merged_df['llm_frequency_raw'] + 1)
    print("✅ Added log10(frequency + 1) transformation (Schepens method)")
    
    # Calculate Zipf-scale frequency (log10 per billion)
    total_tokens = merged_df['llm_frequency_raw'].sum()
    if total_tokens > 0:
        merged_df['llm_frequency_zipf'] = np.log10((merged_df['llm_frequency_raw'] / total_tokens) * 1e9)
        # Replace -inf with minimum valid value for zero-frequency words
        min_valid_zipf = merged_df.loc[merged_df['llm_frequency_zipf'] != -np.inf, 'llm_frequency_zipf'].min()
        merged_df.loc[merged_df['llm_frequency_zipf'] == -np.inf, 'llm_frequency_zipf'] = min_valid_zipf - 1
        print("✅ Added Zipf-scale frequency transformation")
    else:
        print("⚠️ Cannot calculate Zipf frequency: total tokens is 0")
        merged_df['llm_frequency_zipf'] = np.nan
else:
    print("⚠️ No LLM frequency data found for transformations")

# Verify and display transformation results
numeric_cols = ['llm_frequency_raw', 'llm_frequency_schepens', 'llm_frequency_zipf', 
               'subtlex_zipf', 'multilex_zipf', 'gpt_familiarity']

available_cols = [col for col in numeric_cols if col in merged_df.columns]

if available_cols:
    print(f"\n📊 Frequency measure statistics:")
    
    for col in available_cols:
        valid_count = merged_df[col].notna().sum()
        print(f"\n📏 {col}:")
        print(f"   Valid values: {valid_count:,} ({valid_count/len(merged_df)*100:.1f}%)")
        if valid_count > 0:
            print(f"   Range: {merged_df[col].min():.3f} to {merged_df[col].max():.3f}")
            print(f"   Mean: {merged_df[col].mean():.3f}")
            print(f"   Median: {merged_df[col].median():.3f}")

print(f"\n✅ Frequency transformations complete!")
print(f"   Final dataset: {len(merged_df)} words with {len(merged_df.columns)} measures")

🔢 Computing frequency transformations...
✅ Added log10(frequency + 1) transformation (Schepens method)
✅ Added Zipf-scale frequency transformation

📊 Frequency measure statistics:

📏 llm_frequency_raw:
   Valid values: 27,939 (100.0%)
   Range: 1.000 to 108943.000
   Mean: 75.377
   Median: 4.000

📏 llm_frequency_schepens:
   Valid values: 27,939 (100.0%)
   Range: 0.301 to 5.037
   Mean: 0.932
   Median: 0.699

📏 llm_frequency_zipf:
   Valid values: 27,939 (100.0%)
   Range: 2.677 to 7.714
   Mean: 3.478
   Median: 3.279

📏 subtlex_zipf:
   Valid values: 15,859 (56.8%)
   Range: 1.292 to 7.621
   Mean: 3.193
   Median: 3.137

📏 multilex_zipf:
   Valid values: 15,814 (56.6%)
   Range: 0.374 to 7.572
   Mean: 3.371
   Median: 3.331

📏 gpt_familiarity:
   Valid values: 15,859 (56.8%)
   Range: 1.008 to 7.000
   Mean: 5.841
   Median: 6.023

✅ Frequency transformations complete!
   Final dataset: 27939 words with 14 measures


## 8. Export Dataset for Analysis

**Final Output:** We export the complete dataset with all frequency measures and reference data for analysis in `notebook2_corpus_analysis.ipynb`.

**The exported file includes:**
- **LLM Frequency Measures**: Raw frequency, Schepens-transformed, and Zipf-scaled
- **Reference Measures**: SUBTLEX, Multilex, GPT familiarity estimates
- **Behavioral Data**: English reading times from ECP dataset
- **Metadata**: Word length, basic statistics

**Output:** `generated_corpus_with_predictors.csv` - ready for regression analysis and validation against human reading time data.

In [57]:
# Export Dataset for Analysis
print("Preparing to export corpus dataset...")

# Generate summary of available columns
print(f"Dataset Summary:")
print(f"   Total words: {len(merged_df):,}")
print(f"   Total columns: {len(merged_df.columns)}")

# Categorize columns
llm_freq_cols = [col for col in merged_df.columns if col.startswith('llm_frequency')]
ref_cols = [col for col in merged_df.columns if col in ['subtlex_zipf', 'subtlex_freq_raw', 'multilex_zipf', 'gpt_familiarity', 'rt', 'accuracy', 'prevalence']]

print(f"\nColumn Categories:")
print(f"   LLM Frequency Measures: {len(llm_freq_cols)} columns")
for col in llm_freq_cols:
    print(f"      - {col}")
    
print(f"   Reference Measures: {len(ref_cols)} columns")
for col in ref_cols:
    if col in merged_df.columns:
        print(f"      - {col}")

# Export the dataset
output_dir = get_project_path('output')
output_path = os.path.join(output_dir, 'generated_corpus_with_predictors.csv')
merged_df.to_csv(output_path, index=False)

# File size
size_mb = os.path.getsize(output_path) / (1024 * 1024)

print(f"\nCorpus dataset exported successfully!")
print(f"File: {os.path.basename(output_path)}")
print(f"File size: {size_mb:.1f} MB")

print(f"\nNext Steps:")
print(f"   1. Open notebook2_corpus_analysis.ipynb for behavioral validation")
print(f"   2. Use this file to compare:")
print(f"      - LLM-derived frequency vs traditional measures (SUBTLEX, Multilex)")
print(f"      - LLM frequency vs GPT familiarity estimates")
print(f"      - All measures vs human reading times (ECP data)")

print(f"\nAvailable for Analysis:")
total_predictors = len(llm_freq_cols) + len(ref_cols)
print(f"   {total_predictors} frequency/familiarity predictors")
print(f"   Ready for statistical modeling and validation")
print(f"   Compatible with notebook2_corpus_analysis.ipynb regression analysis")

Preparing to export corpus dataset...
Dataset Summary:
   Total words: 27,939
   Total columns: 14

Column Categories:
   LLM Frequency Measures: 3 columns
      - llm_frequency_raw
      - llm_frequency_schepens
      - llm_frequency_zipf
   Reference Measures: 7 columns
      - subtlex_zipf
      - subtlex_freq_raw
      - multilex_zipf
      - gpt_familiarity
      - rt
      - accuracy
      - prevalence



Corpus dataset exported successfully!
File: generated_corpus_with_predictors.csv
File size: 3.5 MB

Next Steps:
   1. Open notebook2_corpus_analysis.ipynb for behavioral validation
   2. Use this file to compare:
      - LLM-derived frequency vs traditional measures (SUBTLEX, Multilex)
      - LLM frequency vs GPT familiarity estimates
      - All measures vs human reading times (ECP data)

Available for Analysis:
   10 frequency/familiarity predictors
   Ready for statistical modeling and validation
   Compatible with notebook2_corpus_analysis.ipynb regression analysis
