# GPT-4o Feature Validation for Box Office Dataset

This notebook validates and corrects movie features using GPT-4o's knowledge. It sends movie information to the model and asks it to verify the accuracy of our engineered features.

In [5]:
# =============================================================================
# DATASET CONFIGURATION - Easy switching between dataset subsets
# =============================================================================

import sys
from pathlib import Path
sys.path.insert(0, str(Path('../').resolve()))

from dataset_config import (
    DEFAULT_CONFIG, get_dataset_config, get_dataset_path, get_config_summary,
    use_full_dataset, use_english_only, use_major_studios, use_english_major
)

# =============================================================================
# CHOOSE YOUR DATASET SCOPE - Uncomment one line to switch
# =============================================================================
# Exactly one CURRENT_CONFIG assignment should be active; every later cell reads it.

# CURRENT_CONFIG = use_full_dataset()                    # All studios, all languages (2010-2026)
# CURRENT_CONFIG = use_english_only(2010)                # English only (2010-2026)
CURRENT_CONFIG = use_english_only(2015)                # English only (2015-2026)
# CURRENT_CONFIG = use_major_studios(2010)               # Major studios only (2010-2026)
# CURRENT_CONFIG = use_major_studios(2015)               # Major studios only (2015-2026)
# CURRENT_CONFIG = use_english_major(2010)               # English + Major studios (2010-2026)
# CURRENT_CONFIG = use_english_major(2015)               # English + Major studios (2015-2026)

# CURRENT_CONFIG = DEFAULT_CONFIG  # Use default (full dataset)

# Echo the active scope so the choice is visible in the cell output.
# The banner is printed before the summary is computed, matching the original call order.
print("🎯 DATASET CONFIGURATION:", "=" * 50, sep="\n")
print(
    get_config_summary(CURRENT_CONFIG),
    "\n💡 To change scope, uncomment one of the CURRENT_CONFIG lines above and re-run this cell",
    sep="\n",
)

🎯 DATASET CONFIGURATION:
📊 Dataset Configuration: English Only
   Description: English movies only
   Year range: 2015-2026
   Training file: dataset_domestic_processed_english_2015_2026.csv
   Full file: dataset_domestic_processed_english_2015_2026.csv


💡 To change scope, uncomment one of the CURRENT_CONFIG lines above and re-run this cell


In [6]:
import pandas as pd
import numpy as np
import json
import time
import requests
from pathlib import Path
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# OpenAI API setup
import openai

# Load the API key. ../config.json is the preferred source; an OPENAI_API_KEY that is
# already exported in the environment is used as a fallback, so the notebook also works
# when the key is supplied the way the later help text suggests (export OPENAI_API_KEY=...).
api_key = None
config_problem = []  # lines explaining why config.json did not yield a usable key
try:
    with open('../config.json', 'r') as f:
        config = json.load(f)

    api_key = config['OPENAI_API_KEY']
    if not isinstance(api_key, str) or not api_key.strip():
        # An empty or non-string key would only fail later, inside the first API call.
        api_key = None
        raise ValueError("OPENAI_API_KEY must be a non-empty string")

except FileNotFoundError:
    config_problem = [
        "❌ config.json not found. Please create it with your OPENAI_API_KEY",
        "Example config.json structure:",
        '{\n  "OPENAI_API_KEY": "your-api-key-here"\n}',
    ]
except KeyError:
    config_problem = ["❌ OPENAI_API_KEY not found in config.json"]
except Exception as e:
    config_problem = [f"❌ Error loading config: {e}"]

if api_key:
    # Set OpenAI API key (module attribute and environment, as before)
    openai.api_key = api_key
    os.environ['OPENAI_API_KEY'] = api_key
    print("✅ OpenAI API key loaded from config.json")
elif os.environ.get('OPENAI_API_KEY'):
    openai.api_key = os.environ['OPENAI_API_KEY']
    print("✅ OpenAI API key loaded from the OPENAI_API_KEY environment variable")
else:
    # No key from either source: report the config.json problem exactly as before.
    for line in config_problem:
        print(line)

print("Libraries imported successfully")

✅ OpenAI API key loaded from config.json
Libraries imported successfully


## Load Dataset and Configuration

In [7]:
# Load dataset using configuration system
def load_dataset(training=False):
    """Read the CSV selected by CURRENT_CONFIG and return it as a DataFrame.

    ``training`` is forwarded to ``get_dataset_path`` to pick the file. When the
    configured scope is 'english_major' and the frame has an ``is_major_studio``
    column, only rows with ``is_major_studio == 1`` are kept. The frame is then
    passed through ``normalize_domestic_titles``. A short summary (row count and,
    if ``release_year`` exists, a per-period breakdown) is printed along the way.
    """
    import pandas as pd
    from movie_lists import normalize_domestic_titles

    csv_path = get_dataset_path(training=training, config=CURRENT_CONFIG)
    scope_settings = get_dataset_config(CURRENT_CONFIG)

    print(f"📁 Loading dataset: {csv_path}")

    movies = pd.read_csv(csv_path)

    # Extra narrowing only applies to the english_major scope
    if scope_settings['scope'] == 'english_major' and 'is_major_studio' in movies.columns:
        rows_before = len(movies)
        movies = movies[movies['is_major_studio'] == 1].copy()
        print(f"   Filtered to major studios: {len(movies):,} movies (removed {rows_before - len(movies):,})")

    movies = normalize_domestic_titles(movies)

    print(f"   ✅ Loaded {len(movies):,} movies")

    # Without a release_year column there is nothing more to summarise
    if 'release_year' not in movies.columns:
        return movies

    years = movies['release_year']
    print(f"   Year range: {years.min()}-{years.max()}")

    # Row counts per time period
    n_training = int((years <= 2023).sum())
    n_2024 = int((years == 2024).sum())
    n_2025 = int((years == 2025).sum())
    n_2026 = int((years == 2026).sum())

    print(f"   Training (≤2023): {n_training:,} movies")

    # Later periods are only mentioned when they contain at least one movie
    optional_lines = (
        (n_2024, f"   Testing (2024): {n_2024:,} movies"),
        (n_2025, f"   Evaluation (2025): {n_2025:,} movies"),
        (n_2026, f"   Prediction (2026): {n_2026:,} movies"),
    )
    for count, line in optional_lines:
        if count > 0:
            print(line)

    return movies

# Load the dataset into the notebook-level `df` that the later cells read.
# training=False is forwarded to get_dataset_path to choose which configured file is used.
df = load_dataset(training=False)

📁 Loading dataset: ../data/dataset_domestic_processed_english_2015_2026.csv
No title corrections needed
   ✅ Loaded 1,307 movies
   Year range: 2015-2026
   Training (≤2023): 1,003 movies
   Testing (2024): 145 movies
   Evaluation (2025): 114 movies
   Prediction (2026): 45 movies


## GPT-4o Validation System

In [8]:
def _is_missing_value(value):
    """Return True when ``value`` is a missing scalar (None, NaN, pd.NA, NaT)."""
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar input (e.g. a list): pd.isna is element-wise, treat as present.
        return False


def _flag_or_none(value):
    """Convert a 0/1-style flag to bool; a missing value becomes None (not True)."""
    return None if _is_missing_value(value) else bool(value)


def _text_or_none(value):
    """Convert a value to str; a missing value becomes None instead of the text 'nan'."""
    return None if _is_missing_value(value) else str(value)


def create_validation_prompt(movie_data):
    """
    Create a prompt for GPT-4o to validate movie features

    Args:
        movie_data: Mapping-like row (dict or pandas Series) holding the title,
            year, origin, distributor, studio, language and IP/franchise columns
            read below. A missing column raises KeyError.

    Returns:
        str: The prompt, with the movie's features embedded as indented JSON.
        Missing values (NaN/None) appear as JSON ``null``. ``bool(NaN)`` is True,
        so a plain ``bool()`` cast would present an unknown flag to the model as
        ``true``. When any value is missing, one explanatory note is added to the
        prompt; for complete rows the prompt text is unchanged.
    """

    origin_flags = ('is_origin_usa', 'is_origin_uk_ie', 'is_origin_canada', 'is_origin_us_uk_ca')
    studio_flags = (
        'is_disney', 'is_warner_bros', 'is_universal', 'is_sony', 'is_paramount',
        'is_fox', 'is_mgm', 'is_lionsgate', 'is_major_studio',
    )
    ip_flags = (
        'is_sequel_title', 'is_marvel', 'is_dc', 'is_star_wars', 'is_fast_furious',
        'is_harry_potter', 'is_franchise_sequel', 'is_sequel', 'is_live_action_remake',
        'is_adaptation', 'is_superhero', 'has_remake_indicator', 'is_remake_adaptation',
        'is_ip_movie',
    )

    # Key order matters: it is the order in which the model sees the features.
    year = movie_data['release_year']
    feature_dict = {
        # Basic movie info
        'title': _text_or_none(movie_data['title']),
        'title_normalized': _text_or_none(movie_data['title_normalized']),
        'release_year': None if _is_missing_value(year) else int(year),

        # Production/Origin info
        'production_country_codes': _text_or_none(movie_data['production_country_codes']),
    }
    feature_dict.update((name, _flag_or_none(movie_data[name])) for name in origin_flags)

    # Distribution
    feature_dict['distributor'] = _text_or_none(movie_data['distributor'])

    # Studio features
    feature_dict.update((name, _flag_or_none(movie_data[name])) for name in studio_flags)

    # Language
    feature_dict['is_english'] = _flag_or_none(movie_data['is_english'])

    # IP and franchise features
    feature_dict.update((name, _flag_or_none(movie_data[name])) for name in ip_flags)

    # Convert feature dict to JSON string to avoid f-string nesting issues
    feature_json = json.dumps(feature_dict, indent=2)

    # Only mention nulls when there are some, so complete rows get the unchanged prompt.
    missing_note = ""
    if any(value is None for value in feature_dict.values()):
        missing_note = (
            "\n\n**Note:** A null value means the feature is missing from our dataset; "
            "if you know the correct value, include it as a correction."
        )

    prompt = f"""
You are a movie expert tasked with validating feature classifications for a box office prediction model. 

Please review the following movie and its current feature classifications, then provide corrections for any inaccurate features.

**Movie Information:**
{feature_json}{missing_note}

**Feature Definitions:**
- title: Original movie title
- title_normalized: Cleaned version of the title
- release_year: Year the movie was released
- production_country_codes: Country codes where movie was produced
- is_origin_usa/uk_ie/canada/us_uk_ca: Movie origin country flags
- distributor: Studio/company that distributed the movie
- Studio flags (is_disney, is_warner_bros, etc.): Which major studio distributed it
- is_major_studio: Whether distributed by a major studio (Disney, Warner Bros, Universal, Sony, Paramount, Fox, MGM, Lionsgate)
- is_english: Whether the movie is in English language
- is_sequel_title: Title contains sequel indicators (2, 3, Part, Chapter, etc.)
- is_marvel: Marvel Cinematic Universe or Marvel Comics adaptation
- is_dc: DC Comics adaptation
- is_star_wars: Part of Star Wars franchise
- is_fast_furious: Part of Fast & Furious franchise
- is_harry_potter: Part of Harry Potter/Wizarding World
- is_franchise_sequel: Sequel/spinoff of an established franchise
- is_sequel: Any type of sequel (combines multiple sequel indicators)
- is_live_action_remake: Live-action version of animated film or remake of classic
- is_adaptation: Adapted from other media (books, games, TV shows, toys, comics, etc.)
- is_superhero: Features superheroes as main characters
- has_remake_indicator: Title contains remake-related words
- is_remake_adaptation: Any type of remake or adaptation
- is_ip_movie: Based on existing intellectual property (comprehensive IP flag)

**Instructions:**
1. Use your knowledge of this specific movie to verify each feature
2. Focus on accuracy - only return corrections for features that are definitively wrong
3. Pay special attention to:
   - Studio/distributor accuracy (which studio actually distributed this movie?)
   - IP classification (Marvel, DC, adaptations, remakes, sequels)
   - Franchise identification (is this really part of these franchises?)
   - Origin country information (where was this movie actually made?)
   - Language (is this actually in English?)
4. Provide your response as a JSON object with only the corrected features
5. If all features are correct, return: {{"status": "all_correct"}}
6. For corrections, include a brief but clear reason

**Response Format:**
```json
{{
  "corrections": {{
    "feature_name": {{
      "old_value": false,
      "new_value": true,
      "reason": "Brief explanation of why this is correct"
    }}
  }}
}}
```

Please provide your analysis:
"""

    return prompt

def _extract_json_object_text(content):
    """Return the JSON-object portion of a chat reply.

    Takes everything from the first '{' to the last '}', which drops Markdown
    fences of any flavour and any prose around the object. If no braces are
    present the stripped text is returned unchanged, so the caller's
    ``json.loads`` raises ``JSONDecodeError`` and the retry path is taken.
    A ``None`` reply is treated as empty text.
    """
    text = (content or "").strip()
    first_brace = text.find("{")
    last_brace = text.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        return text[first_brace:last_brace + 1]
    return text


def validate_movie_features(movie_data, client, model="gpt-4o", max_retries=3):
    """
    Send movie data to GPT-4o for feature validation

    Args:
        movie_data: Row passed to ``create_validation_prompt``; its 'title' is
            also used in the printed error messages.
        client: Object exposing ``chat.completions.create`` (an OpenAI client).
        model: Model name sent with the request.
        max_retries: Number of attempts before giving up.

    Returns:
        dict: The parsed JSON reply, or ``{"error": ...}`` when every attempt
        failed (unparseable reply or an exception from the API call).
    """
    prompt = create_validation_prompt(movie_data)

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a movie expert who validates feature classifications for machine learning models. Always respond with valid JSON. Focus on accuracy and only correct features that are definitively wrong."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=2000  # Headroom for a correction entry on every feature
            )

            # Extract the JSON object from the reply. The model may wrap it in
            # ```json / ``` fences or add prose, and the content may be None.
            content = _extract_json_object_text(response.choices[0].message.content)

            # Parse JSON response
            result = json.loads(content)
            return result

        except json.JSONDecodeError as e:
            print(f"JSON decode error for {movie_data['title']} (attempt {attempt + 1}): {e}")
            if attempt == max_retries - 1:
                return {"error": f"JSON decode failed after {max_retries} attempts"}
            time.sleep(1)

        except Exception as e:
            print(f"API error for {movie_data['title']} (attempt {attempt + 1}): {e}")
            if attempt == max_retries - 1:
                return {"error": f"API call failed after {max_retries} attempts: {str(e)}"}
            time.sleep(2)

    # Only reached when max_retries <= 0 (the loop body never ran)
    return {"error": "Max retries exceeded"}

# NOTE(review): feature_dict in create_validation_prompt has 33 keys, so the "34" in this
# message looks off by one — confirm the intended feature list and update the text.
print("Validation functions defined with ALL 34 specified features")

Validation functions defined with ALL 34 specified features


## Test Validation System

In [None]:
# Initialize OpenAI client
# Client creation and the sample run are handled separately so that a data or
# validation problem is not reported as a client-initialisation failure.
client = None
try:
    client = openai.OpenAI()
    print("OpenAI client initialized successfully")
except Exception as e:
    print(f"Error initializing OpenAI client: {e}")
    print("Please make sure you have set your OpenAI API key:")
    print("1. Set environment variable: export OPENAI_API_KEY='your-key-here'")
    print("2. Or add OPENAI_API_KEY to ../config.json and re-run the API setup cell")

if client is not None:
    try:
        # Test with a sample movie: first title containing "Mario", else the first row
        mario_matches = df[df['title'].str.contains('Mario', case=False, na=False)]
        test_movie = mario_matches.iloc[0] if len(mario_matches) > 0 else df.iloc[0]

        print(f"\nTesting validation system with: {test_movie['title']} ({test_movie['release_year']})")

        # Run validation
        result = validate_movie_features(test_movie, client)

        print("\nValidation result:")
        print(json.dumps(result, indent=2))

    except Exception as e:
        # Keep the notebook running, but name the stage that actually failed.
        print(f"Error running test validation: {e}")

## Batch Validation Process

In [None]:
def _match_index_label(index, raw_label):
    """Map a corrections-log key onto a label that exists in ``index``.

    Keys come back from a JSON round-trip as strings ("12"), while the frame is
    usually indexed by integers. Returns the matching label, or None when no
    row corresponds to the key.
    """
    if raw_label in index:
        return raw_label
    try:
        as_int = int(raw_label)
    except (TypeError, ValueError):
        return None
    return as_int if as_int in index else None


def apply_corrections(df, corrections_log):
    """
    Apply all collected corrections to the dataframe

    Args:
        df: DataFrame to correct; it is copied, never modified in place.
        corrections_log: Mapping of row label -> validation result. Only entries
            containing a 'corrections' dict are applied; each correction is a
            dict with 'new_value' and optionally 'old_value' / 'reason'.

    Returns:
        DataFrame: Corrected copy with the same rows and columns as ``df``.

    Entries that cannot be applied safely are reported and skipped rather than
    written: assigning through ``.at`` with an unknown row label or column name
    would add a new row or column instead of correcting an existing cell.
    """
    corrected_df = df.copy()
    total_corrections = 0
    has_title = 'title' in corrected_df.columns

    for raw_idx, entry in corrections_log.items():
        corrections = entry.get('corrections') if isinstance(entry, dict) else None
        if not isinstance(corrections, dict):
            continue  # e.g. {"status": "all_correct"}

        idx = _match_index_label(corrected_df.index, raw_idx)
        if idx is None:
            print(f"Skipping row {raw_idx!r}: no such row in the dataframe")
            continue

        for feature, correction_info in corrections.items():
            if feature not in corrected_df.columns:
                print(f"Skipping row {idx}: unknown feature {feature!r}")
                continue
            if not isinstance(correction_info, dict) or 'new_value' not in correction_info:
                print(f"Skipping row {idx}: malformed correction for {feature!r}")
                continue

            old_value = correction_info.get('old_value')
            new_value = correction_info['new_value']
            reason = correction_info.get('reason', 'no reason given')

            # The model answers with JSON booleans; store them as 0/1 in numeric
            # flag columns so the column keeps its dtype.
            column_dtype = corrected_df[feature].dtype
            if (
                isinstance(new_value, bool)
                and pd.api.types.is_numeric_dtype(column_dtype)
                and not pd.api.types.is_bool_dtype(column_dtype)
            ):
                new_value = int(new_value)

            # Apply correction
            corrected_df.at[idx, feature] = new_value
            total_corrections += 1

            title = corrected_df.at[idx, 'title'] if has_title else ''
            print(f"Row {idx} ({title}): {feature} {old_value} → {new_value} ({reason})")

    print(f"\nTotal corrections applied: {total_corrections}")
    return corrected_df

def validate_dataset_subset(df, start_idx=0, end_idx=None, save_progress=True):
    """
    Validate a subset of the dataset

    Args:
        df: DataFrame whose rows are validated.
        start_idx: First row position (``iloc``) to validate.
        end_idx: Position one past the last row; defaults to ``len(df)``.
        save_progress: When True, write a checkpoint JSON to ../data after
            every 10 processed movies and once more when the loop ends.

    Returns:
        tuple: ``(corrections_log, errors_log, subset)``. Both logs are keyed by
        the row's index label. ``(None, None, None)`` is returned when the
        OpenAI client cannot be created.
    """
    if end_idx is None:
        end_idx = len(df)

    subset = df.iloc[start_idx:end_idx]
    corrections_log = {}
    errors_log = {}

    print(f"Validating movies {start_idx} to {end_idx-1} ({len(subset)} movies)")

    try:
        client = openai.OpenAI()
    except Exception as e:
        print(f"Failed to initialize OpenAI client: {e}")
        return None, None, None

    progress_file = f'../data/validation_progress_{start_idx}_{end_idx}.json'

    def _write_checkpoint(last_idx):
        """Persist both logs; a failed write is reported, never raised."""
        try:
            with open(progress_file, 'w') as f:
                json.dump({
                    'corrections': corrections_log,
                    'errors': errors_log,
                    'last_processed': last_idx
                }, f, indent=2)
        except (OSError, TypeError, ValueError) as e:
            print(f"Could not save progress to {progress_file}: {e}")

    last_processed = None
    unsaved = 0  # movies processed since the last checkpoint

    for idx, row in tqdm(subset.iterrows(), total=len(subset), desc="Validating movies"):
        try:
            # Validate movie features
            result = validate_movie_features(row, client)

            # Store result
            if 'error' in result:
                errors_log[idx] = result
            else:
                corrections_log[idx] = result

            last_processed = idx
            unsaved += 1

            # Rate limiting - wait between API calls
            time.sleep(0.5)

            # Save progress every 10 movies. Count processed movies rather than
            # using the index label, which need not be consecutive.
            if save_progress and unsaved >= 10:
                _write_checkpoint(idx)
                unsaved = 0

        except KeyboardInterrupt:
            print(f"\nValidation interrupted at row {idx}")
            break
        except Exception as e:
            print(f"Unexpected error for row {idx}: {e}")
            errors_log[idx] = {"error": str(e)}
            last_processed = idx
            unsaved += 1

    # Flush the final partial batch (also after an interrupt)
    if save_progress and unsaved > 0:
        _write_checkpoint(last_processed)

    return corrections_log, errors_log, subset

# Marker output so the reader can see that this cell's definitions were executed.
print("Batch validation functions defined")

## Select Movies for Validation

Let's prioritize validation for movies that are most likely to have errors, especially 2026 movies and IP-related films.

In [None]:
# Prioritize movies for validation
print("Selecting movies for validation...")

# High priority: 2026 movies (most likely to have errors)
movies_2026 = df[df['release_year'] == 2026].copy()
print(f"2026 movies: {len(movies_2026)}")

# Medium priority: Recent movies (2024-2025) with IP/adaptation flags
recent_ip_movies = df[
    (df['release_year'].isin([2024, 2025])) &
    (df['is_ip_movie'] == 1)
].copy()
print(f"Recent IP movies (2024-2025): {len(recent_ip_movies)}")

# Sample of other movies for general validation
already_selected = df.index.isin(movies_2026.index) | df.index.isin(recent_ip_movies.index)
remaining_movies = df[~already_selected]
other_movies = remaining_movies.sample(n=min(50, len(remaining_movies)), random_state=42)
print(f"Sample of other movies: {len(other_movies)}")

# Combine prioritized datasets.
# The original df index labels are kept on purpose: the validation logs are keyed by
# these labels, and corrections are later applied to `df` by label. Resetting the
# index here would make those keys point at different rows of `df`.
validation_subset = pd.concat([movies_2026, recent_ip_movies, other_movies])
assert validation_subset.index.is_unique, "row labels must be unique to map corrections back to df"
print(f"\nTotal movies selected for validation: {len(validation_subset)}")

# Display sample
print("\nSample of movies to validate:")
display(validation_subset[['title', 'release_year', 'is_ip_movie', 'is_adaptation', 'is_marvel', 'is_dc']].head(10))

## Run Validation (Uncomment to Execute)

**Warning**: This will make API calls to OpenAI. Before running, make sure that:
1. Your OpenAI API key is set
2. You have sufficient API credits
3. You are ready to process the selected movies

In [None]:
# Uncomment the following lines to run the validation
# This will make API calls to OpenAI
#
# NOTE(review): corrections_log and errors_log are keyed by validation_subset's index
# labels. Confirm those labels identify the same rows in `df` before the results are
# applied to `df` in the next section.

# print("Starting validation process...")
# print("This may take several minutes depending on the number of movies.")
# print(f"Estimated cost: ~${len(validation_subset) * 0.01:.2f} (rough estimate)")
# 
# # Run validation on the prioritized subset
# corrections_log, errors_log, validated_subset = validate_dataset_subset(
#     validation_subset, 
#     start_idx=0, 
#     end_idx=len(validation_subset)
# )
# 
# # Display results
# if corrections_log is not None:
#     print(f"\nValidation completed!")
#     print(f"Movies processed: {len(validated_subset)}")
#     print(f"Movies with corrections: {len([c for c in corrections_log.values() if 'corrections' in c])}")
#     print(f"Errors encountered: {len(errors_log)}")
#     
#     # Save validation results
#     results_file = '../data/llm_validation_results.json'
#     with open(results_file, 'w') as f:
#         json.dump({
#             'corrections': corrections_log,
#             'errors': errors_log,
#             'validation_metadata': {
#                 'total_movies': len(validated_subset),
#                 'validation_date': pd.Timestamp.now().isoformat(),
#                 'model_used': 'gpt-4o'
#             }
#         }, f, indent=2)
#     
#     print(f"Results saved to: {results_file}")
# else:
#     print("Validation failed to initialize")

# Marker output: nothing above runs until it is uncommented.
print("Validation code ready (uncomment to run)")

## Apply Corrections and Rebuild Dataset

In [None]:
# Load and apply validation results (uncomment when you have results)
#
# NOTE(review): json.load returns the row keys of 'corrections' and 'errors' as strings,
# while apply_corrections uses each key as a row label through DataFrame.at. Make sure
# the keys match df's index labels (convert them if needed); otherwise a correction may
# end up in a new row instead of updating the intended movie.

# try:
#     # Load validation results
#     with open('../data/llm_validation_results.json', 'r') as f:
#         validation_results = json.load(f)
#     
#     corrections_log = validation_results['corrections']
#     errors_log = validation_results['errors']
#     
#     print(f"Loaded validation results:")
#     print(f"- Movies with corrections: {len([c for c in corrections_log.values() if 'corrections' in c])}")
#     print(f"- Errors: {len(errors_log)}")
#     
#     # Apply corrections to the full dataset
#     print("\nApplying corrections to dataset...")
#     corrected_df = apply_corrections(df, corrections_log)
#     
#     # Save corrected dataset
#     corrected_path = '../data/dataset_domestic_processed_llm_validated.csv'
#     corrected_df.to_csv(corrected_path, index=False)
#     print(f"\nCorrected dataset saved to: {corrected_path}")
#     
#     # Generate validation report
#     print("\n=== VALIDATION REPORT ===")
#     
#     # Count corrections by feature
#     feature_corrections = {}
#     for idx, corrections in corrections_log.items():
#         if 'corrections' in corrections:
#             for feature in corrections['corrections'].keys():
#                 feature_corrections[feature] = feature_corrections.get(feature, 0) + 1
#     
#     print("\nMost frequently corrected features:")
#     for feature, count in sorted(feature_corrections.items(), key=lambda x: x[1], reverse=True):
#         print(f"  {feature}: {count} corrections")
#     
#     # Analyze improvement in key features
#     key_features = ['is_ip_movie', 'is_adaptation', 'is_marvel', 'is_dc', 'is_superhero', 'is_sequel']
#     
#     print("\nFeature accuracy improvements:")
#     for feature in key_features:
#         if feature in corrected_df.columns:
#             original_count = df[feature].sum()
#             corrected_count = corrected_df[feature].sum()
#             change = corrected_count - original_count
#             print(f"  {feature}: {original_count} → {corrected_count} ({change:+d})")
#     
# except FileNotFoundError:
#     print("No validation results found. Run validation first.")
# except Exception as e:
#     print(f"Error loading validation results: {e}")

# Marker output: nothing above runs until it is uncommented.
print("Correction application code ready")

## Summary and Next Steps

This notebook provides a complete system for validating and correcting movie features using GPT-4o. To use it:

1. **Set up API access**: Add your OpenAI API key as `OPENAI_API_KEY` in `../config.json`
2. **Test the system**: Run the test validation to ensure everything works
3. **Run validation**: Uncomment the validation code to process your selected movies
4. **Apply corrections**: Use the results to generate a corrected dataset
5. **Rebuild features**: Use the corrected dataset for model training

The system prioritizes:
- 2026 movies (most likely to have errors)
- Recent IP movies (2024-2025)
- A random sample of other movies for general validation

Key features validated:
- IP movie classification
- Adaptation detection
- Franchise identification (Marvel, DC, Star Wars, etc.)
- Sequel detection
- Studio classification
- Superhero movie identification