# Count Lines Per Protagonist

This notebook counts the number of lines per character in each movie and keeps only the character with the most lines (the first one if there are ties).

In [None]:
import pandas as pd
import os
from pathlib import Path
# Destination CSV for the final protagonist table; the rating-scraper cell writes
# movie_data to this path with to_csv().
output_path = "data/kaggle_data.csv"


In [None]:
import pandas as pd
import os
from pathlib import Path

# Path to the movie character texts directory
base_path = Path("data/movie_characters/data/movie_character_texts/movie_character_texts")

# Every per-character dialogue file is named "<character>_text.txt"
CHARACTER_FILE_SUFFIX = "_text.txt"

# Columns of the per-character table; passed explicitly so that an empty scan
# still produces a DataFrame with the expected columns
RESULT_COLUMNS = ['movie_name', 'imdb_id', 'character_name', 'line_count']


def _split_movie_dir_name(dir_name):
    """Split a ``<movie name>_<imdb id>`` directory name at its last underscore.

    Returns ``(movie_name, imdb_id)``; ``imdb_id`` is ``""`` when the name
    contains no underscore at all.
    """
    movie_name, separator, imdb_id = dir_name.rpartition('_')
    if not separator:
        return dir_name, ""
    return movie_name, imdb_id


def _count_lines(path):
    """Count the lines of a text file without loading it into memory at once."""
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return sum(1 for _ in f)


def collect_character_line_counts(root):
    """Scan ``root`` and return one record per character dialogue file.

    Args:
        root: ``Path`` whose sub-directories are named ``<movie name>_<imdb id>``
            and contain ``<character>_text.txt`` files.

    Returns:
        A list of dicts with the keys in ``RESULT_COLUMNS``, ordered by movie
        directory and then by file name.
    """
    records = []
    for movie_dir in sorted(root.iterdir()):
        if not movie_dir.is_dir():
            continue
        movie_name, imdb_id = _split_movie_dir_name(movie_dir.name)

        # Sorted so that the "first character wins on ties" rule does not depend
        # on the order in which the filesystem happens to list the files
        for character_file in sorted(movie_dir.glob("*" + CHARACTER_FILE_SUFFIX)):
            # Cut off only the trailing suffix; a character name may itself
            # contain "_text"
            character_name = character_file.name[:-len(CHARACTER_FILE_SUFFIX)]
            records.append({
                'movie_name': movie_name,
                'imdb_id': imdb_id,
                'character_name': character_name,
                'line_count': _count_lines(character_file),
            })
    return records


def select_protagonists(character_counts):
    """Keep, for every movie, the first character with the highest line count.

    Movies are identified by name *and* IMDb id, so two films that share a
    title are not collapsed into one.

    Args:
        character_counts: DataFrame with the columns in ``RESULT_COLUMNS``.

    Returns:
        A new DataFrame with one row per movie (empty if the input is empty).
    """
    if character_counts.empty:
        return character_counts.copy()
    top_rows = character_counts.groupby(['movie_name', 'imdb_id'])['line_count'].idxmax()
    return character_counts.loc[top_rows].copy()


# List to store results
if base_path.is_dir():
    results = collect_character_line_counts(base_path)
else:
    # Report the problem explicitly and continue with an empty table
    print(f"WARNING: character text directory not found: {base_path}")
    results = []

# Create DataFrame
movie_data = pd.DataFrame(results, columns=RESULT_COLUMNS)

# Keep only the first character with the most lines per movie (no ties)
movie_data = select_protagonists(movie_data)

# Read the metadata table and keep only the fields used downstream,
# renamed to match the naming of the protagonist table
metadata_df = pd.read_csv("data/movie_metadata/movie_meta_data.csv")
metadata_df = metadata_df[['imdbid', 'title', 'directors', 'genres', 'imdb user rating']].rename(
    columns={
        'imdbid': 'imdb_id',
        'directors': 'director',
        'imdb user rating': 'imdb_rating',
    }
)

# Both tables get the same temporary join key: the id as text, zero-padded to 7 characters
for frame in (movie_data, metadata_df):
    frame['imdb_id_padded'] = frame['imdb_id'].astype(str).str.zfill(7)

# Left join keeps every protagonist row; afterwards the join key and the
# metadata's own copy of imdb_id are removed again
movie_data = pd.merge(
    movie_data,
    metadata_df,
    how='left',
    left_on='imdb_id_padded',
    right_on='imdb_id_padded',
    suffixes=('', '_meta'),
).drop(columns=['imdb_id_padded', 'imdb_id_meta'], errors='ignore')

# Derive where each movie's full script is expected: "<movie_name>_<imdb_id>.txt"
# inside the raw-text directory
script_filenames = movie_data['movie_name'] + '_' + movie_data['imdb_id'] + '.txt'
movie_data['script_filename'] = script_filenames
movie_data['script_path'] = 'data/screenplay_data/data/raw_texts/raw_texts/' + script_filenames

# Record whether that file is actually present on disk
movie_data['script_exists'] = movie_data['script_path'].map(os.path.exists)

# Drop the movies whose script file is missing
has_script = movie_data['script_exists']
movie_data = movie_data[has_script].copy()

# Report what is left
remaining_movies = movie_data['movie_name'].nunique()
remaining_rows = len(movie_data)
print(f"Remaining movies with scripts: {remaining_movies}")
print(f"Remaining rows: {remaining_rows}")
print(movie_data.head())

In [None]:
# Display the first 10 rows of the protagonist table
movie_data.head(10)

In [None]:
# Display the last 20 rows of the protagonist table
movie_data.tail(20)

In [None]:
# Descriptive statistics of the protagonists' line counts, printed under a heading
line_count_summary = movie_data['line_count'].describe()
print("Summary statistics for protagonist line counts:", line_count_summary, sep="\n")

In [None]:
# Read the raw movie metadata table
raw_metadata = pd.read_csv("data/movie_metadata/movie_meta_data.csv")

# Keep the relevant columns and give three of them clearer names
# ('title' and 'genres' already have the desired names)
metadata_df = raw_metadata[['imdbid', 'title', 'directors', 'genres', 'imdb user rating']].rename(
    columns={
        'imdbid': 'imdb_id',
        'directors': 'director',
        'imdb user rating': 'imdb_rating',
    }
)

print(f"Metadata shape: {metadata_df.shape}")
print("\nMetadata sample:")
metadata_df.head()

In [None]:
# Merge protagonist data with movie metadata
# Convert imdb_id to string and pad with leading zeros to 7 digits for consistent format
movie_data['imdb_id_padded'] = movie_data['imdb_id'].astype(str).str.zfill(7)
metadata_df['imdb_id_padded'] = metadata_df['imdb_id'].astype(str).str.zfill(7)

# Columns this merge contributes. If movie_data already carries them (the metadata
# was merged by an earlier cell, or this cell is being re-run), drop the stale copies
# first; otherwise pandas keeps both versions and leaves '_meta'-suffixed duplicates
# such as 'title_meta' in the table.
metadata_columns = [
    col for col in metadata_df.columns
    if col not in ('imdb_id', 'imdb_id_padded')
]
movie_data = movie_data.drop(columns=metadata_columns, errors='ignore')

# Merge on the padded imdb_id (left join: every protagonist row is kept)
movie_data = movie_data.merge(
    metadata_df,
    left_on='imdb_id_padded',
    right_on='imdb_id_padded',
    how='left',
    suffixes=('', '_meta')
)

# Drop the temporary padded column and the duplicate imdb_id from metadata
movie_data = movie_data.drop(columns=['imdb_id_padded', 'imdb_id_meta'], errors='ignore')

print(f"Merged DataFrame shape: {movie_data.shape}")
print("\nMerge statistics:")
print(f"  - Protagonists matched with metadata: {movie_data['title'].notna().sum()}")
print(f"  - Protagonists without metadata: {movie_data['title'].isna().sum()}")

movie_data.head(10)

In [None]:
# Add column with path to full script
import os

# Expected script file name is "<movie_name>_<imdb_id>.txt"; the path points
# into the raw-text directory
script_filenames = movie_data['movie_name'] + '_' + movie_data['imdb_id'] + '.txt'
movie_data['script_filename'] = script_filenames
movie_data['script_path'] = 'data/screenplay_data/data/raw_texts/raw_texts/' + script_filenames

# Flag which of those files are really on disk
movie_data['script_exists'] = movie_data['script_path'].map(os.path.exists)

found = movie_data['script_exists']
print(f"Scripts found: {found.sum()}")
print(f"Scripts not found: {(~found).sum()}")

# A few rows from each group for a quick visual check
print("\nExamples with scripts:")
print(movie_data.loc[found, ['movie_name', 'character_name', 'script_path']].head(5))

print("\nExamples without scripts:")
print(movie_data.loc[~found, ['movie_name', 'character_name', 'script_filename']].head(5))

In [None]:
# Keep only the movies whose full script file was found on disk
has_script = movie_data['script_exists']
movie_data = movie_data[has_script].copy()

remaining_movies = movie_data['movie_name'].nunique()
remaining_rows = len(movie_data)
print(f"Remaining movies with scripts: {remaining_movies}")
print(f"Remaining rows: {remaining_rows}")

In [None]:
# Print the beginning of every thousandth script (the 1st, 1001st, 2001st, ... row).
# Rows are picked by position: once rows with missing scripts have been filtered out,
# the index labels are no longer consecutive, so testing `label % 1000` would sample
# the table unevenly.
for idx, script_path in movie_data['script_path'].iloc[::1000].items():
    try:
        with open(script_path, 'r', encoding='utf-8', errors='ignore') as f:
            # idx is the row's index label, printed only to identify the row
            print(f"Script {idx}: {script_path}")
            print(f.read(100))  # Print the first 100 characters of the script
    except FileNotFoundError:
        print(f"Script not found: {script_path}")

In [None]:
# One-hot encode the ', '-separated genre strings: one 0/1 indicator column per genre
genre_dummies = movie_data['genres'].str.get_dummies(sep=', ')

# Count how many rows carry each genre and keep the 12 most frequent ones
genre_totals = genre_dummies.sum()
most_common_genres = genre_totals.nlargest(12).index
filtered_genres = genre_dummies.loc[:, most_common_genres]

# Append the selected indicator columns to the main table
movie_data = pd.concat([movie_data, filtered_genres], axis=1)

print("Top 12 genres added as dummy variables:")
print(filtered_genres.columns)

# Keep only the last 3 rows.
# NOTE(review): everything after this point (rating scraping, CSV export) therefore
# runs on 3 movies only; this looks like a debugging limit, so confirm before a full run.
movie_data = movie_data.iloc[-3:].copy()

In [None]:
# Simple IMDb decimal-rating scraper using Selenium (Safari)
# Insert your preferred CSS selector in RATING_SELECTOR (see marker below).
# Note: Safari's WebDriver (safaridriver) must be enabled on macOS:
#   Run `safaridriver --enable` once in a terminal if not already enabled.
# Safari on macOS does not support a headless mode via safaridriver, so the browser UI may appear when scraping.

import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# --- PUT YOUR RATING CSS SELECTOR ON THE NEXT LINE ---
# Example selectors you can try:
# 1) 'div[data-testid="hero-rating-bar__aggregate-rating__score"] span'
# 2) 'span[itemprop="ratingValue"]'
# Set RATING_SELECTOR to whichever works for your environment/page version.
# fetch_decimal_rating() looks this selector up on every title page it loads.
RATING_SELECTOR = 'div[data-testid="hero-rating-bar__aggregate-rating__score"] span'

# Start Safari webdriver (requires safaridriver enabled)
# If you prefer Chrome/Firefox, switch to the corresponding webdriver initialization.
# NOTE: this opens a browser session as soon as the cell runs; the session stays
# open until driver.quit() is called.
driver = webdriver.Safari()
# Shared explicit wait: each wait.until(...) call gives up after 6 seconds.
wait = WebDriverWait(driver, 6)

def fetch_decimal_rating(imdb_id):
    """Return the rating shown on a title's IMDb page as a float, or None.

    Loads ``https://www.imdb.com/title/tt<id>/`` in the module-level Selenium
    ``driver`` and reads the text of the first element matching
    ``RATING_SELECTOR``.

    Args:
        imdb_id: Numeric part of the IMDb id (str or int). It is converted to
            text and left-padded with zeros to 7 characters.

    Returns:
        The parsed rating, or None when the page could not be loaded, the
        rating element did not appear before ``wait`` timed out, or its text
        is empty or not a number. The function never raises for a failed fetch.
    """
    # Imported here so the function carries its own dependency
    from selenium.common.exceptions import TimeoutException

    imdb_str = str(imdb_id).zfill(7)
    url = f"https://www.imdb.com/title/tt{imdb_str}/"
    try:
        driver.get(url)
        # Wait until the rating element appears (short timeout)
        el = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, RATING_SELECTOR)))
        text = el.text.strip()
    except TimeoutException:
        # Expected whenever the rating element is missing or the page was blocked
        return None
    except Exception as e:
        # Best effort: one bad page must not abort the whole scrape, but unexpected
        # failures (e.g. a dead browser session) are reported instead of hidden
        print(f"fetch error tt{imdb_str}: {e}")
        return None

    if not text:
        return None

    # Text is typically like '7.8' or '8'. If more text follows, only the first
    # whitespace-separated token is used; a decimal comma ('7,8') is tolerated in
    # case the page is rendered with a localized number format.
    first_token = text.split()[0].replace(',', '.')
    try:
        return float(first_token)
    except ValueError:
        return None

# Build rating map for unique ids
unique_ids = movie_data['imdb_id'].astype(str).unique()
rating_map = {}
try:
    for i, iid in enumerate(unique_ids, 1):
        # astype(str) turns missing ids into literals such as 'nan'; those and
        # blank ids cannot be looked up, so they get no rating
        if iid.strip().lower() in ("", "nan", "none", "<na>"):
            rating_map[iid] = None
            continue
        rating = fetch_decimal_rating(iid)
        print(f"Fetched rating for tt{iid}: {rating}")
        rating_map[iid] = rating
        # polite pause
        time.sleep(1 + random.uniform(0, 1.5))
        if i % 100 == 0:
            print(f"Fetched {i}/{len(unique_ids)}")
finally:
    # cleanup: close the browser even when the loop raises or is interrupted,
    # so no Safari WebDriver session is left behind
    driver.quit()

# Map back into dataframe and prefer decimal when available
movie_data['imdb_rating_decimal'] = movie_data['imdb_id'].astype(str).map(rating_map)

# persist results
movie_data.to_csv(output_path, index=False)
print(f"Updated {output_path} with scraped decimal ratings (where found).")
print(movie_data[['imdb_id','imdb_rating', 'imdb_rating_decimal']].head())

In [None]:
#load kaggle data
# Read imdb_id as text: the notebook handles ids as zero-padded strings, and the
# default integer parsing would strip their leading zeros
kaggle_data = pd.read_csv("data/kaggle_data.csv", dtype={"imdb_id": str})

#LLM_annotate