In [1]:
import pandas as pd
import os
from pathlib import Path
# CSV file the assembled per-movie table is written to by later cells
output_path = "data/kaggle_data.csv"

In [2]:
# Path to the movie character texts directory
base_path = Path("data/movie_characters/data/movie_character_texts/movie_character_texts")

# Every character file is named "<character>_text.txt"
CHARACTER_FILE_SUFFIX = "_text"

# Directory holding the full screenplay of each movie
SCRIPT_DIR = 'data/screenplay_data/data/raw_texts/raw_texts/'

# One record per (movie, character) holding the number of lines in that character's file
results = []

# Each movie directory is named "<movie_name>_<imdb_id>"
for movie_dir in sorted(base_path.iterdir()):
    if not movie_dir.is_dir():
        continue

    # Split on the LAST underscore only, because movie names may contain underscores
    movie_name, separator, imdb_id = movie_dir.name.rpartition('_')
    if not separator:
        # No underscore at all: the whole directory name is the movie name
        movie_name, imdb_id = movie_dir.name, ""

    # Sorted so that ties in line count are always broken the same way below
    for character_file in sorted(movie_dir.glob(f"*{CHARACTER_FILE_SUFFIX}.txt")):
        # Cut off only the trailing "_text"; str.replace would also delete any
        # "_text" occurring inside the character name itself
        character_name = character_file.stem[:-len(CHARACTER_FILE_SUFFIX)]

        # Count lines while streaming instead of loading the whole file into memory
        with open(character_file, 'r', encoding='utf-8', errors='ignore') as f:
            line_count = sum(1 for _ in f)

        results.append({
            'movie_name': movie_name,
            'imdb_id': imdb_id,
            'character_name_kaggle': character_name,
            'line_count': line_count
        })

# Fail with a clear message instead of a KeyError from the groupby below
if not results:
    raise FileNotFoundError(f"No '*{CHARACTER_FILE_SUFFIX}.txt' character files found under {base_path}")

# Create DataFrame
kaggle_data = pd.DataFrame(results)

# Keep only the character with the most lines per movie. Grouping on name AND
# IMDb id keeps distinct films that share a title apart; on a tie idxmax
# returns the first row, so exactly one character per movie survives.
top_character_rows = kaggle_data.groupby(['movie_name', 'imdb_id'])['line_count'].idxmax()
kaggle_data = kaggle_data.loc[top_character_rows].copy()

# Load movie metadata and keep only the columns used downstream, under shorter names
metadata_df = pd.read_csv("data/movie_metadata/movie_meta_data.csv")
metadata_df = metadata_df[['imdbid', 'title', 'directors', 'genres', 'imdb user rating', 'year', 'plot']].rename(
    columns={
        'imdbid': 'imdb_id',
        'directors': 'director',
        'imdb user rating': 'imdb_rating',
    }
)

# Merge protagonist data with movie metadata on a 7-digit zero-padded IMDb id,
# so ids compare equal whether or not a source kept the leading zeros
kaggle_data['imdb_id_padded'] = kaggle_data['imdb_id'].astype(str).str.zfill(7)
metadata_df['imdb_id_padded'] = metadata_df['imdb_id'].astype(str).str.zfill(7)
kaggle_data = kaggle_data.merge(
    # An id listed twice in the metadata would otherwise duplicate the movie's row
    metadata_df.drop_duplicates(subset='imdb_id_padded'),
    on='imdb_id_padded',
    how='left',
    suffixes=('', '_meta')
)
kaggle_data = kaggle_data.drop(columns=['imdb_id_padded', 'imdb_id_meta'], errors='ignore')

# Add column with path to full script
kaggle_data['script_filename'] = kaggle_data['movie_name'] + '_' + kaggle_data['imdb_id'] + '.txt'
kaggle_data['script_path'] = SCRIPT_DIR + kaggle_data['script_filename']
kaggle_data['script_exists'] = kaggle_data['script_path'].apply(os.path.exists)

# Remove rows where script files are missing
kaggle_data = kaggle_data[kaggle_data['script_exists']].copy()

# Display summary
print(f"Remaining movies with scripts: {kaggle_data['movie_name'].nunique()}")
print(f"Remaining rows: {kaggle_data.shape[0]}")

Remaining movies with scripts: 2138
Remaining rows: 2138


In [3]:
# Show which columns the assembled table has at this point
kaggle_data.columns

Index(['movie_name', 'imdb_id', 'character_name_kaggle', 'line_count', 'title',
       'director', 'genres', 'imdb_rating', 'year', 'plot', 'script_filename',
       'script_path', 'script_exists'],
      dtype='object')

In [None]:
# # Load movie metadata
# metadata_df = pd.read_csv("data/movie_metadata/movie_meta_data.csv")

# # Select relevant columns
# metadata_df = metadata_df[['imdbid', 'title', 'directors', 'genres', 'imdb user rating']].copy()

# # Rename columns for clarity
# metadata_df.columns = ['imdb_id', 'title', 'director', 'genres', 'imdb_rating']

# print(f"Metadata shape: {metadata_df.shape}")
# print(f"\nMetadata sample:")
# metadata_df.head()

In [None]:
# # Merge protagonist data with movie metadata
# # Convert imdb_id to string and pad with leading zeros to 7 digits for consistent format
# kaggle_data['imdb_id_padded'] = kaggle_data['imdb_id'].astype(str).str.zfill(7)
# metadata_df['imdb_id_padded'] = metadata_df['imdb_id'].astype(str).str.zfill(7)

# # Merge on the padded imdb_id
# kaggle_data = kaggle_data.merge(
#     metadata_df, 
#     left_on='imdb_id_padded',
#     right_on='imdb_id_padded',
#     how='left',
#     suffixes=('', '_meta')
# )

# # Drop the temporary padded column and the duplicate imdb_id from metadata
# kaggle_data = kaggle_data.drop(columns=['imdb_id_padded', 'imdb_id_meta'], errors='ignore')

# print(f"Merged DataFrame shape: {kaggle_data.shape}")
# print(f"\nMerge statistics:")
# print(f"  - Protagonists matched with metadata: {kaggle_data['title'].notna().sum()}")
# print(f"  - Protagonists without metadata: {kaggle_data['title'].isna().sum()}")

Create a column with the file path of each script

In [None]:
# # Create the script file path by combining movie_name and imdb_id
# kaggle_data['script_filename'] = kaggle_data['movie_name'] + '_' + kaggle_data['imdb_id'] + '.txt'
# kaggle_data['script_path'] = 'data/screenplay_data/data/raw_texts/raw_texts/' + kaggle_data['script_filename']

# # Check if the script files actually exist
# kaggle_data['script_exists'] = kaggle_data['script_path'].apply(
#     lambda x: os.path.exists(x)
# )

# print(f"Scripts found: {kaggle_data['script_exists'].sum()}")
# print(f"Scripts not found: {(~kaggle_data['script_exists']).sum()}")

# print("\nExamples with scripts:")
# print(kaggle_data[kaggle_data['script_exists']][['movie_name', 'character_name', 'script_path']].head(5))

# print("\nExamples without scripts:")
# print(kaggle_data[~kaggle_data['script_exists']][['movie_name', 'character_name', 'script_filename']].head(5))

Check for missing scripts

In [None]:
# # Remove rows where script files are missing
# print(f"Before removal: {kaggle_data.shape[0]} rows, {kaggle_data['movie_name'].nunique()} unique movies")
# kaggle_data = kaggle_data[kaggle_data['script_exists']].copy()

# print(f"Remaining movies with scripts: {kaggle_data['movie_name'].nunique()}")
# print(f"Remaining rows: {kaggle_data.shape[0]}")

Dummy-code the genres

In [4]:
# Dummy code genres and keep only the most common ones
N_TOP_GENRES = 12

# One 0/1 indicator column per genre found in the ', '-separated 'genres' strings
genre_dummies = kaggle_data['genres'].str.get_dummies(sep=', ')

# Count the movies carrying each genre and keep the N_TOP_GENRES most frequent
most_common_genres = genre_dummies.sum().nlargest(N_TOP_GENRES).index
filtered_genres = genre_dummies[most_common_genres]

# Add the filtered genres back to the main DataFrame. Indicator columns left
# over from an earlier run of this cell are dropped first, so re-running the
# cell does not append a second copy of every genre column.
kaggle_data = pd.concat(
    [kaggle_data.drop(columns=most_common_genres, errors='ignore'), filtered_genres],
    axis=1,
)

Add a decimal version of the IMDb rating via web scraping

In [5]:
# Simple IMDb decimal-rating scraper using Selenium (Safari)
# Insert your preferred CSS selector in RATING_SELECTOR (see marker below).
# Note: Safari's WebDriver (safaridriver) must be enabled on macOS:
#   Run `safaridriver --enable` once in a terminal if not already enabled.
# Safari on macOS does not support a headless mode via safaridriver, so the browser UI may appear when scraping.

import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# --- SET YOUR RATING CSS SELECTOR IN RATING_SELECTOR BELOW ---
# Example selectors you can try:
# 1) 'div[data-testid="hero-rating-bar__aggregate-rating__score"] span'
# 2) 'span[itemprop="ratingValue"]'
# Set RATING_SELECTOR to whichever works for your environment/page version.
RATING_SELECTOR = 'div[data-testid="hero-rating-bar__aggregate-rating__score"] span'

# Seconds to wait for the rating element before giving up on a page
RATING_WAIT_SECONDS = 6

# Start Safari webdriver (requires safaridriver enabled)
# If you prefer Chrome/Firefox, switch to the corresponding webdriver initialization.
driver = webdriver.Safari()
wait = WebDriverWait(driver, RATING_WAIT_SECONDS)

def fetch_decimal_rating(imdb_id):
    """Scrape the decimal rating shown on a title's IMDb page.

    Uses the module-level ``driver`` and ``wait`` objects and the
    ``RATING_SELECTOR`` CSS selector.

    Args:
        imdb_id: IMDb title id without the "tt" prefix; it is converted to a
            string and left-padded with zeros to 7 characters.

    Returns:
        The rating as a float, or None when the page could not be loaded, the
        rating element did not appear before the wait timed out, or its text
        could not be parsed as a number.
    """
    padded_id = str(imdb_id).zfill(7)
    rating_locator = (By.CSS_SELECTOR, RATING_SELECTOR)

    try:
        driver.get(f"https://www.imdb.com/title/tt{padded_id}/")
        # Block until the rating element is present (the timeout is set on `wait`)
        rating_element = wait.until(EC.presence_of_element_located(rating_locator))
        rating_text = rating_element.text.strip()
    except Exception:
        # Deliberately best-effort: any load, lookup or timeout failure means "no rating"
        return None

    if not rating_text:
        return None

    # Text is typically like '7.8' or '8'
    try:
        return float(rating_text)
    except ValueError:
        pass

    # Sometimes the element holds extra whitespace-separated text - try the first token
    try:
        return float(rating_text.split()[0])
    except ValueError:
        return None

# File used to cache scraped ratings between runs, and how often to checkpoint it
INTERMEDIATE_RATINGS_PATH = "data/intermediate_ratings.csv"
CHECKPOINT_EVERY = 10

# Build rating map for unique ids
unique_ids = kaggle_data['imdb_id'].astype(str).unique()

# If an intermediate file exists, load it. read_csv may parse the ids as
# integers (dropping leading zeros), so every key is re-padded to 7 digits.
if os.path.exists(INTERMEDIATE_RATINGS_PATH):
    cached_ratings = pd.read_csv(INTERMEDIATE_RATINGS_PATH, index_col=0).to_dict()["0"]
    rating_map = {str(k).zfill(7): v for k, v in cached_ratings.items()}
else:
    rating_map = {}

unsaved_ratings = 0
try:
    for i, raw_id in enumerate(unique_ids, 1):
        # Blank ids cannot be looked up; they simply end up without a decimal rating
        if not raw_id.strip():
            continue

        # Use the same padded form as the cache keys, otherwise cached ids
        # without leading zeros would be fetched again on every run
        iid = raw_id.zfill(7)
        if iid in rating_map:
            continue

        print(f"Fetching tt{iid}...")
        rating = fetch_decimal_rating(iid)
        # Failed lookups are not recorded, so they are retried on the next run
        if rating is not None:
            rating_map[iid] = rating
            unsaved_ratings += 1

        # polite pause
        time.sleep(2 + random.uniform(0, 1.5))

        if i % CHECKPOINT_EVERY == 0:
            print(f"Fetched {i}/{len(unique_ids)}")
            # Checkpoint so an interrupted run does not lose its progress
            pd.Series(rating_map).to_csv(INTERMEDIATE_RATINGS_PATH)
            unsaved_ratings = 0
finally:
    try:
        # Persist ratings fetched since the last checkpoint; without this the
        # tail of the run (or everything before an interruption) is never cached
        if unsaved_ratings:
            pd.Series(rating_map).to_csv(INTERMEDIATE_RATINGS_PATH)
    finally:
        # cleanup: always release the browser, even if the loop failed
        driver.quit()

# Map back into the dataframe using the same 7-digit padded key as rating_map
kaggle_data['imdb_rating_decimal'] = kaggle_data['imdb_id'].astype(str).str.zfill(7).map(rating_map)

# persist results
kaggle_data.to_csv(output_path, index=False)
print(f"Updated {output_path} with scraped decimal ratings (where found).")
print(kaggle_data[['imdb_id','imdb_rating', 'imdb_rating_decimal']].head())

Fetching tt12734734...
Fetching tt1781782...
Fetching tt0946528...
Fetching tt10871176...
Fetching tt2419232...
Fetching tt0417952...
Fetching tt1282592...
Fetching tt8505542...
Fetching tt0483314...
Fetching tt0581053...
Fetching tt0068451...
Fetching tt3328956...
Fetching tt5660414...
Fetching tt0080180...
Fetching tt0120907...
Fetching tt0295701...
Updated data/kaggle_data.csv with scraped decimal ratings (where found).
   imdb_id  imdb_rating  imdb_rating_decimal
0  1179933            7                  7.2
1  0147800            7                  7.4
2  0118528            7                  7.8
3  0114746            8                  8.0
4  2024544            8                  8.1


In [6]:
# Write the current table to output_path again (the scraping cell above already does this once)
kaggle_data.to_csv(output_path, index=False)

Remove negative IMDb ratings

In [7]:
#load kaggle data
import pandas as pd

KAGGLE_DATA_PATH = "data/kaggle_data.csv"

# Read imdb_id as text: if read_csv infers an integer dtype the leading zeros
# are stripped (e.g. "0147800" -> 147800) and the file is rewritten without them
kaggle_data = pd.read_csv(KAGGLE_DATA_PATH, dtype={'imdb_id': str})

#set negative imdb ratings to none
# A rating whose text form starts with '-' is treated as negative
is_negative_rating = kaggle_data['imdb_rating'].astype(str).str.startswith('-')
kaggle_data.loc[is_negative_rating, 'imdb_rating'] = None

kaggle_data.to_csv(KAGGLE_DATA_PATH, index=False)