# Count Lines Per Protagonist

This notebook counts the number of lines per character in each movie and keeps only the character with the most lines (the first one if there are ties).

In [1]:
import pandas as pd
import os
from pathlib import Path
output_path = "data/kaggle_data.csv"

In [2]:
# Path to the movie character texts directory
base_path = Path("data/movie_characters/data/movie_character_texts/movie_character_texts")

# List to store results
results = []

# Iterate through each movie directory
for movie_dir in sorted(base_path.iterdir()):
    if movie_dir.is_dir():
        # Extract movie name and IMDB ID
        movie_name_id = movie_dir.name
        parts = movie_name_id.rsplit('_', 1)
        if len(parts) == 2:
            movie_name = parts[0]
            imdb_id = parts[1]
        else:
            movie_name = movie_name_id
            imdb_id = ""
        
        # Iterate through each character file in the movie directory
        for character_file in movie_dir.glob("*_text.txt"):
            # Extract character name (remove '_text.txt' suffix)
            character_name = character_file.stem.replace('_text', '')
            
            # Count lines in the file
            with open(character_file, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
                line_count = len(lines)
            
            # Add to results
            results.append({
                'movie_name': movie_name,
                'imdb_id': imdb_id,
                'character_name': character_name,
                'line_count': line_count
            })

# Create DataFrame
kaggle_data = pd.DataFrame(results)

# Keep only the first character with the most lines per movie (no ties)
kaggle_data = kaggle_data.loc[kaggle_data.groupby('movie_name')['line_count'].idxmax()].copy()

# Load movie metadata
metadata_df = pd.read_csv("data/movie_metadata/movie_meta_data.csv")
metadata_df = metadata_df[['imdbid', 'title', 'directors', 'genres', 'imdb user rating']].copy()
metadata_df.columns = ['imdb_id', 'title', 'director', 'genres', 'imdb_rating']

# Merge protagonist data with movie metadata
kaggle_data['imdb_id_padded'] = kaggle_data['imdb_id'].astype(str).str.zfill(7)
metadata_df['imdb_id_padded'] = metadata_df['imdb_id'].astype(str).str.zfill(7)
kaggle_data = kaggle_data.merge(
    metadata_df, 
    left_on='imdb_id_padded',
    right_on='imdb_id_padded',
    how='left',
    suffixes=('', '_meta')
)
kaggle_data = kaggle_data.drop(columns=['imdb_id_padded', 'imdb_id_meta'], errors='ignore')

# Add column with path to full script
kaggle_data['script_filename'] = kaggle_data['movie_name'] + '_' + kaggle_data['imdb_id'] + '.txt'
kaggle_data['script_path'] = 'data/screenplay_data/data/raw_texts/raw_texts/' + kaggle_data['script_filename']
kaggle_data['script_exists'] = kaggle_data['script_path'].apply(lambda x: os.path.exists(x))

# Remove rows where script files are missing
kaggle_data = kaggle_data[kaggle_data['script_exists']].copy()

# Display summary
print(f"Remaining movies with scripts: {kaggle_data['movie_name'].nunique()}")
print(f"Remaining rows: {kaggle_data.shape[0]}")
print(kaggle_data.head())

Remaining movies with scripts: 2138
Remaining rows: 2138
                   movie_name  imdb_id   character_name  line_count  \
0         10 Cloverfield Lane  1179933           Howard         316   
1  10 Things I Hate About You  0147800    Kat Stratford         387   
2                12 Angry Men  0118528          Foreman          98   
3                  12 Monkeys  0114746   Jeffrey Goines          97   
4            12 Years a Slave  2024544  Solomon Northup         289   

                        title          director  \
0         10 Cloverfield Lane  Dan Trachtenberg   
1  10 Things I Hate About You        Gil Junger   
2                12 Angry Men  William Friedkin   
3                  12 Monkeys     Terry Gilliam   
4            12 Years a Slave     Steve McQueen   

                                             genres  imdb_rating  \
0  Action, Drama, Horror, Mystery, Sci-Fi, Thriller            7   
1                            Comedy, Drama, Romance            7   
2    

In [3]:
# Display first few rows
kaggle_data.head(10)

Unnamed: 0,movie_name,imdb_id,character_name,line_count,title,director,genres,imdb_rating,script_filename,script_path,script_exists
0,10 Cloverfield Lane,1179933,Howard,316,10 Cloverfield Lane,Dan Trachtenberg,"Action, Drama, Horror, Mystery, Sci-Fi, Thriller",7,10 Cloverfield Lane_1179933.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True
1,10 Things I Hate About You,147800,Kat Stratford,387,10 Things I Hate About You,Gil Junger,"Comedy, Drama, Romance",7,10 Things I Hate About You_0147800.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True
2,12 Angry Men,118528,Foreman,98,12 Angry Men,William Friedkin,"Crime, Drama",7,12 Angry Men_0118528.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True
3,12 Monkeys,114746,Jeffrey Goines,97,12 Monkeys,Terry Gilliam,"Mystery, Sci-Fi, Thriller",8,12 Monkeys_0114746.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True
4,12 Years a Slave,2024544,Solomon Northup,289,12 Years a Slave,Steve McQueen,"Biography, Drama, History",8,12 Years a Slave_2024544.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True
5,127 Hours,1542344,Aron Ralston,183,127 Hours,Danny Boyle,"Biography, Drama",7,127 Hours_1542344.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True
6,13 13 13,2991516,Jack,362,13/13/13,James Cullen Bressack,Horror,3,13 13 13_2991516.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True
7,1408,450385,Mike Enslin,445,1408,Mikael Håfström,"Fantasy, Horror, Mystery",6,1408_0450385.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True
8,1492 Conquest of Paradise,103594,Columbus,488,1492: Conquest of Paradise,Ridley Scott,"Adventure, Biography, Drama, History",6,1492 Conquest of Paradise_0103594.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True
9,15 Minutes,179626,Emil Slovak,188,15 Minutes,John Herzfeld,"Action, Crime, Drama, Thriller",6,15 Minutes_0179626.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True


In [4]:
# Load movie metadata
metadata_df = pd.read_csv("data/movie_metadata/movie_meta_data.csv")

# Select relevant columns
metadata_df = metadata_df[['imdbid', 'title', 'directors', 'genres', 'imdb user rating']].copy()

# Rename columns for clarity
metadata_df.columns = ['imdb_id', 'title', 'director', 'genres', 'imdb_rating']

print(f"Metadata shape: {metadata_df.shape}")
print(f"\nMetadata sample:")
metadata_df.head()

Metadata shape: (2858, 5)

Metadata sample:


Unnamed: 0,imdb_id,title,director,genres,imdb_rating
0,120770,A Night at the Roxbury,John Fortenberry,"Comedy, Music, Romance",6
1,132512,At First Sight,Irwin Winkler,"Drama, Romance",6
2,118661,The Avengers,Jeremiah S. Chechik,"Action, Adventure, Sci-Fi, Thriller",3
3,215545,Bamboozled,Spike Lee,"Comedy, Drama, Music",6
4,118715,The Big Lebowski,"Joel Coen, Ethan Coen","Comedy, Crime, Sport",8


In [5]:
# Merge protagonist data with movie metadata
# Convert imdb_id to string and pad with leading zeros to 7 digits for consistent format
kaggle_data['imdb_id_padded'] = kaggle_data['imdb_id'].astype(str).str.zfill(7)
metadata_df['imdb_id_padded'] = metadata_df['imdb_id'].astype(str).str.zfill(7)

# Merge on the padded imdb_id
kaggle_data = kaggle_data.merge(
    metadata_df, 
    left_on='imdb_id_padded',
    right_on='imdb_id_padded',
    how='left',
    suffixes=('', '_meta')
)

# Drop the temporary padded column and the duplicate imdb_id from metadata
kaggle_data = kaggle_data.drop(columns=['imdb_id_padded', 'imdb_id_meta'], errors='ignore')

print(f"Merged DataFrame shape: {kaggle_data.shape}")
print(f"\nMerge statistics:")
print(f"  - Protagonists matched with metadata: {kaggle_data['title'].notna().sum()}")
print(f"  - Protagonists without metadata: {kaggle_data['title'].isna().sum()}")

kaggle_data.head(10)

Merged DataFrame shape: (2138, 15)

Merge statistics:
  - Protagonists matched with metadata: 2138
  - Protagonists without metadata: 0


Unnamed: 0,movie_name,imdb_id,character_name,line_count,title,director,genres,imdb_rating,script_filename,script_path,script_exists,title_meta,director_meta,genres_meta,imdb_rating_meta
0,10 Cloverfield Lane,1179933,Howard,316,10 Cloverfield Lane,Dan Trachtenberg,"Action, Drama, Horror, Mystery, Sci-Fi, Thriller",7,10 Cloverfield Lane_1179933.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True,10 Cloverfield Lane,Dan Trachtenberg,"Action, Drama, Horror, Mystery, Sci-Fi, Thriller",7
1,10 Things I Hate About You,147800,Kat Stratford,387,10 Things I Hate About You,Gil Junger,"Comedy, Drama, Romance",7,10 Things I Hate About You_0147800.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True,10 Things I Hate About You,Gil Junger,"Comedy, Drama, Romance",7
2,12 Angry Men,118528,Foreman,98,12 Angry Men,William Friedkin,"Crime, Drama",7,12 Angry Men_0118528.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True,12 Angry Men,William Friedkin,"Crime, Drama",7
3,12 Monkeys,114746,Jeffrey Goines,97,12 Monkeys,Terry Gilliam,"Mystery, Sci-Fi, Thriller",8,12 Monkeys_0114746.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True,12 Monkeys,Terry Gilliam,"Mystery, Sci-Fi, Thriller",8
4,12 Years a Slave,2024544,Solomon Northup,289,12 Years a Slave,Steve McQueen,"Biography, Drama, History",8,12 Years a Slave_2024544.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True,12 Years a Slave,Steve McQueen,"Biography, Drama, History",8
5,127 Hours,1542344,Aron Ralston,183,127 Hours,Danny Boyle,"Biography, Drama",7,127 Hours_1542344.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True,127 Hours,Danny Boyle,"Biography, Drama",7
6,13 13 13,2991516,Jack,362,13/13/13,James Cullen Bressack,Horror,3,13 13 13_2991516.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True,13/13/13,James Cullen Bressack,Horror,3
7,1408,450385,Mike Enslin,445,1408,Mikael Håfström,"Fantasy, Horror, Mystery",6,1408_0450385.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True,1408,Mikael Håfström,"Fantasy, Horror, Mystery",6
8,1492 Conquest of Paradise,103594,Columbus,488,1492: Conquest of Paradise,Ridley Scott,"Adventure, Biography, Drama, History",6,1492 Conquest of Paradise_0103594.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True,1492: Conquest of Paradise,Ridley Scott,"Adventure, Biography, Drama, History",6
9,15 Minutes,179626,Emil Slovak,188,15 Minutes,John Herzfeld,"Action, Crime, Drama, Thriller",6,15 Minutes_0179626.txt,data/screenplay_data/data/raw_texts/raw_texts/...,True,15 Minutes,John Herzfeld,"Action, Crime, Drama, Thriller",6


In [6]:
# Create the script file path by combining movie_name and imdb_id
kaggle_data['script_filename'] = kaggle_data['movie_name'] + '_' + kaggle_data['imdb_id'] + '.txt'
kaggle_data['script_path'] = 'data/screenplay_data/data/raw_texts/raw_texts/' + kaggle_data['script_filename']

# Check if the script files actually exist
kaggle_data['script_exists'] = kaggle_data['script_path'].apply(
    lambda x: os.path.exists(x)
)

print(f"Scripts found: {kaggle_data['script_exists'].sum()}")
print(f"Scripts not found: {(~kaggle_data['script_exists']).sum()}")

# Show some examples
print("\nExamples with scripts:")
print(kaggle_data[kaggle_data['script_exists']][['movie_name', 'character_name', 'script_path']].head(5))

print("\nExamples without scripts:")
print(kaggle_data[~kaggle_data['script_exists']][['movie_name', 'character_name', 'script_filename']].head(5))

Scripts found: 2138
Scripts not found: 0

Examples with scripts:
                   movie_name   character_name  \
0         10 Cloverfield Lane           Howard   
1  10 Things I Hate About You    Kat Stratford   
2                12 Angry Men          Foreman   
3                  12 Monkeys   Jeffrey Goines   
4            12 Years a Slave  Solomon Northup   

                                         script_path  
0  data/screenplay_data/data/raw_texts/raw_texts/...  
1  data/screenplay_data/data/raw_texts/raw_texts/...  
2  data/screenplay_data/data/raw_texts/raw_texts/...  
3  data/screenplay_data/data/raw_texts/raw_texts/...  
4  data/screenplay_data/data/raw_texts/raw_texts/...  

Examples without scripts:
Empty DataFrame
Columns: [movie_name, character_name, script_filename]
Index: []


In [7]:
# Remove rows where script files are missing
kaggle_data = kaggle_data[kaggle_data['script_exists']].copy()

print(f"Remaining movies with scripts: {kaggle_data['movie_name'].nunique()}")
print(f"Remaining rows: {kaggle_data.shape[0]}")

Remaining movies with scripts: 2138
Remaining rows: 2138


In [8]:
# Loop over rows and print the beginning of every thousandth script
for idx, row in kaggle_data.iterrows():
    if idx % 1000 == 0:
        script_path = row['script_path']
        try:
            with open(script_path, 'r', encoding='utf-8', errors='ignore') as f:
                print(f"Script {idx}: {script_path}")
                print(f.read(100))  # Print the first 100 characters of the script
        except FileNotFoundError:
            print(f"Script not found: {script_path}")

Script 0: data/screenplay_data/data/raw_texts/raw_texts/10 Cloverfield Lane_1179933.txt
The Cellar

by
Josh Campbell & Matt Stuecken
DARKNESS

And then --

A GUNNED ENGINE --

BLURRED HEA
Script 1000: data/screenplay_data/data/raw_texts/raw_texts/McCabe Mrs Miller_0067411.txt
 

 

FOR EDUCATIONAL

 

 

PURPOSES ONLY

1.

2.

Mc CARE + MRS. MILLER

OTHE PRESBYTERIAN CHURCH 
Script 2000: data/screenplay_data/data/raw_texts/raw_texts/Trainwreck_3152624.txt
PRODUCER: Judd Apatow
PRODUCER: Barry Mendel
EXECUTIVE PRODUCER: David Householter

TRAINWRECK

by
A


In [9]:
# Dummy code genres and keep the 12 most common genres
genre_dummies = kaggle_data['genres'].str.get_dummies(sep=', ')

# Sum up occurrences of each genre and keep the top 12
most_common_genres = genre_dummies.sum().nlargest(12).index
filtered_genres = genre_dummies[most_common_genres]

# Add the filtered genres back to the main DataFrame
kaggle_data = pd.concat([kaggle_data, filtered_genres], axis=1)

print("Top 12 genres added as dummy variables:")
print(filtered_genres.columns)

#keep bottom 3 rows
# kaggle_data = kaggle_data.tail(23).copy()

Top 12 genres added as dummy variables:
Index(['Drama', 'Thriller', 'Comedy', 'Action', 'Crime', 'Romance',
       'Adventure', 'Sci-Fi', 'Mystery', 'Horror', 'Fantasy', 'Biography'],
      dtype='object')


In [10]:
# Simple IMDb decimal-rating scraper using Selenium (Safari)
# Insert your preferred CSS selector in RATING_SELECTOR (see marker below).
# Note: Safari's WebDriver (safaridriver) must be enabled on macOS:
#   Run `safaridriver --enable` once in a terminal if not already enabled.
# Safari on macOS does not support a headless mode via safaridriver, so the browser UI may appear when scraping.

import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# --- PUT YOUR RATING CSS SELECTOR ON THE NEXT LINE ---
# Example selectors you can try:
# 1) 'div[data-testid="hero-rating-bar__aggregate-rating__score"] span'
# 2) 'span[itemprop="ratingValue"]'
# Set RATING_SELECTOR to whichever works for your environment/page version.
RATING_SELECTOR = 'div[data-testid="hero-rating-bar__aggregate-rating__score"] span'

# Start Safari webdriver (requires safaridriver enabled)
# If you prefer Chrome/Firefox, switch to the corresponding webdriver initialization.
driver = webdriver.Safari()
wait = WebDriverWait(driver, 6)

def fetch_decimal_rating(imdb_id):
    imdb_str = str(imdb_id).zfill(7)
    url = f"https://www.imdb.com/title/tt{imdb_str}/"
    try:
        driver.get(url)
        # Wait until the rating element appears (short timeout)
        el = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, RATING_SELECTOR)))
        text = el.text.strip()
        if not text:
            return None
        # Text is typically like '7.8' or '8'
        try:
            return float(text)
        except ValueError:
            # sometimes contains other whitespace — try first token
            try:
                return float(text.split()[0])
            except Exception:
                return None
    except Exception as e:
        # suppressed; return None when not found or blocked
        # print(f"fetch error tt{imdb_str}: {e}")
        return None

# Build rating map for unique ids
unique_ids = kaggle_data['imdb_id'].astype(str).unique()
#if intermediate file exists, load it
if os.path.exists("data/intermediate_ratings.csv"):
    rating_map = pd.read_csv("data/intermediate_ratings.csv", index_col=0).to_dict()["0"]
    rating_map = {str(k).zfill(7): v for k, v in rating_map.items()}

else:
    rating_map = {}
for i, iid in enumerate(unique_ids, 1):
    if not iid or pd.isna(iid) or iid.strip() == "":
        rating_map[iid] = None
        continue
    if iid in rating_map:
        # print(f"Skipping tt{iid}, already have rating.\n\n\n\n\n")
        continue    
    print(f"Fetching tt{iid}...")
    rating = fetch_decimal_rating(iid)
    #if not none
    if rating is not None:
        rating_map[iid] = rating
    # polite pause
    time.sleep(2 + random.uniform(0, 1.5))
    if i % 10 == 0:
        print(f"Fetched {i}/{len(unique_ids)}")
        # Optionally: save intermediate results or take other actions
        pd.Series(rating_map).to_csv(f"data/intermediate_ratings.csv")

#pad keys to 7 digits
rating_map = {str(k).zfill(7): v for k, v in rating_map.items()}
# Map back into dataframe and prefer decimal when available
kaggle_data['imdb_rating_decimal'] = kaggle_data['imdb_id'].astype(str).map(rating_map)

# cleanup
driver.quit()

# persist results
kaggle_data.to_csv(output_path, index=False)
print(f"Updated {output_path} with scraped decimal ratings (where found).")
print(kaggle_data[['imdb_id','imdb_rating', 'imdb_rating_decimal']].head())

Fetching tt12734734...
Fetching tt1781782...
Fetching tt0946528...
Fetching tt10871176...
Fetching tt2419232...
Fetching tt0417952...
Fetching tt1282592...
Fetching tt8505542...
Fetching tt0483314...
Fetching tt0581053...
Fetching tt0068451...
Fetching tt3328956...
Fetching tt5660414...
Fetching tt0080180...
Fetching tt0120907...
Fetching tt0295701...
Updated data/kaggle_data.csv with scraped decimal ratings (where found).
   imdb_id  imdb_rating  imdb_rating_decimal
0  1179933            7                  7.2
1  0147800            7                  7.4
2  0118528            7                  7.8
3  0114746            8                  8.0
4  2024544            8                  8.1


In [22]:
#load kaggle data
import pandas as pd
kaggle_data = pd.read_csv("data/kaggle_data.csv")

#set negative imdb ratings to none
kaggle_data.loc[kaggle_data.imdb_rating.astype(str).str.startswith('-'), 'imdb_rating'] = None

kaggle_data.to_csv("data/kaggle_data.csv", index=False)