In [1]:
import pandas as pd

In [2]:
from glob import glob
from lxml import etree

In [3]:
import json
import os
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from fake_headers import Headers
from tqdm import tqdm

# Text Parsing for Action Lines

In [4]:
# Collect every parsed screenplay XML file. glob() returns matches in
# arbitrary, filesystem-dependent order, so sort the result to make the
# downstream processing order reproducible across runs and machines.
parsed_files = sorted(glob("../../movie_script/imsdb_parsed/*.xml"))

In [6]:
# The following movies do not satisfy the screenplay format.
# Parsing their HTML causes a kernel restart, so these movies are excluded.
skip = [
    "../../movie_script/imsdb_parsed/Jerry Maguire.xml",
    "../../movie_script/imsdb_parsed/Chasing Amy.xml",
    "../../movie_script/imsdb_parsed/Notting Hill.xml",
    "../../movie_script/imsdb_parsed/Avventura, L' (The Adventure).xml",
    "../../movie_script/imsdb_parsed/American President, The.xml",
]

In [26]:
# Flatten each parsed screenplay XML into one DataFrame per movie, with one
# row per XML element and the columns: tag, attribute, text, title.

# Compare normalized paths so the exclusion list still matches if glob()
# returns a different separator or spelling of the same path.
skip_paths = {os.path.normpath(p) for p in skip}
files_to_parse = [f for f in parsed_files if os.path.normpath(f) not in skip_paths]

parsed_df_list = []
failed_files = []  # files that could not be parsed, kept for later inspection
for file in tqdm(files_to_parse):
    # Movie title = file name without its directory or extension.
    title = os.path.splitext(os.path.basename(file))[0]
    try:
        root = etree.parse(file).getroot()
    except etree.XMLSyntaxError as e:
        # Record the bad file and move on. Calling exit() here would shut down
        # the notebook kernel and discard everything parsed so far.
        print(f"Error parsing XML ({file}): {e}")
        failed_files.append(file)
        continue

    tag_elements = [
        {
            "tag": elem.tag,
            # Store a plain dict rather than the live lxml attribute proxy.
            "attribute": dict(elem.attrib),
            # elem.text is None when an element has no text content;
            # treat that as an empty string instead of raising AttributeError.
            "text": (elem.text or "").strip(),
        }
        for elem in root.iter()
    ]

    parsed_df = pd.DataFrame(tag_elements)
    parsed_df["title"] = title
    parsed_df_list.append(parsed_df)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1227/1227 [00:07<00:00, 153.38it/s]


In [27]:
# Stack the per-movie frames into one long frame. This cell rebinds the name
# from a list to a DataFrame, so guard the call: re-running the cell would
# otherwise pass a DataFrame to pd.concat, which raises a TypeError.
# ignore_index=True gives every row a unique label instead of repeating each
# movie's own 0..n-1 index in the combined frame.
if isinstance(parsed_df_list, list):
    parsed_df_list = pd.concat(parsed_df_list, ignore_index=True)

In [31]:
# Two metrics decide whether a screenplay was parsed successfully.
#   1. Unclassified lines ("unc"): if more than 5% of a movie's lines are
#      unclassified, parsing is treated as failed because of format errors
#      and the movie is excluded from the analysis.
#   2. Action lines ("act"): only movies whose share of action lines is below
#      60% are kept; a higher share is taken to mean the screenplay does not
#      follow the screenplay format. A movie with no "act" rows at all has no
#      row to match here and is therefore not kept either.

# Per-title share of each tag (one row per title/tag pair).
tag_counts_df = (
    parsed_df_list
    .groupby(["title"])
    .tag.value_counts(normalize=True)
    .reset_index()
)

is_unclassified = tag_counts_df.tag == "unc"
is_action = tag_counts_df.tag == "act"

parse_failed_movie = tag_counts_df[is_unclassified & (tag_counts_df.proportion > 0.05)].title.tolist()
parse_act_movies = tag_counts_df[is_action & (tag_counts_df.proportion < 0.6)].title.tolist()

# Titles that pass the action-line check and did not fail the unclassified check.
parse_act_movies_file = list(set(parse_act_movies) - set(parse_failed_movie))

In [32]:
# Number of movie titles retained in parse_act_movies_file.
len(parse_act_movies_file)

1174

In [40]:
# Keep only the rows that belong to the retained movie titles.
title_is_kept = parsed_df_list["title"].isin(parse_act_movies_file)
parsed_df_list_filtered = parsed_df_list.loc[title_is_kept]

In [69]:
# Write each movie's non-empty action lines to imsdb_action_lines/<title>.txt,
# one action element per line.

# Create the output directory up front; open(..., "w") does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs("imsdb_action_lines", exist_ok=True)

# Select the action rows once and split them by title in a single pass,
# instead of re-scanning the whole frame for every movie.
# Note: a title with no "act" rows produces no file; the retained titles were
# selected from rows tagged "act", so each of them has at least one such row.
action_rows = parsed_df_list_filtered[parsed_df_list_filtered.tag == "act"]
action_by_title = action_rows.groupby("title", sort=False)

for t, df_temp in tqdm(action_by_title, total=action_by_title.ngroups):
    with open("imsdb_action_lines/{}.txt".format(t), "w", encoding="utf-8") as f:
        f.writelines(text + "\n" for text in df_temp.text if text != "")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1174/1174 [02:21<00:00,  8.33it/s]


# BookNLP

In [227]:
from booknlp.booknlp import BookNLP
# Settings passed to BookNLP: the pipeline components to run and the model size.
# NOTE(review): the saved output of this cell reports 'model': 'small' even
# though "big" is requested here — confirm which model produced the results.
model_params = {
    "pipeline": "entity,quote,supersense,event,coref",
    "model": "big",
}

booknlp = BookNLP("en", model_params)

  from .autonotebook import tqdm as notebook_tqdm
2025-07-25 01:05:18.749926: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-25 01:05:18.758198: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753423518.767396 2789806 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753423518.770161 2789806 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753423518.778523 2789806 computation_placer.cc:177] computation placer already r

using device cuda
{'pipeline': 'entity,quote,supersense,event,coref', 'model': 'small'}
--- startup: 3.300 seconds ---


In [6]:
from glob import glob

In [7]:
# glob() returns matches in arbitrary order; sort so the movies are always
# processed in the same order.
files = sorted(glob("imsdb_action_lines/*.txt"))

In [9]:
import os
from IPython.display import clear_output

In [232]:
# Run BookNLP on every action-line file, writing each movie's results to
# book_nlp_output/<title>/. A movie is skipped when "<title>.book" already
# exists in its output directory, so finished movies are not reprocessed.
for f in tqdm(files):
    # Title = file name without directory or extension. This replaces the
    # fixed-offset slice f[19:-4], which silently produced wrong titles if
    # the input directory prefix ever changed length.
    t = os.path.splitext(os.path.basename(f))[0]
    outputDir = "book_nlp_output/{}/".format(t)

    # makedirs also creates the parent "book_nlp_output" directory when it is
    # missing; os.mkdir would raise FileNotFoundError in that case.
    os.makedirs(outputDir, exist_ok=True)

    if not os.path.exists(outputDir + "{}.book".format(t)):
        size_mb = os.path.getsize(f) / (1024 * 1024)
        print("Processing book: {} ->> file size: {} mb".format(t, size_mb))
        booknlp.process(f, outputDir, t)

        # Drop the per-book output so the cell output does not grow unbounded.
        clear_output(wait=True)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1174/1174 [2:19:50<00:00,  7.15s/it]
