In [2]:
# dependencies

#method1
# ! pip install fitz
# ! pip install pymupdf

#method2
# ! pip install transformers
# ! pip install torch
# ! pip install pandas
# ! pip install sentencepiece

#method3
# ! pip install spacy
# ! python -m spacy download en_core_web_sm  

Collecting pymupdf
  Using cached PyMuPDF-1.24.9-cp310-none-win_amd64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from pymupdf)
  Using cached PyMuPDFb-1.24.9-py3-none-win_amd64.whl.metadata (1.4 kB)
Using cached PyMuPDF-1.24.9-cp310-none-win_amd64.whl (3.2 MB)
Using cached PyMuPDFb-1.24.9-py3-none-win_amd64.whl (13.2 MB)
Installing collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.9 pymupdf-1.24.9


In [1]:
import fitz
import re
from collections import defaultdict
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


In [2]:
# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = []
    for page in doc:
        text.append(page.get_text())
    return "\n".join(text)

pdf_text = extract_text_from_pdf("ex1.pdf")

Task 4:  Counting the Number of Words per Character in the Script

In [3]:
## method 1: using regex
# we are identifying character names by if they are all caps and on a single line
# we do exclude lines with scene descriptors like int and ext to not pick up on setting all caps lines
# Problems: the scripts are formatted differently and this does not pick up on the line "DADDY and MAMA" for ex because "and" is lowercase
# Questions: Is this the general format for most scripts? Does it change in different languages?

def count_words_per_character(script_text):
    lines = script_text.split('\n')
    word_counts = defaultdict(int)
    current_characters = []

    # Identifying character names as all caps on a single line, including commas and ampersands
    pattern = re.compile(r'^[A-Z\s,&]+$')

    for line in lines:
        line = line.strip()
        match = pattern.match(line)
        if match:
            # Skip common scene description indicators to prevent matching with all caps setting
            if any(word in line for word in ['INT', 'EXT', 'DAY', 'NIGHT', 'MORNING', 'AFTERNOON', 'EVENING']):
                current_characters = []
            else:
                # Split the line by commas, ampersands, and ANDs to get individual character names
                characters = [char.strip() for char in re.split('[,&]| AND | and ', line)]
                current_characters = characters
                # if len(current_characters)>1:
                #     print(current_characters)
        elif current_characters:
            # Count words in the dialogue lines
            word_count = len(line.split())
            for char in current_characters:
                # if char == "MAMA":
                #     print("mama", word_count)
                word_counts[char] += word_count

    return word_counts

# Count words per character
word_counts = count_words_per_character(pdf_text)

df = pd.DataFrame(word_counts.items(), columns=["Character", "Word Count"])
print(df)

       Character  Word Count
0          NAMIA         573
1           NUZO         730
2          DADDY         138
3           MAMA           7
4       BUBELANG         738
5  FEMALE SELLER         113


In [4]:
# ## method 2: using hugging face named entity recognition to identify proper nouns. Combine with regex method
# # problems: model is only trained on the following languages: Arabic, German, English, Spanish, French, Italian, Latvian, Dutch, Portuguese and Chinese
# # potentially should try testing with scripts in various African languages to see if its name recognition is general enough to work
# # problem: model is not doing a great job at identifying character names!

# # Load the pre-trained model and tokenizer
# model_name = "Davlan/xlm-roberta-base-ner-hrl" # 10 language multilingual model
# # documentation for model: https://huggingface.co/Davlan/xlm-roberta-base-ner-hrl
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForTokenClassification.from_pretrained(model_name)

# # Create the NER pipeline
# nlp = pipeline("ner", model=model, tokenizer=tokenizer)


# # Identify character names using NER
# ner_results = nlp(pdf_text)

# # Extract character names and filter out non-character entities
# character_names = set()
# for entity in ner_results:
#     if entity['entity'] in {'B-PER', 'I-PER'}:
#         character_names.add(entity['word'])

# # Debugging: Print identified character names
# print("Identified Character Names:", character_names)

# # Function to count words per character
# def count_words_per_character(script_text, character_names):
#     lines = script_text.split('\n')
#     word_counts = defaultdict(int)
#     current_characters = []

#     # Regex pattern to identify character lines based on script structure
#     character_pattern = re.compile(rf"^({'|'.join(re.escape(name) for name in character_names)})$", re.IGNORECASE)

#     for line in lines:
#         line = line.strip()
#         if character_pattern.match(line):
#             # Debugging: Print matched character lines
#             print("Matched Character Line:", line)
#             # Split the line by commas, ampersands, and "and" to get individual character names
#             characters = [char.strip() for char in re.split('[,&]| and ', line, flags=re.IGNORECASE)]
#             current_characters = characters
#         elif current_characters:
#             # Count words in the dialogue lines
#             word_count = len(line.split())
#             for char in current_characters:
#                 word_counts[char] += word_count

#     return word_counts

# # Count words per character
# word_counts = count_words_per_character(pdf_text, character_names)

# df = pd.DataFrame(list(word_counts.items()), columns=["Character", "Word Count"])
# print(df)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [7]:
# ## method 3: spacy named entity recognition
# # problems: requires testing on whether this will work in other languages, as ner was trained with english
# # problem: spacy also seems bad at recognizing these names

# import spacy

# # Load the spaCy model for English
# nlp = spacy.load("en_core_web_sm")

# # Identify character names using spaCy NER
# doc = nlp(pdf_text)
# character_names = set()
# for ent in doc.ents:
#     if ent.label_ == "PERSON":
#         character_names.add(ent.text.strip())

# # Filter to only keep all caps character names
# character_names = {name for name in character_names if name.isupper()}

# # Debugging: Print identified character names
# print("Identified Character Names:", character_names)

# # Function to count words per character
# def count_words_per_character(script_text, character_names):
#     lines = script_text.split('\n')
#     word_counts = defaultdict(int)
#     current_characters = []

#     # Regex pattern to identify character lines based on script structure
#     character_pattern = re.compile(
#         rf"^({'|'.join(re.escape(name) for name in character_names)})([,&]| and )*$", re.IGNORECASE
#     )

#     for line in lines:
#         line = line.strip()
#         if character_pattern.match(line):
#             # Split the line by commas, ampersands, and "and" to get individual character names
#             characters = [char.strip() for char in re.split('[,&]| and ', line, flags=re.IGNORECASE)]
#             current_characters = characters
#         elif current_characters:
#             # Count words in the dialogue lines
#             word_count = len(line.split())
#             for char in current_characters:
#                 word_counts[char] += word_count

#     return word_counts

# # Count words per character
# word_counts = count_words_per_character(pdf_text, character_names)

# df = pd.DataFrame(list(word_counts.items()), columns=["Character", "Word Count"])
# print(df)


Identified Character Names: {'RA', 'KO', 'KI', 'KIKOMANDO', 'MA', 'NAMIA'}
  Character  Word Count
0     NAMIA        2388
