In [7]:
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Make sure the NLTK resources used below are available locally:
#   'punkt'     - tokenizer data needed to split text into individual words
#   'stopwords' - lists of common words ('the', 'a', ...) that add little meaning
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Location of the "family." folder that holds the screenplay text files.
# Change this if your copy of the corpus lives somewhere else.
folder_path = "/Users/beauxcreel/code/ENGL370-2025/Creel/family."

# Screenplays are expected to be .txt files, so keep only those directory entries.
screenplay_files = [name for name in os.listdir(folder_path) if name.endswith('.txt')]

# Upper bound on how many screenplays are read; raise or lower it to resize the sample.
MAX_FILES = 5

# Read the first MAX_FILES screenplays into a {filename: full text} mapping.
screenplays = {}
for filename in screenplay_files[:MAX_FILES]:
    script_path = os.path.join(folder_path, filename)
    with open(script_path, "r", encoding="utf-8") as handle:
        screenplays[filename] = handle.read()

# Sanity check: report how many screenplays were read and which files they came from.
print(f"Loaded {len(screenplays)} screenplays.")
loaded_names = list(screenplays)
print("Loaded screenplays:", loaded_names)

# Spot-check the first screenplay by printing its opening 500 characters.
sample_script = loaded_names[0]
print(f"\nPreview of {sample_script}:\n")
print(screenplays[sample_script][:500])

# Now, let's clean and preprocess the text. The goal here is to make the text easier to analyze.
# We’ll remove unnecessary punctuation, convert everything to lowercase, and eliminate common stopwords (like 'the', 'a', etc.)

def preprocess_text(text):
    """Normalize raw screenplay text into a string of space-separated content words.

    The text is lowercased, stripped of punctuation, split into tokens with
    NLTK's ``word_tokenize``, and filtered against NLTK's English stopword list.

    Args:
        text: Raw screenplay text as a single string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Build the stopword lookup once per call. Evaluating
    # stopwords.words('english') inside the filter condition rebuilt the list
    # for every token and then searched it linearly; a set gives the same
    # membership answers with constant-time lookups.
    english_stopwords = set(stopwords.words('english'))

    text = text.lower()  # Case-insensitive matching from here on
    # NOTE: punctuation is deleted rather than replaced with a space, so words
    # joined by dashes or apostrophes are fused (e.g. "hey--it's" -> "heyits").
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in english_stopwords)

# Run every loaded screenplay through preprocess_text. The filename keys are
# kept so each cleaned text can be matched back to its source file.
cleaned_screenplays = {
    filename: preprocess_text(raw_text)
    for filename, raw_text in screenplays.items()
}

# Spot-check the result: show the opening 500 characters of the first cleaned screenplay.
sample_script = list(cleaned_screenplays)[0]
print(f"\nCleaned Preview of {sample_script}:\n")
print(cleaned_screenplays[sample_script][:500])

# Build a Document-Term Matrix (DTM) from the cleaned text:
#   rows    -> one per screenplay
#   columns -> one per unique word found across all screenplays
#   values  -> how many times that word occurs in that screenplay

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# CountVectorizer turns a collection of texts into a matrix of word counts.
vectorizer = CountVectorizer()

# fit_transform learns the vocabulary from the cleaned screenplays and counts
# each word's occurrences in a single step.
dtm = vectorizer.fit_transform(cleaned_screenplays.values())

# Wrap the counts in a DataFrame so rows are labelled by screenplay filename
# and columns by the word they count.
dtm_df = pd.DataFrame(
    dtm.toarray(),
    index=cleaned_screenplays.keys(),
    columns=vectorizer.get_feature_names_out(),
)

# Show the first few rows to check the structure.
print(dtm_df.head())

# Write the DTM to a CSV file so it can be reloaded for later analysis.
dtm_df.to_csv("screenplays_dtm.csv")

# Confirm the file was written.
print("Document-Term Matrix saved as 'screenplays_dtm.csv'.")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/beauxcreel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/beauxcreel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loaded 5 screenplays.
Loaded screenplays: ['aladdin.txt', 'princessbridethe.txt', 'findingnemo.txt', 'kungfupanda.txt', 'e.t..txt']

Preview of aladdin.txt:

ALADDIN:  THE COMPLETE SCRIPT

COMPILED BY BEN SCRIPPS 

(Portions Copyright (c) 1992 The Walt Disney Company

PEDDLER:    Oh I come from a land

    From a faraway place

    Where the caravan camels roam

    Where they cut off your ear /Where it's flat and immense

    If they don't like your face /And the heat is intense

    It's barbaric, but hey--it's home!

    When the wind's at your back

    And the sun's from the west

    And the sand in the glass is right

    Come on down,

    St

Cleaned Preview of aladdin.txt:

aladdin complete script compiled ben scripps portions copyright c 1992 walt disney company peddler oh come land faraway place caravan camels roam cut ear flat immense dont like face heat intense barbaric heyits home winds back suns west sand glass right come stop hop carpet fly another arabian night arabia