In [None]:
import csv
import re
from single_web_scrape import scrape_spans

In [10]:
def parse_title_line(title_line):
    """
    Split an episode heading into its series, episode and title parts.

    Headings are expected to look like:
        'Series 01 Episode 01 – Pilot Episode'
        'Series 10 Episode 03 -The Dependence Transcendence'
    and are returned as:
        ('Series 01', 'Episode 01', 'Pilot Episode')
        ('Series 10', 'Episode 03', 'The Dependence Transcendence')

    Whitespace surrounding the heading (spaces, newlines, non-breaking
    spaces) is ignored when matching, so a heading that is merely padded
    is still recognised.

    Args:
        title_line: The heading text to parse.

    Returns:
        A 3-tuple (series, episode, episode_title). When the heading does
        not match the expected shape, the placeholders
        ("Unknown Series", "Unknown Episode") are returned together with
        the original, unmodified title_line as the episode title.
    """
    # Pattern breakdown:
    #  1) 'Series' + digits, then 'Episode' + digits (optional spaces before digits),
    #  2) any dash-like character ('-', '–', '—') between the episode and the title,
    #  3) zero or more spaces on either side of that dash.
    pattern = r'^(Series\s*\d+)\s+(Episode\s*\d+)\s*[-–—]\s*(.*)$'

    # The pattern is anchored at the start of the string, so leading
    # whitespace would otherwise make a perfectly good heading fall through
    # to the placeholders. Strip before matching.
    match = re.match(pattern, title_line.strip())
    if match is None:
        # Fallback: keep the caller's text as-is so nothing is lost.
        return "Unknown Series", "Unknown Episode", title_line

    series_part, episode_part, episode_title = (
        group.strip() for group in match.groups()
    )
    return series_part, episode_part, episode_title


In [None]:
# A scene heading: the literal word "Scene:" (any letter case) at the start of
# the line; group 1 captures whatever follows it.
scene_pattern = re.compile(r'^Scene:\s*(.*)$', flags=re.I)

# A "Speaker: dialogue" line: group 1 is the shortest text before the first
# colon, group 2 is everything after it. Any line containing a colon matches.
speaker_dialogue_pattern = re.compile(r'^(.*?):\s*(.*)$')

def parse_script_lines(lines):
    """
    Turn the scraped lines of one episode into flat rows.

    The first element of `lines` is handed to `parse_title_line` to obtain
    the series, episode and episode title. Every following non-blank line
    is then classified:
      * a line matching `scene_pattern` updates the current scene and
        produces no row of its own;
      * a line matching `speaker_dialogue_pattern` is split into speaker
        and dialogue;
      * any other line becomes dialogue with an empty speaker.

    Args:
        lines: Sequence of text lines for a single episode (as produced by
            `scrape_spans`), heading first.

    Returns:
        A list of tuples
        (Series, Episode, Episode Title, Scene, Speaker, Dialogue).
        The list is empty when `lines` is empty, so callers do not have to
        pre-check the scrape result.
    """
    # Nothing scraped: there is no heading to parse, so return no rows
    # instead of failing on lines[0].
    if not lines:
        return []

    # 1) The heading supplies the three columns shared by every row.
    series_part, episode_part, episode_title = parse_title_line(lines[0])

    data_rows = []
    current_scene = ""  # Rows seen before the first scene heading get an empty scene

    # 2) Classify each remaining line.
    for raw_line in lines[1:]:
        line = raw_line.strip()
        if not line:
            # Skip empty lines
            continue

        # Scene headings only change state; they are not emitted as rows.
        scene_match = scene_pattern.match(line)
        if scene_match:
            current_scene = scene_match.group(1).strip()
            continue

        # Any line containing a colon is treated as "Speaker: dialogue".
        speaker_match = speaker_dialogue_pattern.match(line)
        if speaker_match:
            speaker = speaker_match.group(1).strip()
            dialogue = speaker_match.group(2).strip()
        else:
            # Neither scene nor speaker => whole line is dialogue, no speaker
            speaker = ""
            dialogue = line

        data_rows.append((
            series_part,
            episode_part,
            episode_title,
            current_scene,
            speaker,
            dialogue,
        ))

    return data_rows


In [None]:
def main():
    """
    Scrape every URL listed in the input CSV and write the parsed script
    rows to a single output CSV.

    The first column of each row in "collected_urls.csv" is used as the URL.
    Blank rows, blank URLs and URLs for which `scrape_spans` returns nothing
    are skipped. The output file starts with a header row followed by one
    row per parsed script line.

    Raises:
        OSError: if the input file cannot be opened (e.g. it does not exist)
            or the output file cannot be created. The input is opened first,
            so a missing input does not truncate an existing output file.
    """
    input_csv = "collected_urls.csv"  # Run the collect_urls.py script first then use this file
    output_csv = "big_bang_scripts.csv"

    # Open the input before the output: opening the output in "w" mode
    # truncates it immediately, which would wipe a previous result even when
    # the input file turns out to be missing.
    # newline="" is what the csv module expects for files it reads or writes.
    with open(input_csv, mode="r", newline="", encoding="utf-8") as in_file:
        with open(output_csv, mode="w", newline="", encoding="utf-8") as out_file:
            writer = csv.writer(out_file)
            writer.writerow(["Series", "Episode", "Episode Title", "Scene", "Speaker", "Dialogue"])

            for row in csv.reader(in_file):
                if not row:
                    continue
                url = row[0].strip()
                if not url:
                    continue

                # 1) Scrape all lines from that URL
                lines = scrape_spans(url)
                if not lines:
                    continue

                # 2) Parse them into (Series, Episode, Episode Title, Scene,
                #    Speaker, Dialogue) rows and 3) write them out in one call.
                writer.writerows(parse_script_lines(lines))

    print(f"Finished writing scripts to {output_csv}.")


if __name__ == "__main__":
    main()


Finished writing scripts to big_bang_scripts.csv.
