In [24]:
# Mount Google Drive under /content/drive so later cells can read files stored there.
# force_remount=True is passed so the mount is requested again each time this cell runs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)  # Approve the authentication prompts as needed

Mounted at /content/drive


In [25]:
import nltk
from nltk.tokenize import sent_tokenize
import string
import os
import re

# Ensure you have downloaded the punkt tokenizer
nltk.download('punkt')
nltk.download('punkt_tab')

# Make sure this matches your exact folder structure in Google Drive
story_path = '/content/drive/MyDrive/ShortStory.txt'

# Validate the path BEFORE opening it: open() raises on a missing file, so a
# check placed after the read could never report the problem.
if not os.path.exists(story_path):
    raise FileNotFoundError(f"Base path does not exist: {story_path}")
print(f"Base path exists: {story_path}")

# Read the story from Google Drive
with open(story_path, 'r', encoding='utf-8') as file:
    story = file.read()

Base path exists: /content/drive/MyDrive/ShortStory.txt


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [42]:
import spacy

# Preprocessing function to clean the text
def preprocess_text(story: str) -> str:
    """
    Preprocess the story text by removing unnecessary characters and normalizing spaces.

    Steps, in order:
      1. remove the dashed page-separator lines;
      2. convert typographic quotation marks to their ASCII equivalents;
      3. apply two story-specific quotation fixes;
      4. collapse every run of whitespace (including newlines) into one space.

    :param story: Raw story text
    :return: Cleaned story text
    """
    # Remove page boundaries or separators
    story = story.replace('------------------------------------------------', '')

    # Standardize quotation marks first, so the targeted fixes below also match
    # text that was typed with curly quotes/apostrophes.
    story = story.replace('“', '"').replace('”', '"')
    story = story.replace('‘', "'").replace('’', "'")

    # Story-specific fix: this narration sentence carries a stray opening quote.
    narration_sentence = "It was Adell's turn to be contrary."
    story = story.replace('"' + narration_sentence, narration_sentence)

    # Story-specific fix: this line of dialogue lacks its opening quote.
    # Any quote already present is removed before one is added, so the phrase
    # ends up with exactly one opening quote even if the fix is applied twice.
    unquoted_dialogue = "The stars are the power-units, dear"
    story = story.replace('"' + unquoted_dialogue, unquoted_dialogue)
    story = story.replace(unquoted_dialogue, '"' + unquoted_dialogue)

    # Remove extra spaces or newlines (this also trims both ends)
    return ' '.join(story.split())

# Sentence segmentation using spaCy
def segment_sentences_spacy(story: str) -> list[str]:
    """
    Split the story into sentences with spaCy's ``en_core_web_sm`` pipeline.

    :param story: Preprocessed story text
    :return: Sentence strings in document order, each stripped of surrounding whitespace
    """
    # The English pipeline is loaded on every call and run over the whole text at once
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline(story)

    result = []
    for span in parsed.sents:
        result.append(span.text.strip())
    return result

In [43]:
# Preprocess and segment
# Depends on `story` (read in an earlier cell) and on the two functions defined above.
cleaned_story = preprocess_text(story)
segmented_sentences = segment_sentences_spacy(cleaned_story)

# Output segmented sentences
# Numbering starts at 1; every sentence of the story is printed.
for i, sentence in enumerate(segmented_sentences, 1):
    print(f"{i}: {sentence}")

1: The last question was asked for the first time, half in jest, on May 21, 2061, at a time when humanity first stepped into the light.
2: The question came about as a result of a five dollar bet over highballs, and it happened this way: Alexander Adell and Bertram Lupov were two of the faithful attendants of Multivac.
3: As well as any human beings could, they knew what lay behind the cold, clicking, flashing face -- miles and miles of face -- of that giant computer.
4: They had at least a vague notion of the general plan of relays and circuits that had long since grown past the point where any single human could possibly have a firm grasp of the whole.
5: Multivac was self-adjusting and self-correcting.
6: It had to be, for nothing human could adjust and correct it quickly enough or even adequately enough -- so Adell and Lupov attended the monstrous giant only lightly and superficially, yet as well as any men could.
7: They fed it data, adjusted questions to its needs and translated 

#### Note: sentences that belong to the same quotation are stitched back together so that each quoted passage stays one coherent unit (I find this more logical)

In [44]:
def stitch_sentences_with_quotes(sentences: list[str]) -> list[str]:
    """
    Stitch sentences with an unclosed quotation mark until the closing quotation is found.

    A sentence containing an odd number of '"' characters leaves a quotation
    unbalanced: outside a quotation it opens one, inside a quotation it closes
    it. Every sentence from the opening one through the closing one is joined
    with single spaces into one entry. Sentences with an even number of quotes
    (including zero) that occur outside a quotation are passed through.

    :param sentences: List of segmented sentences
    :return: List of sentences with stitched quotes
    """
    stitched_sentences = []
    pending = []  # Sentences collected since the current quotation was opened

    for sentence in sentences:
        # Parity, not an exact count of one, decides whether a quote is left
        # dangling: '"A," he said, "B' has three marks and still opens a quote.
        has_unpaired_quote = sentence.count('"') % 2 == 1

        if pending:
            # Inside an open quotation: keep collecting
            pending.append(sentence)
            if has_unpaired_quote:  # Closing quote found
                stitched_sentences.append(" ".join(pending).strip())
                pending = []
        elif has_unpaired_quote:
            # Start buffering: this sentence opens a quotation it does not close
            pending.append(sentence)
        else:
            # No quotes, or only complete pairs of quotes
            stitched_sentences.append(sentence.strip())

    # A quotation that never closes (edge case) is still emitted
    if pending:
        stitched_sentences.append(" ".join(pending).strip())

    return stitched_sentences


# Stitch sentences with unclosed quotes
# Input is the spaCy segmentation produced by the previous cell.
stitched_sentences = stitch_sentences_with_quotes(segmented_sentences)

# Output the stitched sentences with enumeration
# Numbering starts at 1; every stitched sentence is printed.
for idx, sentence in enumerate(stitched_sentences, 1):
    print(f"{idx}: {sentence}")





1: The last question was asked for the first time, half in jest, on May 21, 2061, at a time when humanity first stepped into the light.
2: The question came about as a result of a five dollar bet over highballs, and it happened this way: Alexander Adell and Bertram Lupov were two of the faithful attendants of Multivac.
3: As well as any human beings could, they knew what lay behind the cold, clicking, flashing face -- miles and miles of face -- of that giant computer.
4: They had at least a vague notion of the general plan of relays and circuits that had long since grown past the point where any single human could possibly have a firm grasp of the whole.
5: Multivac was self-adjusting and self-correcting.
6: It had to be, for nothing human could adjust and correct it quickly enough or even adequately enough -- so Adell and Lupov attended the monstrous giant only lightly and superficially, yet as well as any men could.
7: They fed it data, adjusted questions to its needs and translated 

Test: extracting dialogue and attributing it to speakers

In [35]:
import re

def extract_dialogue_with_speakers(text: str) -> list[dict]:
    """
    Extract dialogue and associate it with the respective speaker.

    The text is processed one line at a time:

    * A line whose narration (the part outside double quotes) contains
      ``said|asked|replied|interrupted <name>`` sets the current speaker to
      ``<name>``; its quoted parts, if any, are joined with spaces and stored.
    * Any other line containing quoted text is attributed to the current
      speaker, or to ``"Unknown"`` when no speaker has been seen yet.
    * A line without quotes or attribution is appended to the most recently
      stored dialogue once a speaker is known; otherwise it is ignored.

    Only the ``<verb> <name>`` word order is recognised.

    :param text: The input story text
    :return: A list of dictionaries with 'speaker' and 'dialogue'
    """
    # Regex patterns to detect dialogue with attribution
    quote_pattern = r'"(.*?)"'  # Matches quotes
    # Speech verb followed by the speaker's name; \b stops the verb from
    # matching as the tail of a longer word.
    attribution_pattern = r'\b(?:said|asked|replied|interrupted) ([A-Za-z0-9\-]+)'

    dialogue_segments = []
    current_speaker = None

    # Split text into lines for processing
    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        quotes = re.findall(quote_pattern, line)
        # Search for the attribution in the narration only: a speech verb that
        # occurs inside the quoted words must not be taken for the attribution.
        narration = re.sub(quote_pattern, '""', line)
        match = re.search(attribution_pattern, narration)

        if match:
            current_speaker = match.group(1)  # Update current speaker
            if quotes:
                dialogue_segments.append({
                    "speaker": current_speaker,
                    "dialogue": " ".join(quotes)
                })
        elif '"' in line:
            # Dialogue without attribution
            if quotes:
                dialogue_segments.append({
                    "speaker": current_speaker if current_speaker else "Unknown",
                    "dialogue": " ".join(quotes)
                })
        elif current_speaker and dialogue_segments:
            # Narrative text; associate it with the last known speaker.
            # A speaker can be known before any dialogue was stored, so the
            # list must be checked before indexing its last element.
            dialogue_segments[-1]["dialogue"] += f" {line}"

    return dialogue_segments


# Example Usage
# A short excerpt used to exercise extract_dialogue_with_speakers.
# NOTE(review): VJ-23X is the only name that appears in this excerpt, so every
# extracted line is attributed to VJ-23X -- including replies that appear to
# come from the other speaker; verify before relying on the speaker labels.
story_text = """
Both seemed in their early twenties, both were tall and perfectly formed.
"Still," said VJ-23X, "I hesitate to submit a pessimistic report to the Galactic Council."
"I wouldn't consider any other kind of report. Stir them up a bit. We've got to stir them up."
VJ-23X sighed. "Space is infinite. A hundred billion Galaxies are there for the taking. More."
"A hundred billion is not infinite and it's getting less infinite all the time. Consider! Twenty thousand years ago, mankind first solved the problem of utilizing stellar energy, and a few centuries later, interstellar travel became possible."
VJ-23X interrupted. "We can thank immortality for that."
"Very well. Immortality exists and we have to take it into account. I admit it has its seamy side, this immortality. The Galactic AC has solved many problems for us, but in solving the problems of preventing old age and death, it has undone all its other solutions."
"""

dialogue_data = extract_dialogue_with_speakers(story_text)

# Output the extracted dialogue
# One numbered "speaker: dialogue" line per extracted segment, starting at 1.
for idx, segment in enumerate(dialogue_data, 1):
    print(f"{idx}. {segment['speaker']}: {segment['dialogue']}")


1. VJ-23X: Still, I hesitate to submit a pessimistic report to the Galactic Council.
2. VJ-23X: I wouldn't consider any other kind of report. Stir them up a bit. We've got to stir them up.
3. VJ-23X: Space is infinite. A hundred billion Galaxies are there for the taking. More.
4. VJ-23X: A hundred billion is not infinite and it's getting less infinite all the time. Consider! Twenty thousand years ago, mankind first solved the problem of utilizing stellar energy, and a few centuries later, interstellar travel became possible.
5. VJ-23X: We can thank immortality for that.
6. VJ-23X: Very well. Immortality exists and we have to take it into account. I admit it has its seamy side, this immortality. The Galactic AC has solved many problems for us, but in solving the problems of preventing old age and death, it has undone all its other solutions.


OLD — earlier NLTK-based sentence-sorting experiments, kept for reference

In [14]:
# Remove dashed separators and trim extra whitespace
# `story` is reassigned here, so this cell depends on `story` from an earlier cell.
story = story.replace('------------------------------------------------', '').strip()

# Tokenize the story into sentences
# sent_tokenize is the NLTK function imported at the top of the notebook.
sentences = sent_tokenize(story)

# Preprocess function to normalize text for sorting
def preprocess_sentence(sentence):
    """
    Build the sort key for a sentence.

    The sentence is trimmed and lower-cased, then every ASCII punctuation
    character except the period is removed (dashes included).

    :param sentence: Sentence text
    :return: Normalized text used as the sort key
    """
    # Periods are kept so abbreviations such as U.S.A. stay intact
    removable = set(string.punctuation) - {'.'}
    lowered = sentence.strip().lower()
    return ''.join(ch for ch in lowered if ch not in removable)

# Sort sentences alphabetically based on the first alphanumeric character
# The original sentences are kept; preprocess_sentence only supplies the sort key.
sorted_sentences = sorted(sentences, key=preprocess_sentence)

# Display sorted sentences
for sentence in sorted_sentences:
    print(sentence)

# Optionally, save the sorted sentences to a new file
# The file is written to Google Drive with one sentence per line; mode 'w' replaces any existing file.
output_path = '/content/drive/MyDrive/SortedStory2.txt'
with open(output_path, 'w', encoding='utf-8') as file:
    file.write('\n'.join(sorted_sentences))

print(f"Sorted sentences saved to {output_path}")

A good point.
A hundred billion Galaxies are there for the taking.
"A hundred billion is not infinite and it's getting less infinite all the time.
A thought came, infinitely distant, but infinitely clear.
A timeless interval was spent in doing that.
"A very good point.
A very good point."
AC said, "THERE IS AS YET INSUFFICIENT DATA FOR A MEANINGFUL ANSWER."
Adell put his glass to his lips only occasionally, and Lupov's eyes slowly closed.
Adell was just drunk enough to try, just sober enough to be able to phrase the necessary symbols and operations into a question which, in words, might have corresponded to this: Will mankind one day without the net expenditure of energy be able to restore the sun to its full youthfulness even after it had died of old age?
After all, our own Galaxy alone pours out a thousand sunpower units a year and we only use two of those."
All collected data had come to a final end.
All Earth ran by invisible beams of sunpower.
All Earth turned off its burning coal

In [20]:

# Read the story content
# Re-reads the file so `story` holds the raw text again; story_path must have been set by an earlier cell.
with open(story_path, 'r', encoding='utf-8') as file:
    story = file.read()

# Remove dashed separators and trim extra whitespace
story = story.replace('------------------------------------------------', '').strip()

# Function to tokenize while preserving double-quoted text as a single sentence
def tokenize_with_double_quotes(text):
    # Regular expression to match text within double quotes
    double_quoted_pattern = r'"([^"]+)"'
    # Find all double-quoted sections
    double_quoted_sections = re.findall(double_quoted_pattern, text)

    # Replace double-quoted sections with placeholders
    text_without_quotes = re.sub(double_quoted_pattern, "QUOTE_PLACEHOLDER", text)

    # Tokenize remaining text into sentences
    sentences = nltk.sent_tokenize(text_without_quotes)

    # Replace placeholders with actual double-quoted text
    result = []
    for sentence in sentences:
        if "QUOTE_PLACEHOLDER" in sentence:
            # Replace each placeholder with the correct quoted text
            for quote in double_quoted_sections:
                if "QUOTE_PLACEHOLDER" in sentence:
                    sentence = sentence.replace("QUOTE_PLACEHOLDER", f'"{quote}"', 1)
        result.append(sentence)

    return result

# Tokenize the story while preserving double-quoted text as a single sentence
# Rebinds `sentences`, replacing the list produced by any earlier tokenization cell.
sentences = tokenize_with_double_quotes(story)

# Preprocess function to normalize text for sorting
def preprocess_sentence(sentence):
    """
    Build the sort key for a sentence.

    The sentence is trimmed and lower-cased, every ASCII punctuation character
    except the period is removed (dashes included), and finally any leading
    punctuation or whitespace is stripped so the key starts at the first
    remaining character.

    :param sentence: Sentence text
    :return: Normalized text used as the sort key
    """
    # Periods are kept so abbreviations such as U.S.A. stay intact
    kept_chars = [
        ch
        for ch in sentence.strip().lower()
        if ch == '.' or ch not in string.punctuation
    ]
    normalized = ''.join(kept_chars)
    # Drop whatever non-alphanumeric prefix is left (periods, spaces)
    return normalized.lstrip(string.punctuation + string.whitespace)

# Sort sentences alphabetically based on the first alphanumeric character
# The original sentences are kept; preprocess_sentence only supplies the sort key.
sorted_sentences = sorted(sentences, key=preprocess_sentence)

# Display sorted sentences
for sentence in sorted_sentences:
    print(sentence)



A thought came, infinitely distant, but infinitely clear.
Adell put his glass to his lips only occasionally, and Lupov's eyes slowly closed.
All Earth ran by invisible beams of sunpower.
All Earth turned off its burning coal, its fissioning uranium, and flipped the switch that connected all of it to a small station, one mile in diameter, circling the Earth at half the distance of the Moon.
Almost all stars were white dwarfs, fading to the end.
And there was light----
And yet one of them was unique among them all in being the originals Galaxy.
As well as any human beings could, they knew what lay behind the cold, clicking, flashing face -- miles and miles of face -- of that giant computer.
But slowly Multivac learned enough to answer deeper questions more fundamentally, and on May 14, 2061, what had been theory, became fact.
Can that not be done?"It's amazing when you think of it,"THERE IS AS YET INSUFFICIENT DATA FOR A MEANINGFUL ANSWER."All the energy we can possibly ever use for free

In [22]:
# Make sure this matches your exact folder structure in Google Drive
story_path = '/content/drive/MyDrive/ShortStory.txt'

# Read the story from Google Drive
with open(story_path, 'r', encoding='utf-8') as file:
    story = file.read()

# Load the English Punkt sentence tokenizer from the NLTK data resources.
# NOTE(review): this loads a pickled resource; confirm it is still available
# and loadable with the installed NLTK version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Tokenize, sort with the default string ordering (so sentences that begin
# with '"' come first), and print them separated by "-----" lines.
print('\n-----\n'.join(sorted(tokenizer.tokenize(story))))

"A hundred billion is not infinite and it's getting less infinite all the time.
-----
"A very good point.
-----
"All right, but now we can hook up each individual spaceship to the Solar Station, and it can go to Pluto and back a million times without ever worrying about fuel.
-----
"All right, then.
-----
"All right.
-----
"All the energy we can possibly ever use for free.
-----
"And don't say we'll switch to another sun."
-----
"And you?"
-----
"Are you sure, Jerrodd?"
-----
"Ask Multivac."
-----
"Ask him how to turn the stars on again."
-----
"Ask the Microvac," wailed Jerrodette I.
-----
"But even so," said Man, "eventually it will all come to an end.
-----
"But how can that be all of Universal AC?"
-----
"But when all energy is gone, our bodies will finally die, and you and I with them."
-----
"Can't you just put in a new power-unit, like with my robot?"
-----
"Cosmic AC," said Man, "How may entropy be reversed?"
-----
"Darn right they will," muttered Lupov.
-----
"Did the men upon