In [2]:
import kagglehub
import os
import random
from collections import defaultdict

# Fetch the story dataset through KaggleHub and keep the reported location in
# `dataset_path`, which main() later passes to the story loader.
STORIES_DATASET_HANDLE = "humairmunir/stories"
dataset_path = kagglehub.dataset_download(STORIES_DATASET_HANDLE)
print("Path to dataset files:", dataset_path)

# Function to load all .txt stories from the dataset
def load_stories_from_directory(directory):
    """Concatenate the contents of every ``.txt`` file directly in *directory*.

    Files are read as UTF-8 in sorted filename order, because ``os.listdir``
    returns entries in arbitrary order and the corpus should be reproducible
    between runs. Each file's text is followed by a single space so the last
    word of one story does not fuse with the first word of the next.

    Args:
        directory: Path of the folder to scan (sub-folders are not searched).

    Returns:
        One string with all story texts, or ``""`` when no ``.txt`` file
        is present.
    """
    # Collect pieces and join once; repeated ``+=`` on a growing string can
    # degrade to quadratic time.
    pieces = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(directory, filename)
        # A directory that merely happens to be named "*.txt" cannot be opened.
        if not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as file:
            pieces.append(file.read())
        pieces.append(" ")
    return "".join(pieces)

# Function to create bigrams
def create_bigrams(text):
    """Split *text* on whitespace and pair every word with its successor.

    Returns a list of ``(word, next_word)`` tuples in reading order. The list
    is empty when *text* contains fewer than two words.
    """
    tokens = text.split()
    # Zipping the token list against itself shifted by one yields each
    # adjacent pair exactly once.
    return list(zip(tokens, tokens[1:]))

# Function to build the bigram model
def build_bigram_model(bigrams):
    """Group bigrams into a ``defaultdict(list)`` of word -> following words.

    Followers are stored in the order encountered and duplicates are kept,
    so a follower that occurs more often appears more often in its list.
    """
    successors = defaultdict(list)
    for word, follower in bigrams:
        followers = successors[word]
        followers.append(follower)
    return successors

# Function to generate text
def generate_text(model, start_word, length=20):
    """Generate text by randomly walking the bigram model from *start_word*.

    Args:
        model: Mapping from a word to the list of words that may follow it.
        start_word: First word of the output.
        length: Maximum number of words to produce, including *start_word*.

    Returns:
        The generated words joined by single spaces. The walk stops early
        when the current word has no recorded followers. A *length* below 1
        yields ``""``.
    """
    # A request for zero (or fewer) words must not return the start word.
    if length < 1:
        return ""

    generated_text = [start_word]
    current_word = start_word
    for _ in range(length - 1):
        # Membership test first so a defaultdict model is not mutated by
        # the lookup.
        followers = model[current_word] if current_word in model else None
        # An empty follower list would make random.choice raise IndexError.
        if not followers:
            break
        current_word = random.choice(followers)
        generated_text.append(current_word)
    return ' '.join(generated_text)

# Main function
def main():
    """Build the bigram model from the downloaded stories and generate text.

    Loads every story under ``dataset_path``, builds the bigram model, prompts
    for a starting word, and prints either the generated text or a message
    saying the word does not occur in the corpus.
    """
    print("Loading stories...")
    combined_story = load_stories_from_directory(dataset_path)
    model = build_bigram_model(create_bigrams(combined_story))

    # Model keys come from str.split() and never contain whitespace, so
    # stray spaces around the typed word would otherwise prevent a match.
    start_word = input("Enter a starting word: ").strip()
    if start_word not in model:
        print(f"The word '{start_word}' is not found in the corpus.")
        return

    generated_text = generate_text(model, start_word)
    print("Generated text:", generated_text)

# Call main() only when __name__ is "__main__", i.e. not when this code is
# imported as a module.
if __name__ == "__main__":
    main()


Downloading from https://www.kaggle.com/api/v1/datasets/download/humairmunir/stories?dataset_version_number=1...


100%|██████████| 2.09M/2.09M [00:00<00:00, 2.57MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/humairmunir/stories/versions/1
Loading stories...
Enter a starting word: was
Generated text: was coming. She taught along the earth. As she failed? What do indeed,” she heard whispers of life—writers, artists, meant
