# Step 1: Download and Load the Dataset

In [1]:
import requests
import os
from collections import Counter, defaultdict
import random

# Download Shakespeare dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url)

# Save the file
with open("shakespeare.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

print("Dataset downloaded and saved as 'shakespeare.txt'.")

Dataset downloaded and saved as 'shakespeare.txt'.


# Step 2: Load and Preprocess the Data

In [2]:
with open("shakespeare.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Remove unnecessary whitespace and newlines
data = text.replace("\n", " ")

print(f"Dataset length: {len(data)} characters")

Dataset length: 1115394 characters


# Step 3: Build n-gram Model (n=2 and n=3)

In [3]:
def generate_ngrams(text, n):
    """Generate n-grams from the given text."""
    ngrams = [tuple(text[i : i + n]) for i in range(len(text) - n)]
    return Counter(ngrams)

# Generate bigrams (n=2) and trigrams (n=3)
bigrams = generate_ngrams(data, 2)
trigrams = generate_ngrams(data, 3)

print(f"Total unique bigrams: {len(bigrams)}")
print(f"Total unique trigrams: {len(trigrams)}")

Total unique bigrams: 1318
Total unique trigrams: 10033


# Step 4: Find the Most Frequent n-grams

In [4]:
def most_frequent_ngrams(ngram_counter, top_n=1):
    """Return the most common n-grams."""
    return ngram_counter.most_common(top_n)

most_common_bigrams = most_frequent_ngrams(bigrams, 1)
most_common_trigrams = most_frequent_ngrams(trigrams, 1)

print(f"Most common bigram: {most_common_bigrams}")
print(f"Most common trigram: {most_common_trigrams}")

Most common bigram: [(('e', ' '), 29077)]
Most common trigram: [((' ', 't', 'h'), 16237)]


# Step 5: Find the Most Likely Next Character for Each n-gram

In [5]:
def compute_next_char_probabilities(text, n):
    """Compute the probability distribution of the next character given an n-gram prefix."""
    ngram_dict = defaultdict(Counter)
    for i in range(len(text) - n):
        prefix = tuple(text[i : i + n - 1])  # (xt-1, xt-2, ..., xt-n+1)
        next_char = text[i + n - 1]  # xt
        ngram_dict[prefix][next_char] += 1
    
    # Convert counts to probabilities
    for prefix, counter in ngram_dict.items():
        total_count = sum(counter.values())
        for char in counter:
            counter[char] /= total_count
    
    return ngram_dict

# Compute next character probabilities for bigrams and trigrams
bigram_next_char_probs = compute_next_char_probabilities(data, 2)
trigram_next_char_probs = compute_next_char_probabilities(data, 3)

# Show example output
example_bigram_prefix = most_common_bigrams[0][0]  # Most common bigram prefix
example_trigram_prefix = most_common_trigrams[0][0]  # Most common trigram prefix

print(f"Most likely next characters for bigram {example_bigram_prefix}: {bigram_next_char_probs[example_bigram_prefix]}")
print(f"Most likely next characters for trigram {example_trigram_prefix}: {trigram_next_char_probs[example_trigram_prefix]}")

Most likely next characters for bigram ('e', ' '): Counter()
Most likely next characters for trigram (' ', 't', 'h'): Counter()


# Step 6: Generate Text using the n-gram Model

In [6]:
def generate_text(ngram_dict, seed, length=100):
    """Generate text using an n-gram probability distribution."""
    generated = list(seed)
    for _ in range(length):
        prefix = tuple(generated[-(len(seed)):])  # Match the prefix length
        if prefix in ngram_dict:
            next_char = random.choices(
                list(ngram_dict[prefix].keys()), 
                weights=ngram_dict[prefix].values()
            )[0]
            generated.append(next_char)
        else:
            break  # Stop if no continuation found
    return ''.join(generated)

# Generate three paragraphs of text using bigrams and trigrams
bigram_seed = random.choice(list(bigram_next_char_probs.keys()))
trigram_seed = random.choice(list(trigram_next_char_probs.keys()))

print("\nGenerated Text with Bigrams:")
print(generate_text(bigram_next_char_probs, bigram_seed, length=200))
print(generate_text(bigram_next_char_probs, bigram_seed, length=200))
print(generate_text(bigram_next_char_probs, bigram_seed, length=200))

print("\nGenerated Text with Trigrams:")
print(generate_text(trigram_next_char_probs, trigram_seed, length=200))
print(generate_text(trigram_next_char_probs, trigram_seed, length=200))
print(generate_text(trigram_next_char_probs, trigram_seed, length=200))


Generated Text with Bigrams:
hef GEShaneand thinove the aize se pe  mauts haloromay   ay PHE: waine thene wdiedlat we sti'd atouss I GLLar, nd d. hankind uge s. beakeand Whet Chisataifr's louchowesat hel. atatineaseen vas odeme: D
haimasho Cing; cks JO: her; RYo ad. toncaitilo- t by ULey hithand h ofit Y ht mu be bo t: cepasto! notherbaneive: KIOLERoswhin had myortou bo--f, ad'swianghaveallos h th, me wn mert tr, ngoralitre dsh 
hechin BABe s bun o D boupuryssatous uablfldofay. hor, Mit? heind helist: sto, a mettiofove DUETRELUCUCE: O: y CHato Whoud g INGOUTHoul yort m Fofle Gab, la hasist's, su plld IUThaw clonothiouepfeck yo

Generated Text with Trigrams:
o-mot thesse ings frefaughbastrust Myse dand So behen the car frouch RIA: I'llorratellich floondstiong'd wrought by not the O Than's hour mur lach of claid bleachal, aren, What so makinexce, Andoot ing,
o-mort mour of ereir be thy, wou re meneete Thind bat! Blose, MAR: How DUKE O, ithen hemakentles din hat, And a ping 't RICKING ES