In [2]:
import requests
import re
from collections import Counter

# STEP 1: CORPUS CREATION

# Maps each book title to the URL of its plain-text file; insertion order
# is the order in which the books are downloaded and concatenated.
BOOK_URLS = dict((
    ("Pride and Prejudice", "https://www.gutenberg.org/files/1342/1342-0.txt"),
    ("Little Women", "https://www.gutenberg.org/files/514/514-0.txt"),
    ("Alice in Wonderland", "https://www.gutenberg.org/files/11/11-0.txt"),
    ("Sherlock Holmes", "https://www.gutenberg.org/files/1661/1661-0.txt"),
))

def load_corpus(urls, timeout=30):
    """Download every book in *urls* and return the texts as one string.

    Args:
        urls: Mapping of book title to the URL of its text file. Titles are
            only used for the progress message.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The downloaded texts, each followed by a newline, concatenated in
        the mapping's iteration order. An empty mapping yields "".

    Raises:
        requests.HTTPError: If a server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    parts = []
    for title, url in urls.items():
        print(f"Loading: {title}")
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of feeding an HTML error page into the corpus.
        response.raise_for_status()
        parts.append(response.text + "\n")
    # Join once rather than growing one large string with repeated +=.
    return "".join(parts)

def preprocess(text):
    """Turn raw text into a list of lower-case alphabetic tokens.

    Every character other than a-z or whitespace (digits, punctuation,
    accented letters, ...) is treated as a separator, so "don't" becomes
    the two tokens "don" and "t".

    Args:
        text: The raw text to tokenize.

    Returns:
        The tokens in their original order; an empty list if *text* holds
        no a-z letters.
    """
    non_letters = re.compile(r'[^a-z\s]')
    whitespace_runs = re.compile(r'\s+')

    letters_only = non_letters.sub(' ', text.lower())
    single_spaced = whitespace_runs.sub(' ', letters_only)
    return single_spaced.strip().split()

# Build the corpus at module level: download all books, then reduce the
# combined text to a flat list of word tokens used by every later step.
raw_text = load_corpus(BOOK_URLS)
tokens = preprocess(raw_text)

# Report the corpus size (total token count, not distinct words).
print("\nCorpus Statistics:")
print("Total tokens:", len(tokens))

# STEP 2: N-GRAM CONSTRUCTION

# Each Counter maps an n-gram to its number of occurrences in `tokens`.
# Bigram and trigram keys are tuples of consecutive tokens.
unigrams = Counter(tokens)
# zip stops at its shortest argument, so the shifted slices alone bound
# the sliding windows; no need to trim the leading sequence.
bigrams = Counter(zip(tokens, tokens[1:]))
trigrams = Counter(zip(tokens, tokens[1:], tokens[2:]))

# Number of distinct word types; the add-1 smoothing term in the
# probability functions.
V = len(unigrams)

print("Vocabulary size:", V)
print("Unique bigrams:", len(bigrams))
print("Unique trigrams:", len(trigrams))

# STEP 3: PROBABILITY FUNCTIONS (ADD-1 SMOOTHING)

def bigram_probability(w1, w2):
    """Return the add-1 smoothed estimate of P(w2 | w1).

    Computed as (count(w1, w2) + 1) / (count(w1) + V). Unseen words and
    pairs count as zero, so they still receive a non-zero probability.
    """
    pair_count = bigrams[(w1, w2)]
    context_count = unigrams[w1]
    return (pair_count + 1) / (context_count + V)

def trigram_probability(w1, w2, w3):
    """Return the add-1 smoothed estimate of P(w3 | w1, w2).

    Computed as (count(w1, w2, w3) + 1) / (count(w1, w2) + V). Unseen
    contexts and triples count as zero.
    """
    triple_count = trigrams[(w1, w2, w3)]
    context_count = bigrams[(w1, w2)]
    return (triple_count + 1) / (context_count + V)

# STEP 4: NEXT WORD PREDICTION

def predict_next_bigram(word, top_k=5):
    """Return up to *top_k* words most likely to follow *word*.

    Only words actually observed after *word* in the corpus are considered,
    ranked by smoothed bigram probability (highest first). Ties keep the
    order in which the bigrams are stored. A word that never appears as the
    first element of a bigram yields an empty list.
    """
    followers = [second for first, second in bigrams if first == word]
    # list.sort is stable, including with reverse=True, so equally likely
    # followers stay in their stored order.
    followers.sort(
        key=lambda second: bigram_probability(word, second),
        reverse=True,
    )
    return followers[:top_k]

def predict_next_trigram(w1, w2, top_k=5):
    """Return up to *top_k* words most likely to follow the pair (w1, w2).

    If the pair was never observed as a bigram, the prediction backs off to
    the bigram model conditioned on *w2* alone. Otherwise every vocabulary
    word is scored with the smoothed trigram probability, so words never
    seen after this context can still fill the tail of the result.
    """
    if bigrams[(w1, w2)] == 0:
        # Unseen context: fall back to the bigram model.
        return predict_next_bigram(w2, top_k)

    scores = {
        candidate: trigram_probability(w1, w2, candidate)
        for candidate in unigrams
    }
    # Stable descending sort: ties keep vocabulary (insertion) order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [candidate for candidate, _ in ranked[:top_k]]

# STEP 5: INTERACTIVE USER PROMPT

def interactive_prediction():
    """Run a console loop that suggests next words for typed context.

    Reads lines from standard input until the user types 'exit' or the
    input stream ends. A single word is answered by the bigram predictor;
    two words are answered by the trigram predictor, which itself backs off
    to bigrams for unseen contexts. Any other number of words prints a
    usage hint. Returns None; all results are printed.
    """
    print("\n--- NEXT WORD PREDICTION ---")
    print("Enter one or two words (e.g., 'to be') or type 'exit' to quit.\n")

    while True:
        try:
            user_input = input("Input: ").lower().strip()
        except EOFError:
            # stdin was closed (Ctrl-D, exhausted pipe): leave cleanly
            # instead of crashing with a traceback.
            print("\nExiting prediction.")
            break

        if user_input == "exit":
            print("Exiting prediction.")
            break

        words = user_input.split()

        if len(words) not in (1, 2):
            print("Please enter only one or two words.\n")
            continue

        # The word directly before the prediction must be known. In the
        # two-word case an unknown last word would otherwise back off to an
        # empty bigram result and print a bare empty list.
        if words[-1] not in unigrams:
            print("Word not found in vocabulary.\n")
            continue

        if len(words) == 1:
            predictions = predict_next_bigram(words[0])
            print("Bigram Predictions:", predictions, "\n")
        else:
            w1, w2 = words
            predictions = predict_next_trigram(w1, w2)
            print("Trigram Predictions:", predictions, "\n")

# Start the interactive prompt as soon as this cell/script is executed.
interactive_prediction()


Loading: Pride and Prejudice
Loading: Little Women
Loading: Alice in Wonderland
Loading: Sherlock Holmes

Corpus Statistics:
Total tokens: 459274
Vocabulary size: 16449
Unique bigrams: 169160
Unique trigrams: 359239

--- NEXT WORD PREDICTION ---
Enter one or two words (e.g., 'to be') or type 'exit' to quit.

Input: to be
Trigram Predictions: ['a', 'the', 'sure', 'in', 'so'] 

Input: what
Bigram Predictions: ['i', 'is', 'she', 'a', 'you'] 

Input: you are
Trigram Predictions: ['not', 'a', 'very', 'the', 'too'] 

Input: clear
Bigram Predictions: ['to', 'that', 'up', 'the', 'enough'] 

Input: exit
Exiting prediction.
