In [24]:
import re
import random

In [25]:
def tokenize(text):
    """Split text into a list of lowercase tokens.

    A token is either a run of word characters or one single character that
    is neither a word character nor whitespace. Whitespace is discarded.
    """
    lowered = text.lower()
    token_pattern = r'\b\w+\b|[^\w\s]'
    return re.findall(token_pattern, lowered)

In [26]:
def create_n_gram_model(tokens, n):
    """Build one probability table covering every N-gram order from 1 to n.

    Each N-gram (a tuple of tokens) maps to its count divided by the number
    of N-grams of the same order that share its context, where the context
    is the N-gram without its last token.
    """
    gram_tally = {}
    prefix_tally = {}

    for size in range(1, n + 1):
        # Zipping `size` shifted views of the sequence yields every window of
        # `size` consecutive tokens, left to right.
        shifted_views = [tokens[offset:] for offset in range(size)]
        for gram in zip(*shifted_views):
            prefix = gram[:-1]

            if gram in gram_tally:
                gram_tally[gram] += 1
            else:
                gram_tally[gram] = 1

            if prefix in prefix_tally:
                prefix_tally[prefix] += 1
            else:
                prefix_tally[prefix] = 1

    # Turn raw counts into probabilities relative to each N-gram's context.
    return {
        gram: occurrences / prefix_tally[gram[:-1]]
        for gram, occurrences in gram_tally.items()
    }

In [27]:
def interpolate_models(models, weights):
    """Merge several N-gram models into a single weighted table.

    `models` maps an order (1, 2, ...) to a dict of N-gram -> probability,
    and `weights[order - 1]` is the factor applied to that order's
    probabilities. An N-gram present in more than one model receives the sum
    of its weighted probabilities.
    """
    combined = {}
    for order in models:
        for n_gram, prob in models[order].items():
            contribution = weights[order - 1] * prob
            if n_gram not in combined:
                combined[n_gram] = contribution
            else:
                combined[n_gram] += contribution
    return combined

In [34]:
def generate_text(model, start_words, num_words):
    """Generate text by sampling from an N-gram weight table.

    Args:
        model: Mapping of N-gram tuples to non-negative weights. The last
            element of each tuple is the candidate next word; the elements
            before it are the context that must match.
        start_words: Seed text, either a whitespace-separated string or a
            sequence of tokens. May be empty, in which case the first word
            is drawn from the 1-grams in `model`.
        num_words: Target total number of tokens in the result, seed
            included.

    Returns:
        The seed tokens followed by the sampled tokens, joined by single
        spaces. Generation stops early when the current context (the last
        two tokens, or fewer if the text is shorter) has no continuation in
        `model`.
    """
    if isinstance(start_words, str):
        start_words = tuple(start_words.split())

    text = list(start_words)
    words_to_generate = num_words - len(text)
    if words_to_generate <= 0:
        return ' '.join(text)

    # Group the candidates by context once, instead of scanning the whole
    # model for every generated word. Candidates keep the model's iteration
    # order, so seeded sampling is unaffected by the grouping.
    continuations = {}
    for n_gram, weight in model.items():
        if not n_gram:
            continue  # an empty N-gram has no "next word" to offer
        words, word_weights = continuations.setdefault(n_gram[:-1], ([], []))
        words.append(n_gram[-1])
        word_weights.append(weight)

    for _ in range(words_to_generate):
        # Derive the context from the text itself so an empty or one-word
        # seed grows into a two-word context without indexing an empty tuple.
        context = tuple(text[-2:])
        if context not in continuations:
            break
        words, word_weights = continuations[context]
        text.append(random.choices(words, weights=word_weights)[0])

    return ' '.join(text)

In [35]:
# Load the corpus from a file.
# NOTE: this is a machine-specific path; point CORPUS_PATH at your local copy.
CORPUS_PATH = r"C:\Users\Hamza\Desktop\NLP\J. K. Rowling - Harry Potter 4 - The Goblet of Fire.txt"

# Decode explicitly rather than with the platform's default encoding, which
# differs between machines. Assumes the file is UTF-8 (or plain ASCII) —
# TODO confirm. Undecodable bytes become U+FFFD instead of aborting the load.
with open(CORPUS_PATH, encoding="utf-8", errors="replace") as file:
    corpus = file.read()

In [36]:
tokens = tokenize(corpus)

# create_n_gram_model(tokens, n) returns every order from 1 through n in a
# single dict. Calling it with n=1, n=2 and n=3 would therefore put the
# unigrams into all three models (and the bigrams into two of them), so the
# interpolation step would add their weights more than once. Build the table
# once at the highest order and split it by N-gram length instead.
all_orders_model = create_n_gram_model(tokens, 3)
unigram_model = {n_gram: prob for n_gram, prob in all_orders_model.items() if len(n_gram) == 1}
bigram_model = {n_gram: prob for n_gram, prob in all_orders_model.items() if len(n_gram) == 2}
trigram_model = {n_gram: prob for n_gram, prob in all_orders_model.items() if len(n_gram) == 3}

In [37]:
# Interpolation weights, listed in order: unigram, bigram, trigram
weights = [0.1, 0.4, 0.5]
# Models keyed by N-gram order (1 = unigram, 2 = bigram, 3 = trigram)
models = dict(enumerate((unigram_model, bigram_model, trigram_model), start=1))

In [40]:
# Seed the random generator so re-running this cell reproduces the same
# sample; change the seed to draw a different one.
random.seed(42)

interpolated_model = interpolate_models(models, weights)
generated_text = generate_text(interpolated_model, "he had awoken", 100)
print(generated_text)

he had awoken with his wand - maker from whom harry would be quicker , i ' m not , wormtail ' s face , which none but the rippling weed . " vot is wrong with me , if they had said , not even i can ' t he ? didn ' t have to really get to you all tonight , however , sirius would rather have done , straightened up , nodding toward the leprechauns , who opened his eyes still glinted malevolently through his hair . the dark and the students from beauxbatons and durmstrang
