<a href="https://colab.research.google.com/github/josecuervo420/576a2/blob/main/miniproj1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import random
from nltk.corpus import brown
import math
from collections import Counter, defaultdict


# Ensure you have the Brown corpus downloaded:
import nltk
nltk.download('brown')

# Retrieve the sentences from the Brown corpus
brown_sentences = brown.sents()

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [5]:
# Split the data into training, validation, and test sets
D_train = brown_sentences[:40138]
D_val = brown_sentences[40138:45872]
D_test = brown_sentences[45872:]

# Function to build a unigram model
def build_unigram_model(sentences):
    model = Counter()
    for sentence in sentences:
        for word in sentence:
            model[word] += 1
    total_count = sum(model.values())
    for word in model:
        model[word] /= total_count
    return model

# Function to build a bigram model
def build_bigram_model(sentences):
    model = defaultdict(Counter)
    for sentence in sentences:
        previous_word = None
        for word in sentence:
            if previous_word is not None:
                model[previous_word][word] += 1
            previous_word = word
    for previous_word in model:
        total_count = sum(model[previous_word].values())
        for word in model[previous_word]:
            model[previous_word][word] /= total_count
    return model

# Function to calculate perplexity
def calculate_perplexity(model, sentences, n_gram=1):
    perplexity = 1
    N = 0
    for sentence in sentences:
        for i in range(len(sentence) - n_gram + 1):
            if n_gram == 1:
                word = sentence[i]
                # Using a small value to avoid log(0)
                word_probability = model.get(word, 1e-6)
            else:
                previous_word = sentence[i-1] if i > 0 else None
                word = sentence[i]
                word_probability = model[previous_word].get(word, 1e-6)  # Using a small value to avoid log(0)
            perplexity = perplexity * (1 / word_probability)
            N += 1
    perplexity = math.pow(perplexity, 1/float(N))
    return perplexity

In [6]:
# Build unigram and bigram models
unigram_model = build_unigram_model(D_train)
bigram_model = build_bigram_model(D_train)

# Calculate perplexity
unigram_perplexity_train = calculate_perplexity(unigram_model, D_train)
bigram_perplexity_train = calculate_perplexity(bigram_model, D_train, n_gram=2)
unigram_perplexity_test = calculate_perplexity(unigram_model, D_test)
bigram_perplexity_test = calculate_perplexity(bigram_model, D_test, n_gram=2)

# Print the perplexities
print(f"Unigram Perplexity on Training Data: {unigram_perplexity_train}")
print(f"Bigram Perplexity on Training Data: {bigram_perplexity_train}")
print(f"Unigram Perplexity on Test Data: {unigram_perplexity_test}")
print(f"Bigram Perplexity on Test Data: {bigram_perplexity_test}")

Unigram Perplexity on Training Data: inf
Bigram Perplexity on Training Data: inf
Unigram Perplexity on Test Data: inf
Bigram Perplexity on Test Data: inf


In [7]:
# Implement add-λ smoothing method for unigram model
def add_lambda_smoothing_unigram(unigram_model, lambda_, vocabulary_size):
    smoothed_model = {}
    total_count = sum(unigram_model.values())
    for word, count in unigram_model.items():
        smoothed_model[word] = (count + lambda_) / (total_count + lambda_ * vocabulary_size)
    return smoothed_model

# Implement add-λ smoothing method for bigram model
def add_lambda_smoothing_bigram(bigram_model, lambda_, vocabulary_size):
    smoothed_model = defaultdict(lambda: defaultdict(float))
    for previous_word, word_counts in bigram_model.items():
        total_count = sum(word_counts.values())
        for word, count in word_counts.items():
            smoothed_model[previous_word][word] = (count + lambda_) / (total_count + lambda_ * vocabulary_size)
    return smoothed_model

# Perform a grid search over a range of λ values for unigram model
lambda_values = np.linspace(0.1, 2.0, 20)  # Example range, adjust as needed
best_lambda_unigram = lambda_values[0]
best_perplexity_unigram = float('inf')

for lambda_ in lambda_values:
    smoothed_model = add_lambda_smoothing_unigram(unigram_model, lambda_, len(unigram_model))
    perplexity = calculate_perplexity(smoothed_model, D_val)
    print(f"λ = {lambda_}, Unigram Perplexity = {perplexity}")

    if perplexity < best_perplexity_unigram:
        best_perplexity_unigram = perplexity
        best_lambda_unigram = lambda_

print(f"Best λ for Unigram: {best_lambda_unigram}, with perplexity: {best_perplexity_unigram}")

# Perform a grid search over a range of λ values for bigram model
best_lambda_bigram = lambda_values[0]
best_perplexity_bigram = float('inf')

for lambda_ in lambda_values:
    smoothed_model = add_lambda_smoothing_bigram(bigram_model, lambda_, len(unigram_model))
    perplexity = calculate_perplexity(smoothed_model, D_val, n_gram=2)
    print(f"λ = {lambda_}, Bigram Perplexity = {perplexity}")

    if perplexity < best_perplexity_bigram:
        best_perplexity_bigram = perplexity
        best_lambda_bigram = lambda_

print(f"Best λ for Bigram: {best_lambda_bigram}, with perplexity: {best_perplexity_bigram}")

  perplexity = perplexity * (1 / word_probability)


λ = 0.1, Unigram Perplexity = inf
λ = 0.2, Unigram Perplexity = inf
λ = 0.3, Unigram Perplexity = inf
λ = 0.4, Unigram Perplexity = inf
λ = 0.5, Unigram Perplexity = inf
λ = 0.6, Unigram Perplexity = inf
λ = 0.7, Unigram Perplexity = inf
λ = 0.7999999999999999, Unigram Perplexity = inf
λ = 0.8999999999999999, Unigram Perplexity = inf
λ = 0.9999999999999999, Unigram Perplexity = inf
λ = 1.0999999999999999, Unigram Perplexity = inf
λ = 1.2, Unigram Perplexity = inf
λ = 1.3, Unigram Perplexity = inf
λ = 1.4, Unigram Perplexity = inf
λ = 1.5, Unigram Perplexity = inf
λ = 1.5999999999999999, Unigram Perplexity = inf
λ = 1.7, Unigram Perplexity = inf
λ = 1.8, Unigram Perplexity = inf
λ = 1.9, Unigram Perplexity = inf
λ = 2.0, Unigram Perplexity = inf
Best λ for Unigram: 0.1, with perplexity: inf
λ = 0.1, Bigram Perplexity = inf
λ = 0.2, Bigram Perplexity = inf
λ = 0.3, Bigram Perplexity = inf
λ = 0.4, Bigram Perplexity = inf
λ = 0.5, Bigram Perplexity = inf
λ = 0.6, Bigram Perplexity = inf
λ

In [8]:
# Combine training and validation data for retraining
D_train_val = D_train + D_val

# Retrain unigram and bigram models with the combined data using best λ values
unigram_model_retrained = add_lambda_smoothing_unigram(build_unigram_model(D_train_val), best_lambda_unigram, len(unigram_model))
bigram_model_retrained = add_lambda_smoothing_bigram(build_bigram_model(D_train_val), best_lambda_bigram, len(unigram_model))

# Calculate perplexity of the retrained models on the test data
unigram_perplexity_test_retrained = calculate_perplexity(unigram_model_retrained, D_test)
bigram_perplexity_test_retrained = calculate_perplexity(bigram_model_retrained, D_test, n_gram=2)

print(f"Retrained Unigram Perplexity on Test Data: {unigram_perplexity_test_retrained}")
print(f"Retrained Bigram Perplexity on Test Data: {bigram_perplexity_test_retrained}")

  perplexity = perplexity * (1 / word_probability)


Retrained Unigram Perplexity on Test Data: inf
Retrained Bigram Perplexity on Test Data: inf


In [None]:
# Generate random sentences from the unigram and bigram models
def generate_sentence_from_model(model, stop_symbol, is_bigram=False):
    sentence = []
    if is_bigram:
        word = '<s>'  # Assuming <s> is your start symbol
        while True:
            next_words = list(model[word].keys())
            probabilities = list(model[word].values())
            next_word = random.choices(next_words, weights=probabilities)[0]
            if next_word == stop_symbol or next_word == '</s>':
                break
            sentence.append(next_word)
            word = next_word
    else:
        while True:
            next_words = list(model.keys())
            probabilities = list(model.values())
            next_word = random.choices(next_words, weights=probabilities)[0]
            if next_word == stop_symbol:
                break
            sentence.append(next_word)
    return ' '.join(sentence)

# Generate sentences from the retrained unigram and bigram models
print("Generated sentences from the unigram model:")
for _ in range(5):
    print(generate_sentence_from_model(unigram_model_retrained, '</s>'))

print("\nGenerated sentences from the bigram model:")
for _ in range(5):
    print(generate_sentence_from_model(bigram_model_retrained, '</s>', is_bigram=True))

Generated sentences from the unigram model:


In [None]:
# Assuming you have a trigram model trained
def generate_sentence_from_trigram(model, stop_symbol):
    sentence = ['<s>', '<s>']  # Assuming <s> is your start symbol
    while True:
        context = tuple(sentence[-2:])
        if context not in model:
            break
        next_words = list(model[context].keys())
        probabilities = list(model[context].values())
        next_word = random.choices(next_words, weights=probabilities)[0]
        if next_word == stop_symbol or next_word == '</s>':
            break
        sentence.append(next_word)
    return ' '.join(sentence[2:])

# Generate sentences from the trigram model
print("\nGenerated sentences from the trigram model:")
for _ in range(5):
    print(generate_sentence_from_trigram(trigram_model_retrained, '</s>'))