In [12]:
from google.colab import files

uploaded = files.upload()


Saving brown.dev.txt to brown.dev.txt
Saving brown.test.txt to brown.test.txt
Saving brown.train.txt to brown.train.txt


In [13]:
# Define file paths based on the uploaded files
train_data_path = 'brown.train.txt'
dev_data_path = 'brown.dev.txt'
test_data_path = 'brown.test.txt'

# Load data
def load_data(file_path):
    with open(file_path, 'r') as file:
        data = file.readlines()
    return data

# Load datasets
train_data = load_data(train_data_path)
dev_data = load_data(dev_data_path)
test_data = load_data(test_data_path)


In [14]:
def preprocess(data):
    preprocessed_data = []
    for sentence in data:
        # Lowercase the sentence
        sentence = sentence.lower().strip()
        # Add start and end tokens
        sentence = f"<s> {sentence} </s>"
        preprocessed_data.append(sentence)
    return preprocessed_data

# Preprocess the datasets
train_data = preprocess(train_data)
dev_data = preprocess(dev_data)
test_data = preprocess(test_data)


In [15]:
from collections import Counter

# Count word frequencies in the training data
word_counts = Counter(" ".join(train_data).split())

# Replace rare words with <unk>
def replace_rare_words(data, word_counts, threshold=1):
    processed_data = []
    for sentence in data:
        processed_sentence = []
        for word in sentence.split():
            if word_counts[word] <= threshold:
                processed_sentence.append("<unk>")
            else:
                processed_sentence.append(word)
        processed_data.append(" ".join(processed_sentence))
    return processed_data

# Apply the transformation
train_data = replace_rare_words(train_data, word_counts)
test_data = replace_rare_words(test_data, word_counts)


In [16]:
import numpy as np

class UnigramModel:
    def __init__(self):
        self.model = None

    def train(self, data):
        word_counts = Counter(" ".join(data).split())
        total_words = sum(word_counts.values())
        self.model = {word: count / total_words for word, count in word_counts.items()}

    def probability(self, word):
        return self.model.get(word, self.model.get("<unk>", 0))

class BigramModel:
    def __init__(self, smoothing=False):
        self.bigram_counts = None
        self.unigram_counts = None
        self.smoothing = smoothing

    def train(self, data):
        bigram_counts = Counter()
        unigram_counts = Counter()
        for sentence in data:
            words = sentence.split()
            unigram_counts.update(words)
            bigram_counts.update([(words[i], words[i+1]) for i in range(len(words) - 1)])
        self.bigram_counts = bigram_counts
        self.unigram_counts = unigram_counts

    def probability(self, word1, word2):
        bigram_count = self.bigram_counts.get((word1, word2), 0)
        unigram_count = self.unigram_counts.get(word1, 0)
        if self.smoothing:
            bigram_count += 1
            unigram_count += len(self.unigram_counts)
        return bigram_count / unigram_count if unigram_count > 0 else 0

# Train unigram model
unigram_model = UnigramModel()
unigram_model.train(train_data)

# Train bigram models
bigram_model = BigramModel()
bigram_model.train(train_data)

bigram_addone_model = BigramModel(smoothing=True)
bigram_addone_model.train(train_data)


In [17]:
unique_words = len(unigram_model.model)
total_words = sum(Counter(" ".join(train_data).split()).values())

print(f"Number of unique word types: {unique_words}")
print(f"Number of word tokens: {total_words}")


Number of unique word types: 24796
Number of word tokens: 1018784


In [18]:
def unknown_words_percentage(test_data, word_counts):
    total_tokens = 0
    unknown_tokens = 0
    unique_test_words = set()
    unknown_test_words = set()

    for sentence in test_data:
        words = sentence.split()
        total_tokens += len(words)
        unique_test_words.update(words)
        unknown_tokens += sum(1 for word in words if word_counts[word] == 0)
        unknown_test_words.update([word for word in words if word_counts[word] == 0])

    return (unknown_tokens / total_tokens) * 100, (len(unknown_test_words) / len(unique_test_words)) * 100

unknown_token_percentage, unknown_type_percentage = unknown_words_percentage(test_data, word_counts)
print(f"Percentage of unknown word tokens in test data: {unknown_token_percentage}%")
print(f"Percentage of unknown word types in test data: {unknown_type_percentage}%")


Percentage of unknown word tokens in test data: 3.2206874447910074%
Percentage of unknown word types in test data: 0.008508465923593977%


In [19]:
sentences = [
    "he was laughed off the screen .",
    "there was no compulsion behind them .",
    "i look forward to hearing your reply ."
]

def log_probability_sentence(sentence, model, model_type="unigram"):
    words = sentence.split()
    log_prob = 0
    if model_type == "unigram":
        for word in words:
            prob = model.probability(word)
            log_prob += np.log(prob) if prob > 0 else -np.inf
    elif model_type == "bigram":
        for i in range(len(words) - 1):
            prob = model.probability(words[i], words[i + 1])
            log_prob += np.log(prob) if prob > 0 else -np.inf
    return log_prob

for sentence in sentences:
    print(f"Sentence: {sentence}")
    print(f"Unigram log probability: {log_probability_sentence(sentence, unigram_model, 'unigram')}")
    print(f"Bigram log probability: {log_probability_sentence(sentence, bigram_model, 'bigram')}")
    print(f"Bigram Add-One log probability: {log_probability_sentence(sentence, bigram_addone_model, 'bigram')}")
    print("\n")


Sentence: he was laughed off the screen .
Unigram log probability: -43.79932926890675
Bigram log probability: -26.275048110491174
Bigram Add-One log probability: -44.69653560039508


Sentence: there was no compulsion behind them .
Unigram log probability: -47.61873728327696
Bigram log probability: -19.83630885635591
Bigram Add-One log probability: -40.90488974022167


Sentence: i look forward to hearing your reply .
Unigram log probability: -57.23484105538851
Bigram log probability: -inf
Bigram Add-One log probability: -62.16420607795311




In [20]:
def perplexity(test_data, model, model_type="unigram"):
    log_prob_sum = 0
    N = 0
    for sentence in test_data:
        words = sentence.split()
        N += len(words)
        log_prob_sum += log_probability_sentence(sentence, model, model_type)
    return np.exp(-log_prob_sum / N)

print(f"Unigram model perplexity: {perplexity(test_data, unigram_model, 'unigram')}")
print(f"Bigram model perplexity: {perplexity(test_data, bigram_model, 'bigram')}")
print(f"Bigram Add-One model perplexity: {perplexity(test_data, bigram_addone_model, 'bigram')}")


Unigram model perplexity: 623.5664891915749
Bigram model perplexity: inf
Bigram Add-One model perplexity: 1220.68388569207
