In [None]:
# TASK 1 - FINAL

def preprocessing(text):
    """Lowercase *text* and split it into whitespace-delimited tokens.

    Returns the tokens as a list; an empty or all-whitespace string
    yields an empty list.
    """
    return text.lower().split()


def unigram_and_bigram(corpus_path):
    """Count unigrams and bigrams in a corpus holding one review per line.

    Every line is stripped, tokenized with ``preprocessing`` and counted on
    its own, so a bigram never spans two lines.

    Args:
        corpus_path: Path of the text file to read.

    Returns:
        A ``(unigram_counts, bigram_counts)`` tuple of dicts. The first maps
        each token to its number of occurrences; the second maps each
        ``(token, next_token)`` pair to its number of occurrences.
    """
    unigram_counts = {}
    bigram_counts = {}
    # Decode explicitly as UTF-8 so the counts do not depend on the
    # platform's default text encoding.
    with open(corpus_path, "r", encoding="utf-8") as f:
        for review in f:
            tokens = preprocessing(review.strip())
            # unigrams
            for token in tokens:
                unigram_counts[token] = unigram_counts.get(token, 0) + 1
            # bigrams: pair every token with its successor on the same line
            for bigram in zip(tokens, tokens[1:]):
                bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

    return unigram_counts, bigram_counts


def computing_probabilities(unigram_counts, bigram_counts):
    """Turn raw n-gram counts into unsmoothed relative frequencies.

    Args:
        unigram_counts: Dict mapping token -> count.
        bigram_counts: Dict mapping ``(first, second)`` -> count.

    Returns:
        A ``(unigram_prob, bigram_prob)`` tuple. ``unigram_prob[w]`` is the
        count of ``w`` divided by the sum of all unigram counts.
        ``bigram_prob[first][second]`` is the bigram count divided by the
        unigram count of ``first``.
    """
    corpus_size = sum(unigram_counts.values())

    # unigram
    unigram_prob = {
        token: freq / corpus_size for token, freq in unigram_counts.items()
    }

    # bigram, stored as a nested dict keyed by the context word
    bigram_prob = {}
    for pair, freq in bigram_counts.items():
        context, follower = pair
        bigram_prob.setdefault(context, {})[follower] = freq / unigram_counts[context]

    return unigram_prob, bigram_prob


if __name__ == "__main__":
    # Task 1 driver: count n-grams in the training file, convert the counts
    # to unsmoothed probabilities and print a sample of each table.
    train_file = "/content/train.txt"
    unigram_counts, bigram_counts = unigram_and_bigram(train_file)
    unigram_prob, bigram_prob = computing_probabilities(unigram_counts, bigram_counts)

    # printing top 10 unigrams
    # Rank by probability first: slicing the dict directly would only give
    # the first ten words encountered in the corpus, not the most probable.
    top_unigrams = sorted(
        unigram_prob.items(), key=lambda item: item[1], reverse=True
    )[:10]
    print("Top 10 unigram probabilities:")
    for word, prob in top_unigrams:
        print(f"P({word}) = {prob:.4f}")

    # printing bigram probabilities (first ten continuations seen for the
    # context word, in order of first appearance)
    test_word = "the"
    if test_word in bigram_prob:
        print(f"\nBigram probabilities for context '{test_word}':")
        for word, prob in list(bigram_prob[test_word].items())[:10]:
            print(f"P({word}|{test_word}) = {prob:.4f}")


Top 10 unigram probabilities:
P(i) = 0.0190
P(booked) = 0.0010
P(two) = 0.0014
P(rooms) = 0.0022
P(four) = 0.0002
P(months) = 0.0001
P(in) = 0.0140
P(advance) = 0.0001
P(at) = 0.0083
P(the) = 0.0590

Bigram probabilities for context 'the':
P(talbott|the) = 0.0049
P(top|the) = 0.0023
P(elevators|the) = 0.0021
P(front|the) = 0.0183
P(16th|the) = 0.0004
P(noise|the) = 0.0019
P(hotel|the) = 0.0780
P(elevator|the) = 0.0038
P(room|the) = 0.0557
P(bed|the) = 0.0119


In [None]:
# TASK 2 - FINAL

def preprocessing(corpus_path, unk_threshold=1):
    """Tokenize a corpus and replace rare words with ``<UNK>``.

    Each line of the file is one sentence. It is stripped, lowercased and
    split on whitespace. A word stays in the vocabulary only if it occurs
    more than ``unk_threshold`` times in the whole file; every other word is
    replaced by the ``<UNK>`` token.

    Args:
        corpus_path: Path of the text file to read.
        unk_threshold: Words with a count less than or equal to this value
            are mapped to ``<UNK>``.

    Returns:
        A ``(modified_sentences, vocab)`` tuple: the list of token lists
        after ``<UNK>`` replacement, and the vocabulary as a set that always
        contains ``"<UNK>"``.
    """
    unigram_counts = {}
    tokenized_sentences = []
    # Decode explicitly as UTF-8 so tokenization does not depend on the
    # platform's default text encoding.
    with open(corpus_path, "r", encoding="utf-8") as f:
        for line in f:
            tokens = line.strip().lower().split()

            for token in tokens:
                unigram_counts[token] = unigram_counts.get(token, 0) + 1

            tokenized_sentences.append(tokens)

    # vocabulary with <UNK>
    vocab = {word for word, count in unigram_counts.items() if count > unk_threshold}
    vocab.add("<UNK>")

    # rare words to <UNK>
    modified_sentences = [
        [word if word in vocab else "<UNK>" for word in sentence]
        for sentence in tokenized_sentences
    ]

    return modified_sentences, vocab



def count_ngrams(sentences):
    """Count unigrams and bigrams over sentences padded with boundary tokens.

    Each sentence (a list of tokens) is wrapped as ``<s> ... </s>`` before
    counting, so the boundary markers appear in both count tables.

    Returns:
        ``(unigram_counts, bigram_counts)``: dicts keyed by token and by
        ``(token, next_token)`` respectively.
    """
    unigram_counts = {}
    bigram_counts = {}
    for sentence in sentences:
        padded = ["<s>"] + sentence + ["</s>"]

        # unigrams
        for token in padded:
            if token in unigram_counts:
                unigram_counts[token] += 1
            else:
                unigram_counts[token] = 1

        # bigrams: each token paired with its successor
        for pair in zip(padded, padded[1:]):
            if pair in bigram_counts:
                bigram_counts[pair] += 1
            else:
                bigram_counts[pair] = 1

    return unigram_counts, bigram_counts


def bigram_laplace(bigram_counts, unigram_counts, vocab_size):
    """Compute add-one smoothed probabilities for the observed bigrams.

    For every bigram present in ``bigram_counts`` the result holds
    ``(count + 1) / (unigram_counts[first] + vocab_size)``. Bigrams that do
    not appear in ``bigram_counts`` get no entry.

    Returns:
        Nested dict ``probs[first][second]``.
    """
    bigram_probs = {}
    for (context, follower), seen in bigram_counts.items():
        smoothed = (seen + 1) / (unigram_counts[context] + vocab_size)
        bigram_probs.setdefault(context, {})[follower] = smoothed
    return bigram_probs


def bigram_addk(bigram_counts, unigram_counts, vocab_size, k=0.5):
    """Compute add-k smoothed probabilities for every pair of known tokens.

    Unlike a table built only from observed bigrams, this one has an entry
    for each ordered pair of keys of ``unigram_counts``, so its size is the
    square of the number of distinct tokens. Each entry is
    ``(count + k) / (unigram_counts[first] + k * vocab_size)``, where the
    count is 0 for pairs missing from ``bigram_counts``.

    Returns:
        Nested dict ``probs[first][second]``.
    """
    bigram_probs = {}
    for context in unigram_counts:
        row = {}
        for follower in unigram_counts:
            seen = bigram_counts.get((context, follower), 0)
            row[follower] = (seen + k) / (unigram_counts[context] + k * vocab_size)
        bigram_probs[context] = row
    return bigram_probs


if __name__ == "__main__":
    # Task 2 driver: build the <UNK>-aware vocabulary, count n-grams, smooth
    # the bigram table two ways and print a sample of each result.
    train_file = "/content/train.txt"
    sentences, vocab = preprocessing(train_file, unk_threshold=1)
    vocab_size = len(vocab)

    # n-grams
    unigram_counts, bigram_counts = count_ngrams(sentences)

    # bigram
    # Keep the results under their own names: assigning them to
    # `bigram_laplace` / `bigram_addk` would replace the functions with
    # dicts, so running this block a second time would raise TypeError.
    laplace_probs = bigram_laplace(bigram_counts, unigram_counts, vocab_size)
    addk_probs = bigram_addk(bigram_counts, unigram_counts, vocab_size, k=0.5)

    # printing top 10 unigram
    total_unigrams = sum(unigram_counts.values())
    unigram_probs = {}
    for word, count in unigram_counts.items():
        unigram_probs[word] = count / total_unigrams

    # Rank by probability first: slicing the dict directly would only give
    # the first ten tokens encountered, not the most probable ones.
    top_unigrams = sorted(
        unigram_probs.items(), key=lambda item: item[1], reverse=True
    )[:10]
    print("Top 10 unigram probabilities:")
    for word, prob in top_unigrams:
        print(f"P({word}) = {prob:.4f}")

    # printing bigram for word 'the' (first ten entries of each table)
    word = "the"
    if word in laplace_probs:
        print(f"\nBigram probabilities for context '{word}' (Laplace):")
        for word1, prob in list(laplace_probs[word].items())[:10]:
            print(f"P({word1}|{word}) = {prob:.4f}")

    if word in addk_probs:
        print(f"\nBigram probabilities for context '{word}' (Add-0.5):")
        for word1, prob in list(addk_probs[word].items())[:10]:
            print(f"P({word1}|{word}) = {prob:.4f}")


Top 10 unigram probabilities:
P(<s>) = 0.0056
P(i) = 0.0188
P(booked) = 0.0009
P(two) = 0.0014
P(rooms) = 0.0022
P(four) = 0.0002
P(months) = 0.0001
P(in) = 0.0139
P(advance) = 0.0001
P(at) = 0.0082

Bigram probabilities for context 'the' (Laplace):
P(talbott|the) = 0.0032
P(top|the) = 0.0015
P(elevators|the) = 0.0014
P(front|the) = 0.0117
P(16th|the) = 0.0004
P(noise|the) = 0.0013
P(hotel|the) = 0.0492
P(elevator|the) = 0.0025
P(room|the) = 0.0352
P(bed|the) = 0.0076

Bigram probabilities for context 'the' (Add-0.5):
P(<s>|the) = 0.0001
P(i|the) = 0.0002
P(booked|the) = 0.0001
P(two|the) = 0.0007
P(rooms|the) = 0.0131
P(four|the) = 0.0012
P(months|the) = 0.0001
P(in|the) = 0.0001
P(advance|the) = 0.0001
P(at|the) = 0.0001


In [None]:
# TASK 3:
import math

def preprocess_with_unk(corpus_path, unk_threshold=1):
    """Read a corpus (one sentence per line) and map rare words to ``<UNK>``.

    Lines are stripped, lowercased and split on whitespace. Words whose
    count over the whole file is greater than ``unk_threshold`` form the
    vocabulary; all other words are replaced by ``<UNK>``.

    Returns:
        ``(processed_sentences, vocab)``: the token lists after replacement
        and the vocabulary set, which always includes ``"<UNK>"``.
    """
    frequencies = {}
    raw_sentences = []

    with open(corpus_path, "r", encoding="utf-8") as corpus:
        for raw_line in corpus:
            words = raw_line.strip().lower().split()
            raw_sentences.append(words)
            for w in words:
                if w in frequencies:
                    frequencies[w] += 1
                else:
                    frequencies[w] = 1

    # Start from the <UNK> marker and add every sufficiently frequent word.
    vocab = {"<UNK>"}
    for w, freq in frequencies.items():
        if freq > unk_threshold:
            vocab.add(w)

    processed_sentences = []
    for words in raw_sentences:
        replaced = []
        for w in words:
            replaced.append(w if w in vocab else "<UNK>")
        processed_sentences.append(replaced)

    return processed_sentences, vocab

def count_ngrams(sentences):
    """Tally unigram and bigram frequencies for boundary-padded sentences.

    Every sentence (a list of tokens) is extended to ``<s> ... </s>`` and
    then counted, so both markers are included in the returned tables.

    Returns:
        ``(unigram_counts, bigram_counts)``, keyed by token and by
        ``(token, next_token)`` respectively.
    """
    unigram_counts = {}
    bigram_counts = {}

    for sentence in sentences:
        padded = ["<s>"] + sentence + ["</s>"]

        for tok in padded:
            unigram_counts[tok] = 1 + unigram_counts.get(tok, 0)

        # Adjacent pairs: everything but the last token against everything
        # but the first.
        for left, right in zip(padded[:-1], padded[1:]):
            key = (left, right)
            bigram_counts[key] = 1 + bigram_counts.get(key, 0)

    return unigram_counts, bigram_counts

def calculate_perplexity(validation_path, model_bigram_counts, model_unigram_counts, vocab, k=1.0):
    """Compute the perplexity of an add-k smoothed bigram model on a file.

    Each line of the file is one sentence. It is stripped, lowercased and
    split on whitespace; words not in ``vocab`` become ``<UNK>``; the result
    is wrapped as ``<s> ... </s>``. Every token after ``<s>`` (including
    ``</s>``) is scored with
    ``(count(w1, w2) + k) / (count(w1) + k * len(vocab))``.

    Args:
        validation_path: Path of the UTF-8 text file to evaluate.
        model_bigram_counts: Dict mapping ``(w1, w2)`` -> training count.
        model_unigram_counts: Dict mapping ``w1`` -> training count.
        vocab: Collection of known words; its length is the smoothing
            vocabulary size.
        k: Additive smoothing constant (1.0 gives Laplace smoothing).

    Returns:
        ``2 ** (-average log2 probability per scored token)``, or
        ``float('inf')`` when the file yields no scored tokens or when any
        token receives zero probability (possible with ``k=0``).
    """
    vocab_size = len(vocab)
    log_prob_sum = 0.0
    N = 0

    with open(validation_path, "r", encoding="utf-8") as f:
        for line in f:
            raw_tokens = line.strip().lower().split()
            processed = [word if word in vocab else "<UNK>" for word in raw_tokens]
            tokens = ["<s>"] + processed + ["</s>"]

            # One prediction per token after the leading <s>.
            N += len(tokens) - 1

            for w1, w2 in zip(tokens, tokens[1:]):
                bigram_count = model_bigram_counts.get((w1, w2), 0)
                unigram_count = model_unigram_counts.get(w1, 0)

                denominator = unigram_count + k * vocab_size
                if denominator != 0:
                    prob = (bigram_count + k) / denominator
                elif vocab_size:
                    # Unseen context without smoothing mass: fall back to a
                    # uniform distribution over the vocabulary.
                    prob = 1.0 / vocab_size
                else:
                    prob = 0.0

                # A zero-probability event makes the perplexity infinite;
                # report that instead of letting log(0) raise ValueError.
                if prob <= 0:
                    return float('inf')

                log_prob_sum += math.log2(prob)

    if N == 0:
        return float('inf')

    avg_log_prob = log_prob_sum / N
    perplexity = math.pow(2, -avg_log_prob)

    return perplexity


if __name__ == "__main__":
    # Task 3 driver: fit bigram counts on the training file, then report
    # validation perplexity under Laplace (k=1.0) and add-k (k=0.5) smoothing.
    train_file = "/content/train.txt"
    validation_file = "/content/val.txt"

    print(f"Training model on: {train_file}")
    train_sentences, vocab = preprocess_with_unk(train_file, unk_threshold=1)
    model_unigram_counts, model_bigram_counts = count_ngrams(train_sentences)
    print(f"Vocabulary size: {len(vocab)}")

    print(f"\nCalculating perplexity for: {validation_file}")

    # Both evaluations share everything except the smoothing constant.
    model_args = (validation_file, model_bigram_counts, model_unigram_counts, vocab)

    perplexity_laplace = calculate_perplexity(*model_args, k=1.0)
    print(f"Perplexity with Laplace (k=1.0) smoothing: {perplexity_laplace:.4f}")

    perplexity_add_k = calculate_perplexity(*model_args, k=0.5)
    print(f"Perplexity with Add-k (k=0.5) smoothing: {perplexity_add_k:.4f}")

Training model on: /content/train.txt
Vocabulary size: 3116

Calculating perplexity for: /content/val.txt
Perplexity with Laplace (k=1.0) smoothing: 428.8675
Perplexity with Add-k (k=0.5) smoothing: 316.5824
