In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict, Counter
import math

# Fetch the NLTK data used below. Newer NLTK releases load the Punkt
# tokenizer from the 'punkt_tab' resource instead of the pickled 'punkt'
# package, so both are requested to keep word_tokenize working regardless of
# the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Toy corpus: each string is one document to be ranked against the query.
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "A quick brown dog barks loudly.",
    "The lazy dog sleeps in the sun.",
    "A brown fox runs fast.",
    "The sun is bright today.",
]

# Normalise every document: lower-case, tokenize, then keep only purely
# alphabetic tokens that are not English stop words.
stop_words = set(stopwords.words('english'))
preprocessed_documents = [
    [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    for text in documents
]

def get_unigram_counts(doc):
    """Tally how often each token occurs in a tokenized document.

    Args:
        doc: Iterable of hashable tokens.

    Returns:
        A ``(counts, total)`` tuple where ``counts`` is a ``Counter`` mapping
        each token to its frequency and ``total`` is the sum of all
        frequencies.
    """
    frequencies = Counter()
    frequencies.update(doc)
    token_total = sum(frequencies.values())
    return frequencies, token_total

def get_bigram_counts(doc):
    """Tally adjacent token pairs of a document, including boundary markers.

    The token list is framed with ``"<s>"`` at the start and ``"</s>"`` at the
    end, so the first and last tokens each take part in a boundary bigram and
    an empty document still yields the single pair ``("<s>", "</s>")``.

    Args:
        doc: List of tokens.

    Returns:
        A ``(counts, total)`` tuple where ``counts`` is a ``Counter`` keyed by
        ``(previous, current)`` pairs and ``total`` is the number of bigrams.
    """
    padded = ["<s>"] + doc + ["</s>"]
    pair_counts = Counter(zip(padded, padded[1:]))
    return pair_counts, sum(pair_counts.values())

def unigram_prob(word, unigram_counts, total, alpha=1, *, vocab_size=None):
    """Return the add-alpha smoothed unigram probability of ``word``.

    Computes ``(count(word) + alpha) / (total + alpha * V)``.

    Args:
        word: Token whose probability is wanted.
        unigram_counts: Mapping from token to its count in the document.
        total: Total number of tokens counted in ``unigram_counts``.
        alpha: Additive smoothing constant (1 gives Laplace smoothing).
        vocab_size: Vocabulary size ``V`` used for smoothing. When omitted it
            defaults to the number of distinct tokens in ``unigram_counts``,
            which does not include words absent from the document; pass a
            corpus-wide vocabulary size to smooth over a shared vocabulary.

    Returns:
        The smoothed probability as a float.

    Raises:
        ZeroDivisionError: If ``total + alpha * V`` is zero, e.g. for an empty
            document with the default vocabulary size.
    """
    if vocab_size is None:
        vocab_size = len(unigram_counts)
    numerator = unigram_counts.get(word, 0) + alpha
    denominator = total + alpha * vocab_size
    return numerator / denominator

def bigram_prob(w1, w2, bigram_counts, unigram_counts, alpha=1):
    """Return the add-alpha smoothed conditional probability ``P(w2 | w1)``.

    Computes ``(count(w1, w2) + alpha) / (count(w1) + alpha * V)`` where
    ``count(w1)`` is how often ``w1`` occurs as a context and ``V`` is the
    number of distinct tokens that may follow a context.

    Args:
        w1: Context (previous) token; may be the start marker ``"<s>"``.
        w2: Predicted (current) token; may be the end marker ``"</s>"``.
        bigram_counts: Mapping from ``(previous, current)`` pairs to counts.
        unigram_counts: Mapping from token to its count in the document.
        alpha: Additive smoothing constant (1 gives Laplace smoothing).

    Returns:
        The smoothed conditional probability as a float.

    Raises:
        ZeroDivisionError: If ``alpha`` is 0 and ``w1`` never occurs as a
            context.
    """
    # Number of times w1 is followed by anything. For ordinary tokens this is
    # the unigram count; boundary markers such as "<s>" are not in
    # unigram_counts, so their context count is recovered from the bigrams.
    context_count = unigram_counts.get(w1, 0)
    if not context_count:
        context_count = sum(
            count for pair, count in bigram_counts.items() if pair[0] == w1
        )

    # Every vocabulary word AND the end marker can follow a context, so the
    # end marker must be part of the smoothing vocabulary for the conditional
    # distribution to sum to one.
    vocab_size = len(unigram_counts)
    if "</s>" not in unigram_counts:
        vocab_size += 1

    numerator = bigram_counts.get((w1, w2), 0) + alpha
    return numerator / (context_count + alpha * vocab_size)

def query_log_likelihood_unigram(query, docs):
    """Score each document by the unigram log-likelihood of the query.

    For every document a unigram model is built with ``get_unigram_counts``
    and the query score is the sum of ``log(unigram_prob(w, ...))`` over the
    query tokens, using ``unigram_prob``'s default smoothing.

    Args:
        query: Iterable of query tokens (iterated once per document).
        docs: Iterable of tokenized documents (lists of tokens).

    Returns:
        A list of ``(document_index, log_likelihood)`` tuples in the order of
        ``docs``. A document with no tokens scores ``float("-inf")`` for a
        non-empty query.
    """
    results = []
    for i, doc in enumerate(docs):
        if query and not doc:
            # A document left empty (e.g. after stop-word removal) has zero
            # tokens and zero vocabulary, so the smoothed estimate would
            # divide by zero. It offers no evidence for any query term, so
            # rank it last instead of crashing.
            results.append((i, float("-inf")))
            continue
        unigram_counts, total = get_unigram_counts(doc)
        log_prob = 0
        for w in query:
            log_prob += math.log(unigram_prob(w, unigram_counts, total))
        results.append((i, log_prob))
    return results

def query_log_likelihood_bigram(query, docs):
    """Score each document by the bigram log-likelihood of the query.

    For every document, unigram and bigram counts are built with
    ``get_unigram_counts`` and ``get_bigram_counts``. The query is scored as
    the sum of ``log(bigram_prob(previous, current, ...))`` along the sequence
    ``"<s>", *query, "</s>"``, using ``bigram_prob``'s default smoothing.

    Args:
        query: Iterable of query tokens.
        docs: Iterable of tokenized documents (lists of tokens).

    Returns:
        A list of ``(document_index, log_likelihood)`` tuples in the order of
        ``docs``. A document with no tokens scores ``float("-inf")``.
    """
    # Materialise the scored sequence once; it is the same for every document.
    sequence = [*query, "</s>"]
    results = []
    for i, doc in enumerate(docs):
        if not doc:
            # A document left empty (e.g. after stop-word removal) has no
            # vocabulary to smooth over; rank it last rather than risk a
            # division by zero in the probability estimate.
            results.append((i, float("-inf")))
            continue
        unigram_counts, _ = get_unigram_counts(doc)
        bigram_counts, _ = get_bigram_counts(doc)
        log_prob = 0
        prev_word = "<s>"
        for w in sequence:
            log_prob += math.log(bigram_prob(prev_word, w, bigram_counts, unigram_counts))
            prev_word = w
        results.append((i, log_prob))
    return results

# Query terms, given as a token list like the preprocessed documents.
query = ["sun", "bright"]

# Score every document under both language models.
uni_scores = query_log_likelihood_unigram(query, preprocessed_documents)
bi_scores = query_log_likelihood_bigram(query, preprocessed_documents)

# Per-document report; document numbers are shown 1-based.
print("Unigram Log-Likelihoods:")
for doc_index, log_likelihood in uni_scores:
    print(f"Doc {doc_index + 1}: {log_likelihood:.4f}")

print("\nBigram Log-Likelihoods:")
for doc_index, log_likelihood in bi_scores:
    print(f"Doc {doc_index + 1}: {log_likelihood:.4f}")

# The winner under each model is the entry with the highest log-likelihood
# (the earliest document wins ties).
best_uni = max(uni_scores, key=lambda entry: entry[1])
best_bi = max(bi_scores, key=lambda entry: entry[1])

print(f"\nMost likely document (Unigram): Doc {best_uni[0] + 1}")
print(f"Most likely document (Bigram): Doc {best_bi[0] + 1}")
