# TASK 0

In [10]:
import nltk
from nltk.corpus import gutenberg
from nltk import FreqDist, ngrams
from collections import defaultdict

nltk.download('gutenberg')
nltk.download('punkt')

class NGramLanguageModel:
    def __init__(self, corpus, n=3):
        """
        Ініціалізує та навчає N-грамну модель.
        :param corpus: Список речень (списки токенів).
        :param n: Порядок N-грам (напр., 2 для біграм, 3 для триграм).
        """
        self.n = n
        self.vocabulary = set()
        self.ngram_counts = defaultdict(FreqDist)
        self.context_counts = defaultdict(int)
        
        print(f"Building a {n}-gram model...")
        
        for sentence in corpus:
            sentence_ngrams = ngrams(sentence, n, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')
            
            for ngram in sentence_ngrams:
                self.vocabulary.add(ngram[-1])
                
                context = ngram[:-1]
                target = ngram[-1]
                
                self.ngram_counts[context][target] += 1
                self.context_counts[context] += 1

    def get_mle_prob(self, context, target):
        """
        Обчислює ймовірність за методом максимальної правдоподібності (MLE).
        P(target | context) = C(context, target) / C(context) 
        """
        count = self.ngram_counts[context][target]
        total_context_count = self.context_counts[context]
        
        if total_context_count == 0:
            return 0.0
        return count / total_context_count

print("Preparing corpus...")
sents = gutenberg.sents('austen-emma.txt')
processed_corpus = [
    ['<s>'] * 2 + [word.lower() for word in sent] + ['</s>']
    for sent in sents
]

trigram_model = NGramLanguageModel(processed_corpus, n=3)
print(f"Model built. Vocabulary size: {len(trigram_model.vocabulary)}")
print("Example probability P('is' | ('what', 'is')): ", trigram_model.get_mle_prob(('what', 'is'), 'is'))

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Preparing corpus...
Building a 3-gram model...
Model built. Vocabulary size: 7341
Example probability P('is' | ('what', 'is')):  0.0


# TASK 1

In [None]:
import math

split_ratio = int(len(processed_corpus) * 0.8)
train_data = processed_corpus[:split_ratio]
test_data = processed_corpus[split_ratio:]

bigram_model_base = NGramLanguageModel(train_data, n=2)
trigram_model_base = NGramLanguageModel(train_data, n=3)

def calculate_perplexity(model, test_sentences):
    """Обчислює перплексію моделі на тестових даних."""
    log_prob_sum = 0
    word_count = 0

    for sentence in test_sentences:
        sentence_ngrams = ngrams(sentence, model.n, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')
        
        for ngram in sentence_ngrams:
            context, target = ngram[:-1], ngram[-1]
            prob = model.get_smoothed_prob(context, target) 
            
            if prob > 0:
                log_prob_sum += math.log(prob)
            word_count += 1
            
    perplexity = math.exp(-log_prob_sum / word_count)
    return perplexity

Building a 2-gram model...
Building a 3-gram model...


# TASK 2

In [None]:
class NGramLanguageModel:
    def __init__(self, corpus, n=3, stupid_backoff_lambda=0.4):
        self.n = n
        self.stupid_backoff_lambda = stupid_backoff_lambda
        self.vocabulary = set()
        self.counts = [defaultdict(FreqDist) for _ in range(n)]
        self.context_counts = [defaultdict(int) for _ in range(n)]
        self.total_words = 0

        print(f"Building a {n}-gram model with backoff...")
        
        for sentence in corpus:
            self.total_words += len(sentence) - (n - 1) 
            for i in range(1, n + 1):
                sentence_ngrams = ngrams(sentence, i, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')
                for ngram in sentence_ngrams:
                    if i == 1: self.vocabulary.add(ngram[0])
                    context, target = ngram[:-1], ngram[-1]
                    self.counts[i-1][context][target] += 1
                    self.context_counts[i-1][context] += 1

    def get_smoothed_prob(self, context, target, current_n=None):
        """Рекурсивно обчислює ймовірність з використанням Stupid Backoff."""
        if current_n is None:
            current_n = self.n
        
        if current_n == 1:
            return (self.counts[0][()][target] + 1) / (self.total_words + len(self.vocabulary))

        context = context[-(current_n-1):] 
        count = self.counts[current_n-1][context][target]
        total_context_count = self.context_counts[current_n-1][context]
        
        if count > 0 and total_context_count > 0:
            return count / total_context_count
        else:
            return self.stupid_backoff_lambda * self.get_smoothed_prob(context, target, current_n - 1)


bigram_model = NGramLanguageModel(train_data, n=2)
trigram_model = NGramLanguageModel(train_data, n=3)

ppl_bigram = calculate_perplexity(bigram_model, test_data)
ppl_trigram = calculate_perplexity(trigram_model, test_data)

print("\n--- Model Comparison ---")
print(f"Bigram Model Perplexity: {ppl_bigram:.2f}")
print(f"Trigram Model Perplexity: {ppl_trigram:.2f}")


Building a 2-gram model with backoff...
Building a 3-gram model with backoff...

--- Model Comparison ---
Bigram Model Perplexity: 84.36
Trigram Model Perplexity: 96.51


# TASK 3

In [None]:
import random

def generate_sentence(model, prompt, max_length=20):
    """
    Генерує продовження речення для заданого промпту.
    """
    sentence = prompt.lower().split()
    context = tuple(['<s>'] * (model.n - 1 - len(sentence)) + sentence)
    
    for _ in range(max_length):
        candidates = model.vocabulary
        probabilities = [model.get_smoothed_prob(context, candidate) for candidate in candidates]
        
        next_word = random.choices(list(candidates), weights=probabilities, k=1)[0]
        
        if next_word == '</s>':
            break
        
        sentence.append(next_word)
        context = tuple(sentence[-(model.n - 1):])

    return ' '.join(sentence)

print("\n--- Sentence Generation (Trigram Model) ---")
prompt1 = "she was"
generated_text1 = generate_sentence(trigram_model, prompt1)
print(f"Prompt: '{prompt1}' -> '{generated_text1}'")

prompt2 = "he said"
generated_text2 = generate_sentence(trigram_model, prompt2)
print(f"Prompt: '{prompt2}' -> '{generated_text2}'")


--- Sentence Generation (Trigram Model) ---
Prompt: 'she was' -> 'she was expecting him every moment .'
Prompt: 'he said' -> 'he said he , papa ?'
