### Todo

1. Get text from a file
2. Get all possible bigrams
3. Apply smoothing (add-one, add-delta) to each bigram
4. Take a random bigram as input
5. Print the probability of the given bigram using all three methods

- add-one ----> Laplace
- add-delta ----> Lidstone, Jeffreys

Input text

`This is a sample text for testing the bigram probability calculation using smoothing techniques. The quick brown fox jumps over the lazy dog. The lazy dog barks loudly. Testing is essential for evaluating the performance of natural language processing models.`

In [5]:
from collections import defaultdict
import math

# Step 1: Get text from a file
def read_text_from_file(filename):
    """Return the full contents of a UTF-8 text file as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the whole file.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        return handle.read()

# Step 2: Get all possible bigrams
def get_bigrams(text):
    """Return every pair of adjacent whitespace-separated tokens in *text*.

    Tokens are produced by ``str.split()`` with no further normalisation,
    so case and attached punctuation are kept as they appear.

    Args:
        text: The input string.

    Returns:
        A list of ``(token, next_token)`` tuples in order of appearance;
        empty when *text* has fewer than two tokens.
    """
    tokens = text.split()
    # Pair each token with its successor; zip stops at the shorter slice.
    return list(zip(tokens, tokens[1:]))

# Step 3: Apply Laplace (add-one) smoothing
def laplace_smoothing(bigrams, vocabulary):
    """Build a Laplace (add-one) smoothed bigram probability function.

    Args:
        bigrams: Iterable of hashable, indexable pairs (for example
            ``(w1, w2)`` tuples) observed in the corpus.
        vocabulary: Vocabulary size added to the denominator.

    Returns:
        A function mapping a bigram to
        ``(count(bigram) + 1) / (count(first word as context) + vocabulary)``.
        It raises ``ZeroDivisionError`` when the context is unseen and
        *vocabulary* is 0.
    """
    pair_counts = defaultdict(int)
    context_counts = defaultdict(int)

    for pair in bigrams:
        pair_counts[pair] += 1
        # Counts the word as a *context*, i.e. as the first element of a pair.
        context_counts[pair[0]] += 1

    def laplace_probability(bigram):
        """Return the add-one smoothed probability of *bigram*."""
        # Use .get() rather than indexing: indexing a defaultdict inserts a
        # zero entry for every unseen key, so each query would grow the tables.
        seen = pair_counts.get(bigram, 0)
        context_total = context_counts.get(bigram[0], 0)
        return (seen + 1) / (context_total + vocabulary)

    return laplace_probability

# Step 4: Apply Lidstone (add-delta) smoothing; delta = 0.5 gives Jeffreys smoothing
def lidstone_smoothing(bigrams, vocabulary, delta):
    """Build a Lidstone (add-delta) smoothed bigram probability function.

    With ``delta == 1`` the formula is the same as add-one (Laplace)
    smoothing.

    Args:
        bigrams: Iterable of hashable, indexable pairs (for example
            ``(w1, w2)`` tuples) observed in the corpus.
        vocabulary: Vocabulary size; scaled by *delta* in the denominator.
        delta: Pseudo-count added to every bigram count.

    Returns:
        A function mapping a bigram to
        ``(count(bigram) + delta) / (count(first word as context) + vocabulary * delta)``.
        It raises ``ZeroDivisionError`` when that denominator is 0.
    """
    pair_counts = defaultdict(int)
    context_counts = defaultdict(int)

    for pair in bigrams:
        pair_counts[pair] += 1
        # Counts the word as a *context*, i.e. as the first element of a pair.
        context_counts[pair[0]] += 1

    # Same for every query, so compute the added denominator mass once.
    smoothing_mass = vocabulary * delta

    def lidstone_probability(bigram):
        """Return the add-delta smoothed probability of *bigram*."""
        # Use .get() rather than indexing: indexing a defaultdict inserts a
        # zero entry for every unseen key, so each query would grow the tables.
        numerator = pair_counts.get(bigram, 0) + delta
        denominator = context_counts.get(bigram[0], 0) + smoothing_mass
        return numerator / denominator

    return lidstone_probability

# Step 5: Given a random bigram as input, print the probabilities using both smoothing methods
def main():
    """Print the smoothed probabilities of one sample bigram.

    Reads the corpus from ``lab6-file.txt``, builds the Laplace and Lidstone
    estimators from its bigrams, and prints both probabilities for a
    hard-coded query bigram.
    """
    corpus = read_text_from_file('lab6-file.txt')  # Replace with the actual file path
    query = ('input', 'bigram')  # Replace with your random bigram
    smoothing_delta = 0.5  # Adjust this value for Lidstone smoothing

    # Vocabulary size = number of distinct whitespace-separated tokens.
    vocab_size = len(set(corpus.split()))
    pairs = get_bigrams(corpus)

    estimate_laplace = laplace_smoothing(pairs, vocab_size)
    estimate_lidstone = lidstone_smoothing(pairs, vocab_size, smoothing_delta)

    laplace_value = estimate_laplace(query)
    print(f"Probability using Laplace (add-one) smoothing: {laplace_value}")
    lidstone_value = estimate_lidstone(query)
    print(f"Probability using Lidstone (add-delta) with Jeffrey's smoothing: {lidstone_value}")

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


Probability using Laplace (add-one) smoothing: 0.029411764705882353
Probability using Lidstone (add-delta) with Jeffrey's smoothing: 0.029411764705882353
