In [1]:
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
import pandas as pd

# Download the tokenizer data that word_tokenize depends on.
# Older NLTK releases load the pickled 'punkt' models, while NLTK >= 3.8.2 looks
# for the 'punkt_tab' resource instead; fetching both keeps this notebook
# runnable from a fresh kernel regardless of the installed NLTK version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)


True

In [2]:
def compute_ngrams(text):
    """
    Count the unigrams and bigrams that occur in ``text``.

    The text is lower-cased, split with ``word_tokenize`` and wrapped in a
    single pair of boundary markers: ``'<s>'`` before the first token and
    ``'</s>'`` after the last one. Both markers are counted like ordinary tokens.

    Args:
        text (str): The input text to analyze.

    Returns:
        tuple: ``(unigrams, bigrams)``. ``unigrams`` maps each token to its
        number of occurrences; ``bigrams`` maps each ``(token, next_token)``
        pair of adjacent tokens to its number of occurrences. Both are
        ``defaultdict(int)`` instances, so looking up an unseen key yields 0
        (and inserts that key).
    """
    words = word_tokenize(text.lower())
    sequence = ['<s>', *words, '</s>']

    unigrams = defaultdict(int)
    bigrams = defaultdict(int)

    for word in sequence:
        unigrams[word] += 1

    # Pair every token with its successor; zip stops at the shorter iterable,
    # so the final token never starts a bigram.
    for pair in zip(sequence, sequence[1:]):
        bigrams[pair] += 1

    return unigrams, bigrams

def compute_probabilities(unigrams, bigrams):
    """
    Turn raw n-gram counts into relative-frequency probabilities.

    A unigram's probability is its count divided by the sum of all unigram
    counts. A bigram's probability is its count divided by the count of its
    first token, i.e. the relative frequency of the second token given the first.

    Args:
        unigrams (dict): Dictionary of unigram counts, keyed by token.
        bigrams (dict): Dictionary of bigram counts, keyed by
            ``(token1, token2)`` tuples. Every ``token1`` is expected to be a
            key of ``unigrams`` with a non-zero count.

    Returns:
        tuple: ``(unigram_probs, bigram_probs)``, two plain dictionaries with
        the same keys as the inputs and float probabilities as values.
    """
    token_total = sum(unigrams.values())

    unigram_probs = {
        word: freq / token_total
        for word, freq in unigrams.items()
    }

    # Normalise each bigram by how often its first token was seen.
    bigram_probs = {
        (first, second): freq / unigrams[first]
        for (first, second), freq in bigrams.items()
    }

    return unigram_probs, bigram_probs


In [3]:
# Sample sentence analysed by the cells below.
text = "The quick brown fox jumps over the lazy dog."
# To analyse your own text instead, uncomment the next line; it prompts for
# input when this cell runs and replaces the sample sentence above.
# text = input("Enter text for analysis: ")


In [4]:
# Count the unigrams and bigrams in `text`, then convert those counts into
# probabilities for the tables displayed in the following cells.
unigrams, bigrams = compute_ngrams(text)
unigram_probs, bigram_probs = compute_probabilities(unigrams, bigrams)


In [5]:
# Tabulate the unigram probabilities, most probable token first.
unigram_rows = list(unigram_probs.items())
unigram_df = (
    pd.DataFrame(unigram_rows, columns=['Unigram', 'Probability'])
    .sort_values(by='Probability', ascending=False)
    .reset_index(drop=True)  # renumber rows 0..n-1 after sorting
)

print("Unigram Probabilities:")
display(unigram_df)


Unigram Probabilities:


Unnamed: 0,Unigram,Probability
0,the,0.166667
1,<s>,0.083333
2,quick,0.083333
3,brown,0.083333
4,fox,0.083333
5,jumps,0.083333
6,over,0.083333
7,lazy,0.083333
8,dog,0.083333
9,.,0.083333


In [6]:
# Tabulate the bigram probabilities, most probable pair first.
# Each (token1, token2) key is flattened into a single space-separated label.
bigram_items = []
for (token1, token2), prob in bigram_probs.items():
    bigram_items.append((f"{token1} {token2}", prob))

bigram_df = (
    pd.DataFrame(bigram_items, columns=['Bigram', 'Probability'])
    .sort_values(by='Probability', ascending=False)
    .reset_index(drop=True)  # renumber rows 0..n-1 after sorting
)

print("Bigram Probabilities:")
display(bigram_df)


Bigram Probabilities:


Unnamed: 0,Bigram,Probability
0,<s> the,1.0
1,quick brown,1.0
2,brown fox,1.0
3,fox jumps,1.0
4,jumps over,1.0
5,over the,1.0
6,lazy dog,1.0
7,dog .,1.0
8,. </s>,1.0
9,the quick,0.5
