In [1]:
import nltk
from nltk import bigrams, trigrams
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import reuters

# Fetch the NLTK resources this script requests, in the same order as before.
# NOTE(review): nothing in the visible code reads the reuters corpus —
# confirm whether that download is actually needed.
for _resource in ('punkt', 'reuters'):
    nltk.download(_resource)

# Toy corpus for the demo; replace with any other text to experiment.
text = "The quick brown fox jumps over the lazy dog. The quick brown fox is very quick."

# Lower-case before tokenizing so "The" and "the" become the same token.
_lowered_text = text.lower()
tokens = word_tokenize(_lowered_text)

# Unigrams: the distinct tokens, kept in order of first appearance.
unigrams = list(dict.fromkeys(tokens))
print("Unigrams:", unigrams, sep="\n")

# Bigrams: every pair of adjacent tokens.
bi_grams = [*bigrams(tokens)]
print("\nBigrams:", bi_grams, sep="\n")

# Trigrams: every run of three adjacent tokens.
tri_grams = [*trigrams(tokens)]
print("\nTrigrams:", tri_grams, sep="\n")

# Bigram Frequencies: how many times each adjacent pair occurs.
bigram_freq = FreqDist(bi_grams)
print("\nBigram Frequencies:", bigram_freq.most_common(), sep="\n")

# Bigram Conditional Frequencies: for each first word, the counts of the
# words that follow it.
cfd = ConditionalFreqDist(bi_grams)
print("\nBigram Conditional Frequencies:")
for word in cfd:
    followers = dict(cfd[word])
    print(f"{word}: {followers}")

# Bigram Probabilities: each pair's share of all bigram occurrences
# (count of the pair divided by the total number of bigrams).
total_bigrams = len(bi_grams)
bigram_probs = {
    pair: occurrences / total_bigrams
    for pair, occurrences in bigram_freq.items()
}
print("\nBigram Probabilities:", bigram_probs, sep="\n")

# Next Word Prediction
def predict_next_word(word, cond_freq=None):
    """Return the most frequent successor of ``word`` in a bigram model.

    Args:
        word: Token whose most likely following token is wanted.
        cond_freq: Optional mapping from a token to a counter of the tokens
            that follow it (any mapping whose values provide
            ``most_common``). Defaults to the module-level ``cfd``.

    Returns:
        The follower with the highest count, or the string
        "No prediction available" when ``word`` is not in the model or has
        no recorded followers.
    """
    if cond_freq is None:
        cond_freq = cfd
    if word not in cond_freq:
        return "No prediction available"

    top = cond_freq[word].most_common(1)
    # An entry may exist with no counts at all (a defaultdict-style table
    # gains an empty entry as soon as an unseen word is indexed), so guard
    # against an empty result instead of raising IndexError.
    if not top:
        return "No prediction available"
    return top[0][0]

# Demo: ask the bigram model which word most often follows a sample word.
word_to_predict = "quick"
predicted_word = predict_next_word(word_to_predict)
print(
    f"\nNext word prediction for '{word_to_predict}':",
    predicted_word,
    sep="\n",
)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package reuters to /root/nltk_data...


Unigrams:
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog', '.', 'is', 'very']

Bigrams:
[('the', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps'), ('jumps', 'over'), ('over', 'the'), ('the', 'lazy'), ('lazy', 'dog'), ('dog', '.'), ('.', 'the'), ('the', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'is'), ('is', 'very'), ('very', 'quick'), ('quick', '.')]

Trigrams:
[('the', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'jumps'), ('fox', 'jumps', 'over'), ('jumps', 'over', 'the'), ('over', 'the', 'lazy'), ('the', 'lazy', 'dog'), ('lazy', 'dog', '.'), ('dog', '.', 'the'), ('.', 'the', 'quick'), ('the', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'is'), ('fox', 'is', 'very'), ('is', 'very', 'quick'), ('very', 'quick', '.')]

Bigram Frequencies:
[(('the', 'quick'), 2), (('quick', 'brown'), 2), (('brown', 'fox'), 2), (('fox', 'jumps'), 1), (('jumps', 'over'), 1), (('over', 'the'), 1), (('the', 'lazy'), 1), (('lazy