In [None]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.tokenize import sent_tokenize

# Ensure the 'punkt' tokenizer data is available locally; presumably this is
# what sent_tokenize / word_tokenize rely on below. The call runs every time
# this cell/module is executed.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

def preprocess_text(text):
    """Return the lowercased word tokens of *text* as one flat list.

    The text is first split into sentences with ``sent_tokenize``; each
    sentence is lowercased and split with ``word_tokenize``, and the
    per-sentence tokens are concatenated in sentence order.
    """
    flat_tokens = []
    for sentence in sent_tokenize(text):
        # Lowercasing happens per sentence, after sentence splitting.
        flat_tokens.extend(word_tokenize(sentence.lower()))
    return flat_tokens

def compute_ngrams(tokens, n):
    """Return all n-grams of order *n* over *tokens* as a list.

    The n-grams are produced by ``nltk.util.ngrams`` and materialized so the
    result can be iterated more than once.
    """
    gram_iter = ngrams(tokens, n)
    return [*gram_iter]

def bigram_probabilities(tokens):
    """Estimate the conditional probability of each word given its predecessor.

    Args:
        tokens: Tokens in corpus order.

    Returns:
        A dict mapping each first word ``w1`` to a dict ``{w2: probability}``,
        where the probability is the number of times the bigram ``(w1, w2)``
        occurs divided by the number of bigrams that start with ``w1``.
        The result is empty when *tokens* yields no bigrams.
    """
    # Each (w1, w2) pair is consumed as (condition, sample), so the bigram
    # stream can be fed in directly without building an intermediate list.
    cfdist = ConditionalFreqDist(ngrams(tokens, 2))
    probabilities = {}
    for w1, followers in cfdist.items():
        total = followers.N()  # number of bigrams starting with w1
        probabilities[w1] = {
            w2: count / total for w2, count in followers.items()
        }
    return probabilities

def next_word_prediction(bigram_probs, word):
    """Rank the possible successors of *word*, most probable first.

    Args:
        bigram_probs: Mapping ``w1 -> {w2: probability}``.
        word: The word whose successors are wanted.

    Returns:
        A list of ``(next_word, probability)`` pairs sorted by descending
        probability, or an empty list when *word* is not a key of
        *bigram_probs*.
    """
    if word not in bigram_probs:
        return []
    candidates = bigram_probs[word].items()
    return sorted(candidates, key=lambda pair: pair[1], reverse=True)

# Small demonstration corpus used by the rest of this cell.
text = """
Natural language processing (NLP) is a subfield of artificial intelligence (AI) concerned with the interactions
between computers and human (natural) languages. As such, NLP is related to the area of human-computer interaction.
Many NLP tasks involve natural language understanding, which requires comprehension of the meaning and context of the
language. NLP tasks can also involve natural language generation, which involves creating coherent text based on some input.
"""

# Flatten the corpus into a single lowercased token stream.
tokens = preprocess_text(text)

# Build the n-gram lists of order 1, 2 and 3 from the same token stream.
unigrams, bigrams, trigrams = (compute_ngrams(tokens, n) for n in (1, 2, 3))

# Conditional next-word probabilities derived from the bigrams.
bigram_probs = bigram_probabilities(tokens)

# Report a frequency-distribution summary for each n-gram order.
for heading, grams in (
    ("Unigrams:", unigrams),
    ("\nBigrams:", bigrams),
    ("\nTrigrams:", trigrams),
):
    print(heading)
    print(FreqDist(grams))

# Dump the full conditional probability table, one first-word per line.
print("\nBigram Probabilities:")
for w1, probs in bigram_probs.items():
    print(f"{w1}: {probs}")

# Show the ranked successors of a sample word.
print("\nNext word prediction for 'natural':")
print(next_word_prediction(bigram_probs, 'natural'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unigrams:
<FreqDist with 51 samples and 80 outcomes>

Bigrams:
<FreqDist with 73 samples and 79 outcomes>

Trigrams:
<FreqDist with 77 samples and 78 outcomes>

Bigram Probabilities:
natural: {'language': 0.75, ')': 0.25}
language: {'processing': 0.25, 'understanding': 0.25, '.': 0.25, 'generation': 0.25}
processing: {'(': 1.0}
(: {'nlp': 0.3333333333333333, 'ai': 0.3333333333333333, 'natural': 0.3333333333333333}
nlp: {'tasks': 0.5, ')': 0.25, 'is': 0.25}
): {'is': 0.3333333333333333, 'concerned': 0.3333333333333333, 'languages': 0.3333333333333333}
is: {'a': 0.5, 'related': 0.5}
a: {'subfield': 1.0}
subfield: {'of': 1.0}
of: {'the': 0.5, 'artificial': 0.25, 'human-computer': 0.25}
artificial: {'intelligence': 1.0}
intelligence: {'(': 1.0}
ai: {')': 1.0}
concerned: {'with': 1.0}
with: {'the': 1.0}
the: {'interactions': 0.25, 'area': 0.25, 'meaning': 0.25, 'language': 0.25}
interactions: {'between': 1.0}
between: {'computers': 1.0}
computers: {'and': 1.0}
and: {'human': 0.5, 'context':