In [2]:
import nltk
from nltk import bigrams, trigrams
from nltk.util import ngrams

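# One-time setup: word_tokenize needs NLTK's Punkt tokenizer data.
# Uncomment the line below on first run (recent NLTK releases look for 'punkt_tab' instead).
# nltk.download('punkt')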

# Two-sentence toy corpus. The explicit "\n" (preceded by a space) keeps the
# string identical to a triple-quoted literal broken after the first sentence.
text = (
    "Natural Language Processing is fun and interesting. \n"
    "Language models help in predicting the next word in a sentence."
)

# Lower-case first so that "Language" and "language" become the same token,
# then split into word/punctuation tokens.
words = nltk.word_tokenize(text.lower())
print(words)


['natural', 'language', 'processing', 'is', 'fun', 'and', 'interesting', '.', 'language', 'models', 'help', 'in', 'predicting', 'the', 'next', 'word', 'in', 'a', 'sentence', '.']


In [None]:
# Bigram model: map each token to the list of tokens seen immediately after it,
# in corpus order (duplicates are kept, so repeated followers appear repeatedly).
# NOTE: this assignment rebinds `bigrams`, hiding the helper of the same name
# imported from nltk at the top of the notebook.
bigrams = {}
for current, following in zip(words, words[1:]):
    # setdefault creates the empty follower list the first time `current`
    # shows up as a context word, then we record the follower.
    bigrams.setdefault(current, []).append(following)

print("Bigram Model:", bigrams)


Bigram Model: {'natural': ['language'], 'language': ['processing', 'models'], 'processing': ['is'], 'is': ['fun'], 'fun': ['and'], 'and': ['interesting'], 'interesting': ['.'], '.': ['language'], 'models': ['help'], 'help': ['in'], 'in': ['predicting', 'a'], 'predicting': ['the'], 'the': ['next'], 'next': ['word'], 'word': ['in'], 'a': ['sentence'], 'sentence': ['.']}


In [5]:
def predict_bigram(word):
    """Guess the word that follows `word` using the global `bigrams` table.

    Args:
        word: Context token to look up.

    Returns:
        The first follower recorded for `word` (a deliberate simplification:
        no frequency ranking is done), or the string "No prediction" when
        `word` is not a key of `bigrams`.
    """
    if word not in bigrams:
        return "No prediction"
    followers = bigrams[word]
    return followers[0]

# Demo: query the bigram model with a couple of context words.
for query in ("is", "language"):
    print(f"Next word after '{query}':", predict_bigram(query))


Next word after 'is': fun
Next word after 'language': processing


In [6]:
# Trigram model: map each pair of consecutive tokens to the list of tokens
# seen right after that pair, in corpus order.
# NOTE: this assignment rebinds `trigrams`, hiding the helper of the same name
# imported from nltk at the top of the notebook.
trigrams = {}
for first, second, follower in zip(words, words[1:], words[2:]):
    # The two-token context is the key; create its list on first sight.
    trigrams.setdefault((first, second), []).append(follower)

print("Trigram Model:", trigrams)


Trigram Model: {('natural', 'language'): ['processing'], ('language', 'processing'): ['is'], ('processing', 'is'): ['fun'], ('is', 'fun'): ['and'], ('fun', 'and'): ['interesting'], ('and', 'interesting'): ['.'], ('interesting', '.'): ['language'], ('.', 'language'): ['models'], ('language', 'models'): ['help'], ('models', 'help'): ['in'], ('help', 'in'): ['predicting'], ('in', 'predicting'): ['the'], ('predicting', 'the'): ['next'], ('the', 'next'): ['word'], ('next', 'word'): ['in'], ('word', 'in'): ['a'], ('in', 'a'): ['sentence'], ('a', 'sentence'): ['.']}


In [8]:
def predict_trigram(w1, w2):
    """Guess the word that follows the pair (`w1`, `w2`) using the global `trigrams` table.

    Args:
        w1: First token of the two-token context.
        w2: Second token of the two-token context.

    Returns:
        The first follower recorded for the pair, or the string
        "No prediction" when the pair is not a key of `trigrams`.
    """
    context = (w1, w2)
    return trigrams[context][0] if context in trigrams else "No prediction"

# Demo: query the trigram model with two-word contexts. The tuple's repr
# supplies the "('a', 'b')" part of the printed label.
for context in [('language', 'processing'), ('next', 'word')]:
    print(f"Next word after {context}:", predict_trigram(*context))


Next word after ('language', 'processing'): is
Next word after ('next', 'word'): in
