In [None]:
import re

sentences = [
    "The sun sets over the horizon, painting the sky with hues of orange and pink.",
    "In the heart of the forest, a gentle breeze rustles the leaves.",
    "She walked along the sandy shore, feeling the cool water on her feet.",
    "The old bookshop on the corner is filled with stories waiting to be discovered.",
    "As the rain falls outside, I sit by the window with a cup of hot tea.",
]

# Tokenise every sentence: lowercase it first, then keep only the runs of
# word characters so punctuation is discarded. The result is one list of
# tokens per sentence.
corpus = [
    re.findall(r'\b\w+\b', text.lower())
    for text in sentences
]
corpus

[['the',
  'sun',
  'sets',
  'over',
  'the',
  'horizon',
  'painting',
  'the',
  'sky',
  'with',
  'hues',
  'of',
  'orange',
  'and',
  'pink'],
 ['in',
  'the',
  'heart',
  'of',
  'the',
  'forest',
  'a',
  'gentle',
  'breeze',
  'rustles',
  'the',
  'leaves'],
 ['she',
  'walked',
  'along',
  'the',
  'sandy',
  'shore',
  'feeling',
  'the',
  'cool',
  'water',
  'on',
  'her',
  'feet'],
 ['the',
  'old',
  'bookshop',
  'on',
  'the',
  'corner',
  'is',
  'filled',
  'with',
  'stories',
  'waiting',
  'to',
  'be',
  'discovered'],
 ['as',
  'the',
  'rain',
  'falls',
  'outside',
  'i',
  'sit',
  'by',
  'the',
  'window',
  'with',
  'a',
  'cup',
  'of',
  'hot',
  'tea']]

In [None]:
from collections import defaultdict

def generate_ngrams(tokens, n):
    """Collect every contiguous run of ``n`` tokens from each sentence.

    Args:
        tokens: Iterable of tokenised sentences (each a list of words).
        n: Number of tokens per n-gram.

    Returns:
        A flat list of n-gram tuples, in sentence order and then in
        left-to-right order within each sentence. N-grams never span a
        sentence boundary; sentences shorter than ``n`` contribute nothing.
    """
    return [
        tuple(words[start:start + n])
        for words in tokens
        for start in range(len(words) - n + 1)
    ]

# Extract all adjacent word pairs (n = 2) from the tokenised corpus.
bigrams = generate_ngrams(tokens=corpus, n=2)

bigrams


[('the', 'sun'),
 ('sun', 'sets'),
 ('sets', 'over'),
 ('over', 'the'),
 ('the', 'horizon'),
 ('horizon', 'painting'),
 ('painting', 'the'),
 ('the', 'sky'),
 ('sky', 'with'),
 ('with', 'hues'),
 ('hues', 'of'),
 ('of', 'orange'),
 ('orange', 'and'),
 ('and', 'pink'),
 ('in', 'the'),
 ('the', 'heart'),
 ('heart', 'of'),
 ('of', 'the'),
 ('the', 'forest'),
 ('forest', 'a'),
 ('a', 'gentle'),
 ('gentle', 'breeze'),
 ('breeze', 'rustles'),
 ('rustles', 'the'),
 ('the', 'leaves'),
 ('she', 'walked'),
 ('walked', 'along'),
 ('along', 'the'),
 ('the', 'sandy'),
 ('sandy', 'shore'),
 ('shore', 'feeling'),
 ('feeling', 'the'),
 ('the', 'cool'),
 ('cool', 'water'),
 ('water', 'on'),
 ('on', 'her'),
 ('her', 'feet'),
 ('the', 'old'),
 ('old', 'bookshop'),
 ('bookshop', 'on'),
 ('on', 'the'),
 ('the', 'corner'),
 ('corner', 'is'),
 ('is', 'filled'),
 ('filled', 'with'),
 ('with', 'stories'),
 ('stories', 'waiting'),
 ('waiting', 'to'),
 ('to', 'be'),
 ('be', 'discovered'),
 ('as', 'the'),
 ('the'

In [None]:
def build_ngram_frequency(tokens, n):
    """Count, for every (n-1)-word context, which words follow it and how often.

    Args:
        tokens: Iterable of tokenised sentences (each a list of words).
        n: Order of the model (2 for bigrams, 3 for trigrams, ...). With
            ``n == 1`` every word is counted under the empty context ``()``.

    Returns:
        A nested ``defaultdict`` mapping ``context_tuple -> {next_word: count}``,
        where ``context_tuple`` holds the ``n - 1`` words preceding
        ``next_word``. Windows never span a sentence boundary; sentences
        shorter than ``n`` contribute nothing.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    ngram_freq = defaultdict(lambda: defaultdict(int))

    for sentence in tokens:
        # A window of n tokens can start at any index from 0 up to and
        # including len(sentence) - n, hence the "+ 1". Without it the final
        # n-gram of every sentence would be dropped.
        for i in range(len(sentence) - n + 1):
            context = tuple(sentence[i:i + n - 1])
            next_word = sentence[i + n - 1]
            ngram_freq[context][next_word] += 1

    return ngram_freq

# Build the bigram model (n = 2): each key is a one-word context tuple and
# each value maps the words seen after it to their counts.
bigram_freq = build_ngram_frequency(tokens=corpus, n=2)

bigram_freq


defaultdict(<function __main__.build_ngram_frequency.<locals>.<lambda>()>,
            {('the',): defaultdict(int,
                         {'sun': 1,
                          'horizon': 1,
                          'sky': 1,
                          'heart': 1,
                          'forest': 1,
                          'sandy': 1,
                          'cool': 1,
                          'old': 1,
                          'corner': 1,
                          'rain': 1,
                          'window': 1}),
             ('sun',): defaultdict(int, {'sets': 1}),
             ('sets',): defaultdict(int, {'over': 1}),
             ('over',): defaultdict(int, {'the': 1}),
             ('horizon',): defaultdict(int, {'painting': 1}),
             ('painting',): defaultdict(int, {'the': 1}),
             ('sky',): defaultdict(int, {'with': 1}),
             ('with',): defaultdict(int, {'hues': 1, 'stories': 1, 'a': 1}),
             ('hues',): defaultdict(int, {'of': 1}),
 

In [None]:
import random

def predict_next(ngram_freq, current_ngram):
    """Sample the next word for a context from an n-gram frequency table.

    Args:
        ngram_freq: Mapping ``context_tuple -> {next_word: count}``.
        current_ngram: Tuple of context words to look up.

    Returns:
        A word drawn at random with probability proportional to how often it
        followed ``current_ngram``. If the context is unknown (or has no
        recorded followers), a word is drawn uniformly from the vocabulary
        contained in ``ngram_freq`` itself.

    Raises:
        IndexError: If the context is unknown and ``ngram_freq`` holds no
            words at all, so there is nothing to fall back to.
    """
    # .get() rather than indexing: indexing a defaultdict would insert an
    # empty entry for every unseen context that is queried.
    next_word_counts = ngram_freq.get(current_ngram, {})

    if not next_word_counts:
        # Back off to the model's own vocabulary instead of a notebook-level
        # global, so the function works for any table passed in.
        vocabulary = set()
        for context, followers in ngram_freq.items():
            vocabulary.update(context)
            vocabulary.update(followers)
        # Sorting fixes the candidate order; set iteration order for strings
        # varies between interpreter runs, which would defeat random.seed().
        return random.choice(sorted(vocabulary))

    candidates = list(next_word_counts)
    weights = list(next_word_counts.values())
    return random.choices(candidates, weights=weights, k=1)[0]

def generate_text(ngram_freq, start_ngram, n, num_words):
    """Extend a seed phrase by sampling words from an n-gram model.

    Args:
        ngram_freq: Mapping ``context_tuple -> {next_word: count}`` whose
            keys hold ``n - 1`` words.
        start_ngram: Sequence of seed words. It may be longer than the
            model's context; only its last ``n - 1`` words are used for the
            first lookup, but all of them appear in the returned text.
        n: Order of the model that produced ``ngram_freq``.
        num_words: Number of words to append to the seed.

    Returns:
        The seed words followed by ``num_words`` generated words, joined by
        single spaces.
    """
    generated_text = list(start_ngram)
    context_len = n - 1

    for _ in range(num_words):
        # Always look up exactly the last n-1 words, including on the first
        # step, so a seed longer than the context still matches the table's
        # keys. A unigram model (context_len == 0) needs the empty tuple
        # explicitly, because the slice [-0:] would return the whole list.
        if context_len > 0:
            current_ngram = tuple(generated_text[-context_len:])
        else:
            current_ngram = ()
        generated_text.append(predict_next(ngram_freq, current_ngram))

    return ' '.join(generated_text)

# Extend each seed phrase by two words using the bigram table (n = 2) and
# print the resulting text, one line per seed.
for start_bigram in (("the", "rain"), ("the", "old", "bookshop")):
    generated_sentence = generate_text(bigram_freq, start_bigram, 2, 2)

    print(generated_sentence)


the rain gentle breeze
the old bookshop the corner
