In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
np.random.seed(3)                    # same random number generation

In [3]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt

--2023-05-07 15:55:23--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26622 (26K) [text/plain]
Saving to: ‘edgar_allan_poe.txt’


2023-05-07 15:55:23 (2.81 MB/s) - ‘edgar_allan_poe.txt’ saved [26622/26622]



In [4]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

--2023-05-07 15:55:27--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56286 (55K) [text/plain]
Saving to: ‘robert_frost.txt’


2023-05-07 15:55:28 (4.04 MB/s) - ‘robert_frost.txt’ saved [56286/56286]



In [5]:
def get_clean_lines(file_name):
  cleaned_lines_list =[]

  with open(file_name) as txts:
    f_lines_list = txts.readlines()
  for line in f_lines_list:
    line = line.strip().lower()
    if line:
      if line[0] != '"' and line[0].isnumeric() == False:
        cleaned_lines_list.append(re.sub('\n|,|\u2009\n|!|-|;|(|)', '', line))
  
  return cleaned_lines_list
  
files_list = ['bday_poems.txt', 'edgar_allan_poe.txt', 'robert_frost.txt']
for file in files_list:
  print(get_clean_lines(file))

["a father's love is a force to behold", "a strength that weathers life's toughest cold.", 'it is a light that guides us through the night', 'a bond that endures all wrongs made right.', 'it is the sun that warms our soul', 'a love that makes us feel whole.', 'it is the wind that whispers in our ear', 'a voice that we long to hear.', 'the measure of a father is not in wealth or fame', "but in the love he gives and the lives he's changed.", "it's in the moments shared the tears shed", 'the lessons learned the laughter spread.', "it's in the way he loves his family true", 'and in the sacrifices he makes to see them through.', "it's in the values he instills the wisdom he imparts", 'and in the love that always fills our hearts.', 'the heart of a father is a thing of wonder', 'a place where love and strength never sunder.', 'it is a well that never runs dry', 'a spring that quenches thirst under the sky.', 'it is a beacon that guides us through the storm', 'a shelter that keeps us safe and

In [20]:
# need for word to idx list is not there for a langauge generator because we just need the end porbabilities of words themselves

def wordtoidx(lines_list):
  word_to_id = {}
  id = 1
  for line in lines_list:
    last_idx = None
    word_list = line.split()
    for word in word_list:
      if word not in word_to_id:
        word_to_id[word] = id
      id += 1
  return word_to_id

first_model_dict = wordtoidx(get_clean_lines(files_list[2]))
first_model_dict['love']            # testing purposes to see if function working

3235

In [6]:
def add_to_dict(dict_name, key, value):
  if key not in dict_name:
    dict_name[key] = []
  dict_name[key].append(value)

In [7]:
initial_storage_dict ={}
first_order_dict = {}
second_order_dict = {}

In [32]:
len(get_clean_lines(files_list[2])[1]), get_clean_lines(files_list[2])[1], get_clean_lines(files_list[2])[1].split()

(33,
 'and sorry i could not travel both',
 ['and', 'sorry', 'i', 'could', 'not', 'travel', 'both'])

In [8]:
entire_list = get_clean_lines(files_list[0])

# entire list contains all the lines in the text file
# we iterate through this entire list line by line
# we use the length of line to iterate instead of just directly iterating through the words because we need to access the previous two words as well

for line in entire_list:
  actual_line = line.split()           
  words_list_length = len(actual_line)
  for i in range(words_list_length):
    current_token = actual_line[i]            # current word in the line      
    if i == 0:                  # if first word in line then append to initial dictionary and keep incrementing its count(number of times it appears is increased continuously)
      initial_storage_dict[current_token] = initial_storage_dict.get(current_token, 0.) + 1
    else:
      previous_token = actual_line[i-1]       # previous token for appending into first_order_dict if its not the end word or second word in the sequence
      if i == words_list_length-1:
        add_to_dict(second_order_dict, (previous_token, current_token), 'END')     # appending end signifies end of line
      if i == 1:
        add_to_dict(first_order_dict, previous_token, current_token)
      else:
        previous_previous_token = actual_line[i-2]
        add_to_dict(second_order_dict, (previous_previous_token, previous_token), current_token)      # using function to append previous two words' values into the dictionaries, we have the probabilities of the required word occurring

In [9]:
# instead of storing only counts, convert counts of words into probabilities now - normalisation
initial_sum = sum(initial_storage_dict.values())
for term, count in initial_storage_dict.items():
  initial_storage_dict[term] /= initial_sum

In [10]:
# the words in initial storage dict are stored as words, we need to convert them into probabilities

def dict_list_value_conversion(d1):
  dict_length = len(d1)
  prob_conversion_dict = {}
  for token in d1:
    prob_conversion_dict[token] = prob_conversion_dict.get(token, 0.) + 1      # getting token and setting its count
  for token, count in prob_conversion_dict.items():
    prob_conversion_dict[token] /= dict_length              # setting the probability by dividing by length
  return prob_conversion_dict

In [11]:
for previous_word, words in first_order_dict.items():
  first_order_dict[previous_word] = dict_list_value_conversion(words)

In [12]:
for previous_previous_word, words_list in second_order_dict.items():
  second_order_dict[previous_previous_word] = dict_list_value_conversion(words_list)

In [13]:
def random_word_from_range(d1):
  random_word_prob = np.random.random()        # sample word probability generation
  cumulative_probability = 0
  for token, probability in d1.items():
    cumulative_probability += probability        # keep adding probabilities to find out cumulatives
    if random_word_prob < cumulative_probability:      # if randomly generated probability is in a range within cumulative, generate that word with  probability = cumulative_probability
      return token
  assert(False)        # error statement

In [16]:
def generator():
  for i in range(5):
    sentence = []
    first_word = random_word_from_range(initial_storage_dict)
    sentence.append(first_word)

    second_word = random_word_from_range(first_order_dict[first_word])
    sentence.append(second_word)

    while True:
      next_word = random_word_from_range(second_order_dict[(first_word, second_word)])
      if next_word == 'END':
        break
      sentence.append(next_word)

      first_word = second_word
      second_word = next_word
    print(' '.join(sentence))

In [42]:
generator()

and fills my heart with sweet rapport.
blood was its avatar and its seal—the redness and the bad and evil out of all.
yours is the wind that whispers in our ear
a source of strength that we can't hide.
whose love is a well that never fades or goes too far.
