In [1]:
import string
import re
import random
import time
from typing import List
import io
import keras
import collections

In [2]:
def get_data():
    """Fetch the Nietzsche corpus and return it as one normalised string.

    The file is obtained through ``keras.utils.get_file`` from the given
    origin URL and read as UTF-8. The text is then lower-cased, newlines
    become spaces, carriage returns are dropped, digits / ``=`` / ``_`` are
    removed, and spaces are inserted around ``" , ; : ( )`` and ``--`` so
    that a later ``str.split()`` yields them as separate tokens.

    Returns:
        str: the cleaned corpus text.
    """
    corpus_path = keras.utils.get_file(
        'nietzsche.txt',
        origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt',
    )

    with io.open(corpus_path, encoding='utf-8') as handle:
        text = handle.read().lower()

    text = text.replace('\n', ' ').replace('\r', '')

    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (r'[\d=_]', r''),                        # strip digits, '=' and '_'
        (r'(?=[",;:()])(?<=[^\s])', r' '),       # space before punctuation
        (r'(?<=[",;:()])(?=[^\s])', r' '),       # space after punctuation
        (r'(?<=[^\s])(--)', r' --'),             # space before a double dash
        (r'(--)(?=[^\s])', r'-- '),              # space after a double dash
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

# Load and normalise the corpus once at module level; the n-gram tables below are built from it.
data = get_data()

In [3]:
def create_ngram_info(n, data):
    """Build n-gram count, candidate and probability tables from ``data``.

    ``data`` is tokenised with ``str.split()``; every window of ``n``
    consecutive tokens is one n-gram. The first ``n - 1`` tokens of a gram
    are its *context* and the last token is the word that followed it.

    Args:
        n: size of the n-grams (must be >= 1).
        data: whitespace-separated corpus text.

    Returns:
        A 3-tuple ``(ngram_frequencies, candidates_for_contexts,
        ngram_probabilities)``:

        * ``ngram_frequencies`` maps each gram tuple to its occurrence count.
        * ``candidates_for_contexts`` maps each context tuple to the set of
          words observed right after it.
        * ``ngram_probabilities`` maps each gram tuple to the maximum
          likelihood estimate ``count(gram) / count(context)``, i.e. the
          probability of the last word given the context.

        All three are empty when ``data`` has fewer than ``n`` tokens.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    tokens = data.split()

    ngram_frequencies = {}        # gram tuple -> int
    context_frequencies = {}      # context tuple -> int
    candidates_for_contexts = {}  # context tuple -> set of following words

    # `end` is the exclusive end of the window, so the first and the last
    # n-gram of the corpus are both included.
    for end in range(n, len(tokens) + 1):
        gram = tuple(tokens[end - n:end])
        context, current = gram[:-1], gram[-1]

        ngram_frequencies[gram] = ngram_frequencies.get(gram, 0) + 1
        context_frequencies[context] = context_frequencies.get(context, 0) + 1
        # set.add() returns None, so mutate the stored set in place instead
        # of assigning the result of add().
        candidates_for_contexts.setdefault(context, set()).add(current)

    ngram_probabilities = {
        gram: count / float(context_frequencies[gram[:-1]])
        for gram, count in ngram_frequencies.items()
    }

    return ngram_frequencies, candidates_for_contexts, ngram_probabilities

# Build the trigram (n=3) tables as module-level globals; get_next_word reads
# candidates_for_contexts and ngram_probabilities from this scope.
ngram_frequencies, candidates_for_contexts, ngram_probabilities = create_ngram_info(3, data)

In [4]:
"""
Chooses the most likely next word
"""
def get_next_word(context):
    """Return the most likely word to follow ``context``.

    Looks up the words observed after ``context`` in the module-level
    ``candidates_for_contexts`` table and returns the one whose n-gram
    ``context + (word,)`` has the highest value in the module-level
    ``ngram_probabilities`` table (missing grams count as 0).

    Args:
        context: sequence of preceding words (list or tuple). It is not
            modified.

    Returns:
        str: the selected word, or ``'.'`` when the context has no known
        candidates.
    """
    key = tuple(context)

    # `or ()` tolerates both a missing context and a stored None/empty entry.
    # Sorting makes tie-breaking deterministic (sets have no stable order).
    candidates = sorted(candidates_for_contexts.get(key) or ())
    if not candidates:
        return '.'

    # Build the gram as a new tuple; calling context.append() here would
    # mutate the caller's list and evaluate to None.
    return max(
        candidates,
        key=lambda word: ngram_probabilities.get(key + (word,), 0),
    )
     



In [None]:
def generate_sentence(n=3, min_words=10, max_words=20, seed_words=None):
    """Generate a sentence word by word using ``get_next_word``.

    The sentence length is drawn uniformly from ``[min_words, max_words]``.
    Each new word is predicted from the previous ``n - 1`` words, which is
    the context length of an n-gram.

    Args:
        n: order of the n-gram model to generate with.
        min_words: minimum number of words in the sentence.
        max_words: maximum number of words in the sentence.
        seed_words: optional sequence of words to start the sentence with.
            When omitted, a random context of ``n - 1`` words is taken from
            the module-level ``candidates_for_contexts`` table so that the
            first lookup can succeed; if no such context exists the sentence
            starts empty.

    Returns:
        str: the generated words joined by single spaces. Seed words count
        towards the sentence length.
    """
    sentence_length = max(random.randint(min_words, max_words), 0)
    context_size = max(n - 1, 0)

    if seed_words is None:
        seeds = [c for c in candidates_for_contexts if len(c) == context_size]
        seed_words = random.choice(seeds) if seeds else ()

    new_sentence = list(seed_words)[:sentence_length]

    while len(new_sentence) < sentence_length:
        # Slicing yields a fresh list, so get_next_word can never alter the
        # sentence itself. A unigram model (context_size == 0) has no context.
        context = new_sentence[-context_size:] if context_size else []
        new_sentence.append(get_next_word(context))

    return ' '.join(new_sentence)

# Generate one sample sentence with the default settings (n=3, 10-20 words).
generate_sentence()

In [None]:
def main():
    """Build the n-gram model from the corpus and print one generated sentence.

    The model tables are stored in the module-level names
    ``ngram_frequencies``, ``candidates_for_contexts`` and
    ``ngram_probabilities`` because ``get_next_word`` reads them from module
    scope; binding them to locals here would leave generation unaffected by
    the model built in this function.
    """
    global ngram_frequencies, candidates_for_contexts, ngram_probabilities

    n = 3  # n-gram order, used for both building the model and generating

    data = get_data()
    ngram_frequencies, candidates_for_contexts, ngram_probabilities = create_ngram_info(n, data)

    ngram_prediction = generate_sentence(n=n)

    print("\nNGRAM generated: ", ngram_prediction)


# Entry point: call main() only when this module is run directly, not when it is imported.
if __name__ == "__main__":
    main()
