In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import re
from typing import List
from collections import defaultdict

In [2]:
training_text_filename = "paul_graham.txt"
training_raw_text = open(
    training_text_filename,
    'r',
    encoding='utf-8')\
    .read()\
    .lower()
period_freqs = [len(x.split(" ")) for x in training_raw_text.split(".")]
comma_freqs = [len(x.split(" ")) for x in training_raw_text.split(",")]
# Alpha-numeric and spaces only
# We will re-add punctuation later via normal distributions
training_raw_text = re.sub("[^a-z0-9'\"\s]", "", training_raw_text)
# Remove duplicate sequential space characters
training_raw_text = re.sub("\s+", " ", training_raw_text)

In [3]:
def tokenize_string(input_text):
    tokens = []
    stop_words = set(open("stop_words.txt", "r").read().split('\n'))
    pre_tokenized = input_text.split(" ")
    # Prevents the next chunk from breaking when a stop word ends the data
    while pre_tokenized[-1] in stop_words:
        pre_tokenized = pre_tokenized[:-1]
    for i in range(len(pre_tokenized)):
        # Avoids duplication of words when grouping stop words with subsequent ones
        if i != 0 and pre_tokenized[i-1] in stop_words:
            continue
        # Base case, no grouping needed
        if pre_tokenized[i] not in stop_words:
            tokens.append(pre_tokenized[i])
            continue
        # Group stop words with subsequent ones until the next word is no longer a stop word
        j = i
        while pre_tokenized[j] in stop_words:
            j += 1
        tokens.append(" ".join(pre_tokenized[i:j+1]))
    return tokens
tokens = tokenize_string(training_raw_text)
print(tokens[:20])
len(set(tokens))

['there', 'are', 'two', 'distinct', 'ways', 'to be', 'politically', 'moderate', 'on purpose', 'and', 'by accident', 'intentional', 'moderates', 'are', 'trimmers', 'deliberately', 'choosing', 'a position', 'midway', 'between the extremes']


4934

In [4]:
markov_model = defaultdict(dict)
# Must be sufficiently small enough to prevent over-fitting and large enough to prevent under-fitting
# If you see constantly repeated phrases, this value needs to be changed
input_window = 2
inputs, outputs = [], []
for i in range(len(tokens) - input_window):
    # Using tuple since they are not mutable
    context = tuple(tokens[i:i+input_window])
    target = tokens[i+input_window]
    if target not in markov_model[context]:
        markov_model[context][target] = 0
    markov_model[context][target] += 1
first_key = list(markov_model.keys())[0]
print(f"{' '.join(first_key)} -> {markov_model[first_key]}")

there are -> {'two': 4, 'some kinds': 1, 'almost': 1, 'intellectual': 2, 'some ideas': 2, 'already': 1, 'more': 2, 'lots': 2, 'of course': 1, 'startups': 1, 'lessons': 1, 'now': 1, 'some who': 1, 'rarely': 1, 'no': 2, 'few of those': 1, 'many analogies': 1, 'a handful': 1, 'still': 2, 'certain': 1, 'titles': 1, 'about 40': 1, 'multiple': 1, 'probably': 1, 'only': 1, 'the more': 1}


In [5]:
# Normalize probabilities and restructure model for prediction efficiency
for context, poss in markov_model.items():
    total_freq = sum(list(poss.values()))
    markov_model[context] = {target: freq / total_freq for target, freq in poss.items()}
markov_model = {context: tuple(zip(*list(poss.items()))) for context, poss in markov_model.items()}
print(f"{' '.join(first_key)} -> {markov_model[first_key]}")

there are -> (('two', 'some kinds', 'almost', 'intellectual', 'some ideas', 'already', 'more', 'lots', 'of course', 'startups', 'lessons', 'now', 'some who', 'rarely', 'no', 'few of those', 'many analogies', 'a handful', 'still', 'certain', 'titles', 'about 40', 'multiple', 'probably', 'only', 'the more'), (0.11428571428571428, 0.02857142857142857, 0.02857142857142857, 0.05714285714285714, 0.05714285714285714, 0.02857142857142857, 0.05714285714285714, 0.05714285714285714, 0.02857142857142857, 0.02857142857142857, 0.02857142857142857, 0.02857142857142857, 0.02857142857142857, 0.02857142857142857, 0.05714285714285714, 0.02857142857142857, 0.02857142857142857, 0.02857142857142857, 0.05714285714285714, 0.02857142857142857, 0.02857142857142857, 0.02857142857142857, 0.02857142857142857, 0.02857142857142857, 0.02857142857142857, 0.02857142857142857))


In [6]:
seed_idx = np.random.randint(0, len(tokens) - input_window)
seed = tuple(tokens[i:i+input_window])
print("SEED:")
print(" ".join(seed))
print("GENERATED (first 20 words, no punctuation):")
full_randoms = 0
outputs = []
for _ in range(50):
    if seed not in markov_model:
        idx = np.random.randint(0, len(markov_model))
        o = list(markov_model.keys())[idx]
        full_randoms += 1
    else:
        o = tuple(np.random.choice(markov_model[seed][0], 1, p=markov_model[seed][1]).tolist())
        # o = (list(markov_model[seed][0])[np.argmax(markov_model[seed][1])],)
    if input_window == 1:
        seed = tuple(*(list(seed)[1:] + list(o)))
    else:
        seed = tuple(list(seed)[1:] + list(o))
    outputs.append(o)
# Splitting and joining here since the punctuation model cares about words not phrases
results = (" ".join([" ".join(phrase) for phrase in outputs])).split(" ")
print(" ".join(results[:20]))
print(f"{full_randoms} fully random selections")

SEED:
drafts of this
GENERATED (first 20 words, no punctuation):
[('one of the biggest',), ('things',), ('holding',), ('people',), ('back',), ('in every',), ('field',), ('but',), ('their',), ('influence',), ('varies',), ('one of the reasons',), ('politics',), ('for example',), ('tends',), ('to be',), ('boring',), ('is',), ("that it's",), ('not',)]
one of the biggest things holding people back in every field but their influence varies one of the reasons politics
0 fully random selections


In [7]:
p_mean, p_std = np.mean(period_freqs), np.std(period_freqs)
c_mean, c_std = np.mean(comma_freqs), np.std(comma_freqs)
idx = 0
stop_words = set(open("stop_words.txt", "r").read().split('\n'))
while True:
    next_p = np.random.normal(p_mean, p_std, 1).round(0).astype(int)[0]
    next_c = np.random.normal(c_mean, c_std, 1).round(0).astype(int)[0]
    if next_p + idx >= len(results) or next_c + idx >= len(results):
        results[-1] += "."
        break
    # Favoring shorter sentences
    symb = ","
    if next_p <= next_c:
        idx += next_p
        symb = "."
    else:
        idx += next_c
    while results[idx] in stop_words and not idx >= len(results):
        idx += 1
    results[idx] += symb
print("Final results:")
print(" ".join(results))

Final results:
one of the biggest things holding people back in every field but their, influence varies one of the reasons politics for example tends to be boring is that it's not that important founders who never explicitly say no some of the worse ones never actually do say no for introductions to other investors that will in many cases be an antirecommendation 8 this is also a good way to tell how serious.
