In [3]:
%matplotlib inline

import io
import os
import random
import sys
from collections import Counter
from datetime import datetime

import keras
import keras.callbacks
import numpy as np
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup
from keras.callbacks import LambdaCallback
from keras.callbacks import TensorBoard
from keras.layers import Dense, LSTM
from keras.models import Sequential
from keras.optimizers import RMSprop
%load_ext tensorboard

Using TensorFlow backend.


In [4]:
print("Drunk Philosophers")

# Load the corpus and discard the columns this notebook never reads.
data_raw = pd.read_csv("./data/philosophy_data.csv").drop(
    columns=[
        'sentence_spacy',
        'original_publication_date',
        'corpus_edition_date',
        'sentence_length',
        'sentence_lowered',
        'lemmatized_str',
    ]
)

# Distinct authors, in order of first appearance (dict keys keep insertion order).
philosophers = list(dict.fromkeys(data_raw['author'].tolist()))

# Preview corpus: all of Plato's sentences concatenated into one long string.
is_plato = data_raw['author'] == "Plato"
selected_data = data_raw.loc[is_plato]
text = ' '.join(selected_data['sentence_str'].tolist())

print(text[:2000])

Drunk Philosophers
 What's new, Socrates, to make you leave your usual haunts in the Lyceum and spend your time here by the king archon's court? Surely you are not prosecuting anyone before the king archon as I am? The Athenians do not call this a prosecution but an indictment, Euthyphro. What is this you say? Someone must have indicted you, for you are not going to tell me that you have indicted someone else. But someone else has indicted you? I do not really know him myself, Euthyphro. He is apparently young and unknown. They call him Meletus, I believe. He belongs to the Pitthean deme, if you know anyone from that deme called Meletus, with long hair, not much of a beard, and a rather aquiline nose. I don't know him, Socrates. What charge does he bring against you? A not ignoble one I think , for it is no small thing for a young man to have knowledge of such an important subject. He says he knows how our young men are corrupted and who corrupts them. He is likely to be wise, and when

In [5]:
# Vocabulary: every distinct character in `text`, sorted so the index
# assignment is stable, plus the two lookup tables between char and index.
chars = sorted(set(text))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Cut the text into non-overlapping windows of seqlen + 1 characters: the
# first seqlen characters are the input, the last seqlen are the targets
# (the input shifted by one character).
seqlen = 5
step = seqlen
sentences = []
for i in range(0, len(text) - seqlen - 1, step):
    sentences.append(text[i: i + seqlen + 1])

# One-hot encode inputs and targets as (window, timestep, character).
# The builtin `bool` is used instead of `np.bool`: that alias was deprecated
# in NumPy 1.20 and removed in later releases.
x = np.zeros((len(sentences), seqlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), seqlen, len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, (char_in, char_out) in enumerate(zip(sentence[:-1], sentence[1:])):
        x[i, t, char_indices[char_in]] = 1
        y[i, t, char_indices[char_out]] = 1


# Character-level model: one LSTM layer that emits an output at every
# timestep (return_sequences=True), followed by a softmax over the
# vocabulary, so each input character gets a next-character distribution.
model = Sequential([
    LSTM(128, input_shape=(seqlen, len(chars)), return_sequences=True),
    Dense(len(chars), activation='softmax'),
])

model.compile(
    optimizer=RMSprop(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['categorical_crossentropy', 'accuracy'],
)

def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reshaped by a temperature.

    Each probability ``p`` is re-weighted to ``p ** (1 / temperature)`` and
    the result is renormalised before one index is drawn from it. A
    temperature below 1 sharpens the distribution towards its most likely
    entry; a temperature above 1 flattens it.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Re-weighting exponent divisor. ``0`` is treated as the
            limiting case and returns the most likely index without drawing.

    Returns:
        The sampled index into ``preds``.
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by a zero temperature is undefined; its limit is greedy argmax.
    if temperature == 0:
        return np.argmax(preds)

    # log(0) = -inf is expected for zero-probability entries; they end up with
    # weight 0 after exponentiation, so the warning carries no information.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the maximum before exponentiating. Without this, a small
    # temperature makes every exp() underflow to 0 and the normalisation
    # below becomes 0 / 0.
    logits -= np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)

    draw = np.random.multinomial(1, probs, 1)  # one one-hot draw
    return np.argmax(draw)


def on_epoch_end(epoch, _, diversity=0.5, length=400):
    """Print a sample of generated text; intended as an end-of-epoch callback.

    Picks a random seed of ``seqlen`` characters from the module-level
    ``text``, then repeatedly asks the module-level ``model`` for the
    next-character distribution and samples from it, sliding the input window
    forward by one character each time. Also reads the module-level ``chars``,
    ``char_indices`` and ``indices_char`` lookup tables.

    Args:
        epoch: Epoch index, only used in the printed header.
        _: Unused second callback argument.
        diversity: Sampling temperature forwarded to ``sample``; lower values
            make the generated text more conservative.
        length: Number of characters to generate after the seed.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - seqlen - 1)
    print('----- diversity:', diversity)

    sentence = text[start_index: start_index + seqlen]
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(sentence)

    # One input buffer, re-encoded in place for every generated character.
    x_pred = np.zeros((1, seqlen, len(chars)))
    for _step in range(length):
        x_pred.fill(0.)
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)
        # Only the distribution at the last timestep predicts the character
        # that follows the current window.
        next_index = sample(preds[0, -1], diversity)
        next_char = indices_char[next_index]

        # Slide the window: drop the oldest character, append the new one.
        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()

# Wrap on_epoch_end in a Keras callback so that passing it to model.fit()
# prints a generated text sample after every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Disabled single-epoch run on the Plato-only data prepared in this cell.
# NOTE(review): presumably superseded by the per-philosopher training loop
# further down — confirm before deleting.
#model.fit(x, y,
#          batch_size=128,
#          epochs=1,
#          callbacks=[print_callback])

#model.save("models/Plato")

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  # This is added back by InteractiveShellApp.init_path()
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if sys.path[0] == '':
2022-04-24 13:23:04.377157: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-24 13:23:04.379576: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 16. Tune using inter_op_parallelism_threads for best performance.


In [9]:
# Settings shared by every philosopher's model; they do not change per author.
seqlen = 40
step = seqlen

# Every model is saved under "models/". Create the folder before training so
# a long run cannot fail at the final save because it is missing.
os.makedirs("models", exist_ok=True)

# The same callback object can be reused for each fit() call.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

for phil in philosophers:
    print("Training: ", phil)

    # Release the graph/state of any previously built model so that layers
    # from earlier iterations do not pile up in the backend session.
    keras.backend.clear_session()

    # Corpus for this author. `text`, `chars`, `char_indices`, `indices_char`
    # and `model` are deliberately (re)bound at module level because
    # on_epoch_end reads them as globals.
    selected_data = data_raw.loc[data_raw['author'] == phil]
    text = ' '.join(selected_data['sentence_str'].tolist())

    chars = sorted(set(text))
    char_indices = {c: i for i, c in enumerate(chars)}
    indices_char = {i: c for i, c in enumerate(chars)}

    # Non-overlapping windows of seqlen + 1 characters: input plus the input
    # shifted by one character as the target.
    sentences = []
    for i in range(0, len(text) - seqlen - 1, step):
        sentences.append(text[i: i + seqlen + 1])

    # One-hot encode as (window, timestep, character). Builtin `bool` replaces
    # the `np.bool` alias, deprecated in NumPy 1.20 and removed afterwards.
    x = np.zeros((len(sentences), seqlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), seqlen, len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, (char_in, char_out) in enumerate(zip(sentence[:-1], sentence[1:])):
            x[i, t, char_indices[char_in]] = 1
            y[i, t, char_indices[char_out]] = 1

    # Fresh model per author: the vocabulary size differs between corpora.
    model = Sequential()
    model.add(LSTM(128, input_shape=(seqlen, len(chars)), return_sequences=True))
    model.add(Dense(len(chars), activation='softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer=RMSprop(learning_rate=0.01),
        metrics=['categorical_crossentropy', 'accuracy']
    )

    model.fit(x, y,
              batch_size=128,
              epochs=50,
              callbacks=[print_callback])

    loc = "models/" + phil
    print("saving to: ", loc)
    model.save(loc)

Training:  Plato


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  app.launch_new_instance()
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


saving to:  models/Plato
Epoch 1/1


2022-04-24 13:37:33.349238: E tensorflow/core/grappler/optimizers/dependency_optimizer.cc:697] Iteration = 0, topological sort failed with message: The graph couldn't be sorted in topological order.
2022-04-24 13:37:33.355299: E tensorflow/core/grappler/optimizers/dependency_optimizer.cc:697] Iteration = 1, topological sort failed with message: The graph couldn't be sorted in topological order.
2022-04-24 13:37:33.370525: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:502] model_pruner failed: Invalid argument: MutableGraphView::MutableGraphView error: node 'loss_4/dense_5_loss/categorical_crossentropy/weighted_loss/concat' has self cycle fanin 'loss_4/dense_5_loss/categorical_crossentropy/weighted_loss/concat'.
2022-04-24 13:37:33.386200: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:502] remapper failed: Invalid argument: MutableGraphView::MutableGraphView error: node 'loss_4/dense_5_loss/categorical_crossentropy/weighted_loss/concat' has self cycle fanin 'loss_4/d

  2688/111201 [..............................] - ETA: 5:10 - loss: 3.1924 - categorical_crossentropy: 3.1924 - accuracy: 0.1722

KeyboardInterrupt: 