In [1]:
import tensorflow as tf
tf.enable_eager_execution()

import pandas as pd
import numpy as np
import os, string, re, csv


from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 


# set seeds for reproducability
from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)

#for grammar
import spacy

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

Using TensorFlow backend.


In [2]:
path = "./all-the-news"
df = pd.DataFrame()
for file in os.listdir(path):
    if file != "formatted.csv":
        print(path+"/"+file)
        dfnew = pd.read_csv(path+"/"+file)
        df = pd.concat([df, dfnew])
articles = df["content"].to_numpy()

def clean_text(txt):
    txt = txt.encode("utf8").decode("ascii",'ignore')
    txt = re.sub(' +',' ', txt)
    txt = txt.replace("\t", "").replace("\r", "")
    return txt 

def write_formatted_csv(article_list):
    with open(path+"/formatted.csv", mode="w", newline=None) as outfile:
        writer = csv.writer(outfile)
        for article in article_list:
            writer.writerow(article)



./all-the-news/articles1.csv
./all-the-news/articles2.csv
./all-the-news/articles3.csv


In [3]:
# write_formatted_csv(corpus)

In [4]:
# word_index = {}
# word_frequency_index = {}
# counter = 0
# for article in corpus:
#     try:
#         article = article.split(" ")
#         if len(article) > 1:
#             for word in article:
#                 word = word.lower()
#                 word = "".join([w for w in word if w not in string.punctuation])
#                 if word not in word_index:
#                     word_index[word] = counter
#                     counter += 1
#                 if word not in word_frequency_index:
#                     word_frequency_index[word] = 1
#                 else:
#                     word_frequency_index[word] += 1
#     except:
#         pass

In [5]:
#convert corpus into one string
articles = [clean_text(x) for x in articles if type(x) is str]

In [6]:
corpus = " ".join(articles)

In [7]:
vocab = sorted(set(corpus))

In [8]:
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in corpus], dtype=np.uint8)

In [9]:
print(f"{corpus[:10]} - {text_as_int[:10]}")

WASHINGTON - [51 29 47 36 37 42 35 48 43 42]


In [10]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
examples_per_epoch = len(corpus)//seq_length

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [11]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [12]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

In [13]:
dataset = sequences.map(split_input_target)

In [14]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.uint8, tf.uint8)>

In [15]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [16]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
    return model

In [17]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           23296     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 91)            93275     
Total params: 5,363,547
Trainable params: 5,363,547
Non-trainable params: 0
_________________________________________________________________


In [18]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 91) # (batch_size, sequence_length, vocab_size)


In [21]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [22]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 91)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.510748


In [23]:
model.compile(optimizer='adam', loss=loss)

In [24]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [25]:
history = model.fit(dataset, epochs=100, steps_per_epoch=100, callbacks=[checkpoint_callback])

Epoch 1/100

Consider using a TensorFlow optimizer from `tf.train`.
Instructions for updating:
Use tf.train.CheckpointManager to manage checkpoints rather than manually editing the Checkpoint proto.
Epoch 2/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 3/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 4/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 5/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 6/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 7/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 8/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 9/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 10/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 11/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 12/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 13/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 14/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 15/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 16/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 17/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 18/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 19/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 20/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 21/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 22/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 23/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 24/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 25/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 26/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 27/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 28/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 29/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 30/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 31/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 32/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 33/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 34/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 35/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 36/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 37/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 38/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 39/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 40/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 41/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 42/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 43/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 44/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 45/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 46/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 47/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 48/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 49/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 50/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 51/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 52/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 53/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 54/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 55/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 56/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 57/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 58/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 59/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 60/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 61/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 62/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 63/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 64/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 65/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 66/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 67/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 68/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 69/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 70/100

Consider using a TensorFlow optimizer from `tf.train`.
Epoch 71/100



Consider using a TensorFlow optimizer from `tf.train`.
Epoch 72/100
  2/100 [..............................] - ETA: 7:03 - loss: 1.204 - ETA: 7:08 - loss: 1.2103

KeyboardInterrupt: 

In [47]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            23296     
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 1024)           5246976   
_________________________________________________________________
dense_1 (Dense)              (1, None, 91)             93275     
Total params: 5,363,547
Trainable params: 5,363,547
Non-trainable params: 0
_________________________________________________________________


In [48]:
def generate_text(model, start_string):
   # Evaluation step (generating text using the learned model)

    # Number of characters to generate
    num_generate = 1000

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    

    # Empty string to store our results
    text_generated = []

    # Low temperatures results in more predictable text.
    # Higher temperatures results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1.0

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to predict the word returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted word as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [49]:
print(generate_text(model, start_string=u"hello"))

hellon haunts several polls with their crime, she ability to be about: the current choices had tried to evaluate freedom, but also because the time I think it seems like it, You get themselves in this approva. A New York city filed by Pene Pand, said with O. C. The two good walls in secondbody, at a mission to prison for canapartic bis and promotes a wall for a moment campaign, has never been frustrations, ard others are talking for in order to support him expected to speak together.) As together, We were wedding on beyond North Carolina. Ordered the truth your it as stumbling almost one on that corone on top loans. Tesls before you arrested. Its 90 mune dashed to have offered possible rights and waits so offendes may never have everybody next morning was the only in others it was as if smart, Find have just dont work. Cerform the caused fah of clerians to be a resilient, the scientist s have struggled with ve their overtaskpect. ____ 9. Mr. Band. Mr. Kinbert, a family person, more tha