<a href="https://colab.research.google.com/github/kk412027247/nlp/blob/main/generating_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the tab-separated news-headlines dataset; wget saves it in the current working directory.
!wget https://www.wangluoguimi.com/news/news-headlines.tsv

In [None]:
# Show the first three lines of the downloaded file to check its layout.
!head -3 news-headlines.tsv

In [None]:
import numpy as np
import csv
# Character-level vocabulary: every distinct character of the literal below, in sorted order.
# Repeated characters in the literal are harmless because set() removes duplicates.
# The backslash is written as "\\" so the string contains a literal backslash followed by "|"
# without relying on the invalid "\|" escape sequence, which newer Python versions warn about.
chars = sorted(set("abcdefghijklmnopqrstuvwxyz0123456789 -,;.!?:’’’/\\|_@#$%ˆ&*˜‘+-=()[]{}' ABCDEFGHIJKLMNOPQRSTUVWXYZ"))

# Special multi-character tokens: end-of-sequence, unknown character, padding.
EOS = 'EOS'
UNK = '<UNK>'
PAD = '<PAD>'
chars.append(UNK)
chars.append(EOS)
# PAD goes first so that its id is 0.
chars.insert(0, PAD)

# Forward (token -> id) and reverse (id -> token) lookup tables.
char2idx = {u: i for i, u in enumerate(chars)}
idx2char =  np.array(chars)

def char_idx(c):
  """Return the vocabulary id of ``c``, or the id of the UNK token if ``c`` is not in the vocabulary."""
  # One dict lookup instead of scanning the `chars` list and then indexing the dict.
  return char2idx.get(c, char2idx[UNK])

# Encoded headlines: one list of exactly MAX_LEN character ids per row of the TSV file.
data = []
MAX_LEN = 75

# encoding='utf-8' keeps the non-ASCII punctuation in the vocabulary decoding the same way on
# every platform instead of depending on the locale default; newline='' is what the csv module
# expects for files handed to csv.reader.
with open('news-headlines.tsv', 'r', encoding='utf-8', newline='') as file:
  for row in csv.reader(file, delimiter='\t'):
    if not row:
      # csv.reader yields an empty list for a blank line; row[0] would raise IndexError.
      continue
    hdln = row[0]
    # NOTE(review): hdln[:-1] drops the last character of the headline before encoding, although
    # csv.reader has already stripped the line terminator - confirm this is intended.
    # Keep at most MAX_LEN-1 characters so there is always room for the EOS marker.
    encoded = [char_idx(c) for c in hdln[:-1]][:MAX_LEN - 1]
    encoded.append(char2idx[EOS])
    # Right-pad short headlines with PAD up to MAX_LEN (adds nothing when already full).
    encoded.extend([char2idx[PAD]] * (MAX_LEN - len(encoded)))
    data.append(encoded)
print('Data file loaded')

In [None]:
import tensorflow as tf

# Every row of `data` has MAX_LEN ids, so this is a rectangular integer matrix.
np_data = np.array(data)
# Next-character prediction: the input is each row without its last id, the target is the same
# row shifted left by one position.
np_data_in = np_data[:, :-1]
np_data_out = np_data[:, 1:]
# One dataset element per headline: an (input ids, target ids) pair.
x = tf.data.Dataset.from_tensor_slices((np_data_in, np_data_out))

In [None]:
# Model and batching hyperparameters.
vocab_size = len(chars)  # one id per entry of `chars`, including the PAD / UNK / EOS tokens
embedding_dim = 256
rnn_units = 1024
BATCH_SIZE = 256

# Shuffle with a 100000-element buffer (reshuffled on every iteration over the dataset) and
# group into batches. drop_remainder=True discards a final short batch, so every batch has
# exactly BATCH_SIZE rows - the batch size the model below is built with.
x_train= x.shuffle(100000, reshuffle_each_iteration=True).batch(BATCH_SIZE, drop_remainder=True)

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level model: Embedding -> GRU -> Dropout -> Dense.

  Args:
    vocab_size: number of distinct token ids; used as the embedding input size and as the
      width of the output layer.
    embedding_dim: size of each embedding vector.
    rnn_units: number of GRU units.
    batch_size: fixed batch dimension baked into the input shape (the GRU is stateful).

  Returns:
    A tf.keras.Sequential model. The last layer is a Dense layer created without an
    activation argument.
  """
  layers = tf.keras.layers
  # mask_zero=True marks id 0 as a masked value; the sequence length is left open (None).
  embed = layers.Embedding(vocab_size, embedding_dim, mask_zero=True, batch_input_shape=[batch_size, None])
  # return_sequences=True gives one output per input position.
  gru = layers.GRU(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform')
  drop = layers.Dropout(0.1)
  scores = layers.Dense(vocab_size)
  return tf.keras.Sequential([embed, gru, drop, scores])

# Training model, built for batches of exactly BATCH_SIZE sequences.
model = build_model(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units,batch_size=BATCH_SIZE)
# summary() prints the layer table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
model.summary()

In [None]:
# The model's last Dense layer is created without an activation, so its outputs are treated
# as raw logits here (from_logits=True). The targets are integer character ids, hence the
# sparse variant of categorical cross-entropy.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer = 'adam', loss= loss)

In [None]:
import datetime
import os
# Per-run timestamp; it is appended to the checkpoint directory name so runs do not overwrite
# each other's files.
dt = datetime.datetime.today().strftime("%Y-%b-%d-%H-%M-%S")
checkpoint_dir = f'./training_checkpoints{dt}'
# "{epoch}" is left as a literal placeholder in the path handed to the callback.
checkpoint_prefx = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Weights-only checkpoints (no full-model serialization).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefx,
    save_weights_only=True,
)

In [None]:
import time
# Number of passes over x_train.
EPOCHS=25
# Wall-clock timestamp taken just before training starts.
# NOTE(review): `start` is not read in any of the visible cells - confirm whether an
# elapsed-time report was intended.
start = time.time()
# Train the compiled model; the checkpoint callback is handed to fit() so it runs during training.
# `history.history` holds the recorded metrics (the 'loss' series is plotted later).
history = model.fit(x_train, epochs=EPOCHS, callbacks=[checkpoint_callback])

In [None]:
import matplotlib.pyplot as plt
# Plot the training-loss series recorded by fit() and save the figure as a PNG whose name
# carries this run's timestamp.
lossplot = f'loss-{dt}.png'
ax = plt.gca()  # the current axes - the same target the pyplot-level calls draw on
ax.plot(history.history['loss'])
ax.set_title('model loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
plt.savefig(lossplot)
print('Save loss to: ', lossplot)