In [1]:
import tensorflow as tf
import keras
from keras.layers import StringLookup,Embedding,GRU,Input,Dense
from keras import Model
from keras.callbacks import ModelCheckpoint
import warnings
import numpy as np
warnings.filterwarnings('ignore')




In [24]:
with open('articles_text.txt','r',encoding='utf-8') as fi:
    text=fi.read().strip().lower().replace('ï','').replace('…','').replace('\u202f','').replace('\xa0','').replace('•','').replace('ü','').\
    replace('|','').replace('″','').replace('‘','').replace('_','').replace('—','')

In [25]:
text[:100]

'rise of e-health and its impact on humans by the year 2030the rise of e-health, or the use of electr'

In [26]:
import string
punc=string.punctuation
vocab=list(set(text))
for i in vocab:
    if i in punc:
        vocab.remove(i)

In [27]:
chars_tensor=tf.strings.unicode_split(text,'UTF-8')
chars_tensor

<tf.Tensor: shape=(716931,), dtype=string, numpy=array([b'r', b'i', b's', ..., b'f', b's', b'.'], dtype=object)>

In [28]:
chars_to_ids=StringLookup(vocabulary=vocab)
ids_to_chars=StringLookup(vocabulary=chars_to_ids.get_vocabulary(),invert=True)

In [29]:
charsids_tensor=chars_to_ids(chars_tensor)

In [30]:
charsids_tensor

<tf.Tensor: shape=(716931,), dtype=int64, numpy=array([ 8, 17,  1, ..., 33,  1, 44], dtype=int64)>

In [31]:
ids_to_chars(charsids_tensor)

<tf.Tensor: shape=(716931,), dtype=string, numpy=array([b'r', b'i', b's', ..., b'f', b's', b'.'], dtype=object)>

In [32]:
vocab_size=len(chars_to_ids.get_vocabulary())
vocab_size

46

In [33]:
join_char=lambda ids:tf.strings.reduce_join(ids_to_chars(ids),axis=-1)

In [35]:
join_char(charsids_tensor[:5])

<tf.Tensor: shape=(), dtype=string, numpy=b'rise '>

In [281]:
seq_len=100
dataset=tf.data.Dataset.from_tensor_slices(charsids_tensor)
seq_dataset=dataset.batch(seq_len+1,drop_remainder=True)
for seq in seq_dataset.take(5):
    print(join_char(seq).numpy())

b'rise of e[UNK]health and its impact on humans by the year 2030the rise of e[UNK]health, or the use of electro'
b'nic means to facilitate health care, has been a major development in the healthcare industry in recen'
b't years[UNK] the use of technology to improve access to healthcare and make it more efficient has the pot'
b'ential to revolutionize the way we think about healthcare and its delivery[UNK] by 2030, it is likely tha'
b't e[UNK]health will have a significant impact on the way we receive and provide healthcare, and this essa'


In [282]:
seq_data=seq_dataset.map(lambda x:(x[:-1],x[1:]))
for inp,out in seq_data.take(1):
    print(join_char(inp).numpy())
    print(join_char(out).numpy())

b'rise of e[UNK]health and its impact on humans by the year 2030the rise of e[UNK]health, or the use of electr'
b'ise of e[UNK]health and its impact on humans by the year 2030the rise of e[UNK]health, or the use of electro'


In [283]:
batch_size=64
buffer_size=15000
data=seq_data.shuffle(buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [284]:
for i,o in data.take(1):
    inp,out=i,o

In [78]:
def build_text_generation_model(vocab_size, embedding_dim, rnn_units,states=None):
    input_sequence = Input(shape=(None,), name='input_sequence')
    embedding_layer = Embedding(vocab_size, embedding_dim, name='embedding')(input_sequence)
    
    gru_layer = GRU(rnn_units, return_sequences=True, return_state=True, name='gru')
    if states is None:
        states = gru_layer.get_initial_state(embedding_layer)
    gru_seq, gru_states = gru_layer(embedding_layer, initial_state=states)

    output_logits = Dense(vocab_size, name='output')(gru_seq)
    model = Model(inputs=input_sequence, outputs=output_logits, name='text_generation_model')
    return model

In [128]:
emd_dim=256
rnn_units=512
model1=build_text_generation_model(vocab_size,emd_dim,rnn_units)

In [313]:
model(i)[0]

<tf.Tensor: shape=(100, 46), dtype=float32, numpy=
array([[-0.00771305,  0.00603304,  0.01988032, ...,  0.00415304,
         0.01138706,  0.00236001],
       [-0.00885162,  0.00186647,  0.01069327, ...,  0.00630166,
        -0.01088867,  0.00648821],
       [ 0.00329366,  0.00880979, -0.01110901, ..., -0.00476713,
         0.00717848, -0.00473939],
       ...,
       [-0.00504883,  0.00395807,  0.01773222, ...,  0.0049665 ,
         0.02405166, -0.00577515],
       [-0.00855392,  0.00175534,  0.01059595, ...,  0.00522615,
        -0.00388215,  0.00141557],
       [ 0.00259581,  0.00940419, -0.01040717, ..., -0.00604095,
         0.01112475, -0.00789805]], dtype=float32)>

In [314]:
loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile('adam',loss=loss)

In [315]:
epochs=30
callback=ModelCheckpoint('text_model.h5',monitor='loss',mode='min',save_best_only=True)
model.fit(data,epochs=epochs,callbacks=[callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x26863aa71f0>

In [102]:
model=keras.models.load_model('text_model.h5')

In [115]:
weights=model.get_weights()

In [134]:
input_sequence = Input(shape=(None,), name='input_sequence')
initial_states=Input(shape=(512,),name='initial_states')
embedding_layer = Embedding(vocab_size, 256, name='embedding')(input_sequence)

gru_layer = GRU(rnn_units, return_sequences=True, return_state=True, name='gru')
gru_seq, gru_states = gru_layer(embedding_layer, initial_states)

output_logits = Dense(vocab_size, name='output')(gru_seq)
model = Model(inputs=[input_sequence,initial_states], outputs=[output_logits,gru_states], name='text_generation_model')

In [139]:
logits,final_state=model([w_id_batch,state])

In [149]:
tf.random.categorical(logits[:,-1,:],num_samples=1)[0]

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([30], dtype=int64)>

In [144]:
model.set_weights(weights)

In [164]:
def predict_chars(model,text,chars_to_id,ids_to_chars,states=None):
    uni=tf.strings.unicode_split(text,'UTF-8')
    w_id=chars_to_ids(w_split)
    w_id_batch=tf.reshape(w_id,(1,-1))
    if states is None:
        states=tf.zeros([1,512])
    logits,state=model([w_id_batch,states])
    id_char=tf.random.categorical(logits[:,-1,:],num_samples=1)[0]
    next_char=ids_to_chars(id_char)
    return next_char,state

In [177]:
state=None
text='some happy'
result=[]
for i in range(1000):
    text,state=predict_chars(model,text,chars_to_ids,ids_to_chars,state)
    result.append(text.numpy()[0])

In [179]:
tf.strings.join(result)

<tf.Tensor: shape=(), dtype=string, numpy=b'bbxb[UNK]x[UNK]bbb x[UNK]8b8jb7xbbx[UNK]jjb[UNK]bb7xkx[UNK]b[UNK]b[UNK]xb[UNK]\xe2\x82\xb9xbzx\xe2\x82\xb9xb8bbb[UNK]7b[UNK][UNK]x[UNK]87[UNK]b7[UNK][UNK]bb[UNK]bbb[UNK]x[UNK]bbb[UNK]\xe2\x82\xb9[UNK]7b[UNK]b[UNK][UNK][UNK][UNK]b[UNK]bb8[UNK][UNK] xxnbn[UNK]bx[UNK] bjbv[UNK]b[UNK]b7xd[UNK][UNK]7[UNK]b v[UNK]xjb[UNK][UNK]4[UNK][UNK]b7[UNK]7b7t[UNK]b8b[UNK]\n8bj[UNK]j2[UNK][UNK]n[UNK]7[UNK]/%x8bvxx[UNK]wj[UNK]7x[UNK]bxb7b7j9x[UNK]7[UNK]bbb\n[UNK]b[UNK]7bbbb[UNK][UNK][UNK]bb97[UNK]xxj[UNK]b7bxbbbx4b[UNK]77b8b7b7[UNK]7b[UNK]b7[UNK]b[UNK]xzx9bbe7[UNK][UNK]bk7bx9xb77xxbbvbbbbxxbb[UNK][UNK]bxb[UNK][UNK] xtb[UNK]bb[UNK]xbb77sb7/[UNK]et[UNK]x4b7[UNK]b[UNK][UNK]a77b[UNK]b87x[UNK]/7[UNK][UNK]b7b[UNK]bbb7sb[UNK]b[UNK]bebbxxx[UNK]b [UNK]7b[UNK]bb[UNK]x[UNK]b[UNK][UNK][UNK][UNK]7[UNK][UNK] 7[UNK]/sbn[UNK]x bbb4bb[UNK]b[UNK]b7b[UNK]bx7[UNK]x[UNK]bbx[UNK][UNK][UNK][UNK]bb[UNK]b[UNK]b[UNK]bbbxb7bb[UNK][UNK]b[UNK][UNK]7\nb7xjjb8[UNK]jb[UNK][UNK]b8[UNK][UNK]7[U