# Read in the data from `new-am.txt`

Hopefully this is all of the text we have.

In [0]:
import tensorflow as tf

import numpy as np
import os
import time

In [1]:
# Read the raw bytes and decode them explicitly as UTF-8 (py2 compatible, and
# the file's newline characters are kept exactly as stored). The `with` block
# guarantees the file handle is closed once the contents have been read.
with open('new-am.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))


Length of text: 4431040 characters


In [2]:
# Preview the first 250 characters of the corpus.
print(text[:250])

ጠ/ሚ መለስ ዜናዊ "ጦርነት ኳስ ጨዋታ አይደለም!" አሉ
ሰሞኑን በሕወሓት/ኢሕአዴግ ግምገማ ውስጥ ዋነኛው የግምገማ በትር ያረፈው በጠ/ሚ መለስ ዜናዊ ላይ መሆኑ ተደጋግሞ እየተሰማ ነው።
ከዚሁ ጋር ተያይዞ የጠ/ሚንስትሩ ጋርዶች በሌሎች መቀየራቸው፣  ከአቶ መለስ ዜናዊ ጋር የሚያገናኙ የቤተ መንግሥት የስልክ ግንኙነቶች መቋረጣቸው በሰፊው እየተነገረ ሲሆን፣ማንኛውንም የወቅቱን ጉዳይ አስመልክቶ መ


In [3]:
# The vocabulary is every distinct character of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

445 unique characters


In [0]:
# Store the two lookup tables between characters and integer ids:
# idx2char maps id -> character, char2idx maps character -> id.
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# Encode the whole corpus as an array of integer ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [0]:
import json

# Persist both lookup tables as JSON files. idx2char is a numpy array, so it is
# converted to a plain list before being serialized.
for filename, payload in (('char2idx.json', char2idx),
                          ('idx2char.json', list(idx2char))):
  with open(filename, 'w') as f:
    json.dump(payload, f)

In [0]:
from google.colab import files

# Download the two lookup-table files written by the previous cell.
for json_name in ('char2idx.json', 'idx2char.json'):
  files.download(json_name)

In [41]:
# Print the first 20 entries of the character -> id table in a dict-like layout.
preview_chars = list(char2idx)[:20]
print('{')
for char in preview_chars:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')


{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '"' :   3,
  '#' :   4,
  '$' :   5,
  '%' :   6,
  '&' :   7,
  "'" :   8,
  '(' :   9,
  ')' :  10,
  '*' :  11,
  '+' :  12,
  ',' :  13,
  '-' :  14,
  '.' :  15,
  '/' :  16,
  '0' :  17,
  '1' :  18,
  '2' :  19,
  ...
}


In [42]:
# Show how the first 13 characters of the corpus map to their integer ids.
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))


'“የዳታ ፕሮሰሲንግ አ' ---- characters mapped to int ---- > [426 311 321 222   1 396 175 177 179 247 340   1 258]


In [43]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Number of (seq_length + 1)-character chunks that fit in the corpus.
examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets
# from_tensor_slices yields the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode and print the first five elements.
for i in char_dataset.take(5):
  print(idx2char[i.numpy()])


“
የ
ዳ
ታ
 


In [44]:
# Group the character stream into chunks of seq_length + 1 ids each;
# drop_remainder=True discards a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode and print the first five chunks.
for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))


'“የዳታ ፕሮሰሲንግ አገልግሎት” ማለት በኮምፒዩተር ሥርዓት አማካኝነት ዳታን የመቀበል ፣ የማከማቸት ፣ የመተንተን ፣ የማሰራጨት ፣ የማጓጓዝ ወይም የማስተላለፍ '
'አገልግሎት ሲሆን የኔትዎርክ አገልግሎችንም\n“የዳታ ፕሮሰሲንግ አገልግሎት” ማለት በኮምፒዩተር ሥርዓት አማካኝነት ዳታን የመቀበል ፣ የማከማቸት ፣ የመተንተን ፣ '
'የማሰራጨት ፣ የማጓጓዝ ወይም የማስተላለፍ አገልግሎት ሲሆን የኔትዎርክ አገልግሎችንም\nየኤጀንሲው ዓላማ\nየኤጀንሲው ዓላማ\nእንዲህ አለው፦ “ዮፍታሔ እንዲህ ይላል፦'
' ‘እስራኤል የሞዓባውያንን ምድርና የአሞናውያንን ምድር አልወሰደም ፤\nእንዲህ አለው፦ “ዮፍታሔ እንዲህ ይላል፦ ‘እስራኤል የሞዓባውያንን ምድርና የአሞናውያንን ም'
'ድር አልወሰደም ፤\nሆኖም ትርፏና የምትቀበለው ክፍያ ለይሖዋ የተቀደሰ ይሆናል ። አይከማችም ወይም አይጠራቀምም ፤ ምክንያቱም በይሖዋ ፊት የሚኖሩ ሰዎች እስኪጠግ'


In [0]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)


In [46]:
# Decode the first (input, target) pair to confirm the one-character shift.
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))


Input data:  '“የዳታ ፕሮሰሲንግ አገልግሎት” ማለት በኮምፒዩተር ሥርዓት አማካኝነት ዳታን የመቀበል ፣ የማከማቸት ፣ የመተንተን ፣ የማሰራጨት ፣ የማጓጓዝ ወይም የማስተላለፍ'
Target data: 'የዳታ ፕሮሰሲንግ አገልግሎት” ማለት በኮምፒዩተር ሥርዓት አማካኝነት ዳታን የመቀበል ፣ የማከማቸት ፣ የመተንተን ፣ የማሰራጨት ፣ የማጓጓዝ ወይም የማስተላለፍ '


In [47]:
# Walk through the first five time steps of the example pair: each step pairs
# one input character id with the id expected as output (the next character).
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))


Step    0
  input: 426 ('“')
  expected output: 311 ('የ')
Step    1
  input: 311 ('የ')
  expected output: 321 ('ዳ')
Step    2
  input: 321 ('ዳ')
  expected output: 222 ('ታ')
Step    3
  input: 222 ('ታ')
  expected output: 1 (' ')
Step    4
  input: 1 (' ')
  expected output: 396 ('ፕ')


In [48]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# NOTE(review): this rebinds `dataset` to its shuffled/batched form, so running
# this cell twice without first re-running the cell that created `dataset`
# would shuffle and batch the already-batched data.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Display the dataset's repr (element shapes and dtypes) as the cell output.
dataset


<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [0]:
# Model hyperparameters; these are the values passed to build_model.

# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024


In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level model: Embedding -> stateful GRU -> Dense.

  Args:
    vocab_size: number of distinct characters; used as the embedding input
      size and as the width of the final Dense layer.
    embedding_dim: size of each embedding vector.
    rnn_units: number of units in the GRU layer.
    batch_size: fixed batch dimension of the input shape; the sequence
      dimension is left unspecified (None).

  Returns:
    An uncompiled tf.keras.Sequential model.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
  # return_sequences=True keeps one output per time step, so the Dense layer
  # produces vocab_size values for every position of the input sequence.
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  # No activation: the Dense layer emits raw scores (logits).
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])


In [0]:
# Instantiate the training model. The `vocab_size` hyperparameter is passed
# (instead of recomputing len(vocab)) so that this call uses the same value as
# the later batch_size=1 rebuild of the model.
model = build_model(
  vocab_size=vocab_size,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)


In [52]:
# Run a single batch through the freshly built model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 445) # (batch_size, sequence_length, vocab_size)


In [53]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (64, None, 256)           113920    
_________________________________________________________________
gru_2 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_2 (Dense)              (64, None, 445)           456125    
Total params: 4,508,349
Trainable params: 4,508,349
Non-trainable params: 0
_________________________________________________________________


In [54]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and model outputs.

  from_logits=True tells Keras that `logits` holds raw scores rather than
  probabilities.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

# Evaluate the loss on the example batch and report its mean as a single number.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 445)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       6.099281


In [0]:
# Configure training: Adam optimizer with the custom `loss` function.
model.compile(optimizer='adam', loss=loss)

In [0]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
# ("{epoch}" is left as a literal placeholder for the callback to fill in)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# save_weights_only=True: checkpoints hold the weights only, not the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [0]:
# Number of training epochs passed to model.fit.
# NOTE(review): the recorded output of the training cell shows "Epoch N/100",
# so it appears to come from a run with a different EPOCHS value — confirm.
EPOCHS=30

In [0]:
# Train for EPOCHS epochs with checkpoint_callback attached.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100

In [0]:
# Show the path of the most recent checkpoint found in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

In [0]:
# Rebuild the same architecture with batch_size=1 and load the weights of the
# latest checkpoint into it. This rebinds `model`, replacing the training model.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))


# Save the model

In [0]:
# Remove any previous export, save the model to char_model, and pack the
# result into a gzip-compressed tar archive.
!rm -rf char_model
model.save("char_model")
!tar -zcvf char_model.tar.gz char_model

# Download the archive using Colab's `files` helper.
from google.colab import files
files.download('char_model.tar.gz')