In [19]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Tokenizer

In [9]:
# Demo corpus: Orwell's six rules for writing, one rule per list element.
sentences = [
    "Never use a metaphor, simile, or other figure of speech which you are used to seeing in print.",
    "Never use a long word where a short one will do.",
    "If it is possible to cut a word out, always cut it out.",
    "Never use the passive where you can use the active.",
    "Never use a foreign phrase, a scientific word, or a jargon word if you can think of an everyday English equivalent.",
    "Break any of these rules sooner than say anything outright barbarous.",
]

In [15]:
# num_words caps the vocabulary used when encoding text; this corpus has only
# 55 distinct words, so the cap of 100 drops nothing here. Words that were not
# seen during fitting are mapped to the '<UNK>' placeholder token.
# NOTE(review): recent Keras docs mark this Tokenizer API as deprecated in
# favour of the TextVectorization layer -- confirm for the installed version.
tokenizer = Tokenizer(
    num_words=100,
    oov_token='<UNK>',
)

In [16]:
# Build the tokenizer's vocabulary from the sentences; the `word_index` and
# `word_counts` shown in the following cells are populated by this call.
tokenizer.fit_on_texts(sentences)

In [17]:
# Is this a corpus? Not quite: the corpus is the collection of texts the
# tokenizer was fitted on (`sentences`). `word_index` is the vocabulary built
# from that corpus, mapping each lower-cased word to an integer id. Ids follow
# descending word frequency, and the OOV token '<UNK>' takes id 1.
tokenizer.word_index

{'<UNK>': 1,
 'a': 2,
 'use': 3,
 'never': 4,
 'word': 5,
 'of': 6,
 'you': 7,
 'or': 8,
 'to': 9,
 'where': 10,
 'if': 11,
 'it': 12,
 'cut': 13,
 'out': 14,
 'the': 15,
 'can': 16,
 'metaphor': 17,
 'simile': 18,
 'other': 19,
 'figure': 20,
 'speech': 21,
 'which': 22,
 'are': 23,
 'used': 24,
 'seeing': 25,
 'in': 26,
 'print': 27,
 'long': 28,
 'short': 29,
 'one': 30,
 'will': 31,
 'do': 32,
 'is': 33,
 'possible': 34,
 'always': 35,
 'passive': 36,
 'active': 37,
 'foreign': 38,
 'phrase': 39,
 'scientific': 40,
 'jargon': 41,
 'think': 42,
 'an': 43,
 'everyday': 44,
 'english': 45,
 'equivalent': 46,
 'break': 47,
 'any': 48,
 'these': 49,
 'rules': 50,
 'sooner': 51,
 'than': 52,
 'say': 53,
 'anything': 54,
 'outright': 55,
 'barbarous': 56}

In [18]:
# Number of occurrences of each word across all fitted sentences, listed in
# order of first appearance (compare with the frequency-ranked `word_index`).
tokenizer.word_counts

OrderedDict([('never', 4),
             ('use', 5),
             ('a', 7),
             ('metaphor', 1),
             ('simile', 1),
             ('or', 2),
             ('other', 1),
             ('figure', 1),
             ('of', 3),
             ('speech', 1),
             ('which', 1),
             ('you', 3),
             ('are', 1),
             ('used', 1),
             ('to', 2),
             ('seeing', 1),
             ('in', 1),
             ('print', 1),
             ('long', 1),
             ('word', 4),
             ('where', 2),
             ('short', 1),
             ('one', 1),
             ('will', 1),
             ('do', 1),
             ('if', 2),
             ('it', 2),
             ('is', 1),
             ('possible', 1),
             ('cut', 2),
             ('out', 2),
             ('always', 1),
             ('the', 2),
             ('passive', 1),
             ('can', 2),
             ('active', 1),
             ('foreign', 1),
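             ('phrase', 1),
             ('scientific', 1),
             ('jargon', 1),
             ('think', 1),
             ('an', 1),
             ('everyday', 1),
             ('english', 1),
             ('equivalent', 1),
             ('break', 1),
             ('any', 1),
             ('these', 1),
             ('rules', 1),
             ('sooner', 1),
             ('than', 1),
             ('say', 1),
             ('anything', 1),
             ('outright', 1),
             ('barbarous', 1)])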

## Text to sequences

In [21]:
# Encode every sentence as a list of integer ids looked up in `word_index`.
# All words here were seen during fitting, so the OOV id 1 never appears.
seq = tokenizer.texts_to_sequences(sentences)
seq

[[4, 3, 2, 17, 18, 8, 19, 20, 6, 21, 22, 7, 23, 24, 9, 25, 26, 27],
 [4, 3, 2, 28, 5, 10, 2, 29, 30, 31, 32],
 [11, 12, 33, 34, 9, 13, 2, 5, 14, 35, 13, 12, 14],
 [4, 3, 15, 36, 10, 7, 16, 3, 15, 37],
 [4, 3, 2, 38, 39, 2, 40, 5, 8, 2, 41, 5, 11, 7, 16, 42, 6, 43, 44, 45, 46],
 [47, 48, 6, 49, 50, 51, 52, 53, 54, 55, 56]]

## Padding

In [24]:
# Pad with trailing zeros (padding='post') up to the length of the longest
# sequence, giving a rectangular array: here 6 rows of 21 ids each.
pad_sequences(seq, padding='post')

array([[ 4,  3,  2, 17, 18,  8, 19, 20,  6, 21, 22,  7, 23, 24,  9, 25,
        26, 27,  0,  0,  0],
       [ 4,  3,  2, 28,  5, 10,  2, 29, 30, 31, 32,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0],
       [11, 12, 33, 34,  9, 13,  2,  5, 14, 35, 13, 12, 14,  0,  0,  0,
         0,  0,  0,  0,  0],
       [ 4,  3, 15, 36, 10,  7, 16,  3, 15, 37,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0],
       [ 4,  3,  2, 38, 39,  2, 40,  5,  8,  2, 41,  5, 11,  7, 16, 42,
         6, 43, 44, 45, 46],
       [47, 48,  6, 49, 50, 51, 52, 53, 54, 55, 56,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0]], dtype=int32)