In [1]:
import numpy as np
import matplotlib.pyplot as plt

import os

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
corpus = [
    'I love my dog.',
    'I love my cat.',
    'You love my dog.',
    'Do you think my dog is amazing?',
]

# '<OOV>' (Out Of Vocabulary) is the placeholder emitted for words
# that were not seen while fitting
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(corpus)

# indices follow frequency: the more frequent a word, the lower its index
print('word index:', tokenizer.word_index, sep='\n')

seqs = tokenizer.texts_to_sequences(corpus)
print(f'tokenized sequences stored in a {type(seqs)}:', seqs, sep='\n')

# bring every sequence to length 5, padding and truncating at the end
seqs = pad_sequences(seqs, maxlen=5, padding='post', truncating='post')
print(f'padded sequences stored in a {seqs.shape} {type(seqs)}:', seqs, sep='\n')

# sentences containing words that are absent from the fitted corpus
newtxs = [
    'i really love my dog!',
    'my dog loves my manatee',
]

print('texts with unseen words:', tokenizer.texts_to_sequences(newtxs), sep='\n')

word index:
{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
tokenized sequences stored in a <class 'list'>:
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
padded sequences stored in a (4, 5) <class 'numpy.ndarray'>:
[[5 3 2 4 0]
 [5 3 2 7 0]
 [6 3 2 4 0]
 [8 6 9 2 4]]
texts with unseen words:
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


In [7]:
# num_words only limits what texts_to_sequences() emits; word_index still
# lists every word seen during fitting.
# Only the (num_words - 1) most frequent words are kept; any other word is
# dropped from the sequence entirely.
tokenizer = Tokenizer(num_words=3)
tokenizer.fit_on_texts(corpus)

print('word index:', tokenizer.word_index, sep='\n')

# same transform applied to the fitted corpus and to the unseen sentences
for title, texts in (
    ('tokenized sequences:', corpus),
    ('texts with unseen words:', newtxs),
):
    print(title)
    print(tokenizer.texts_to_sequences(texts))

word index:
{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
tokenized sequences:
[[2, 1], [2, 1], [2, 1], [1]]
texts with unseen words:
[[2, 1], [1, 1]]


In [6]:
# With oov_token set, the OOV token always occupies one slot of the top list,
# leaving room for only the (num_words - 2) most frequent real words.
# Every other word is mapped to oov_token instead of being dropped,
# so the length of each text is preserved.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=3)
tokenizer.fit_on_texts(corpus)

print('word index:', tokenizer.word_index, sep='\n')

corpus_seqs = tokenizer.texts_to_sequences(corpus)
print('tokenized sequences:', corpus_seqs, sep='\n')

unseen_seqs = tokenizer.texts_to_sequences(newtxs)
print('texts with unseen words:', unseen_seqs, sep='\n')

word index:
{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
tokenized sequences:
[[1, 1, 2, 1], [1, 1, 2, 1], [1, 1, 2, 1], [1, 1, 1, 2, 1, 1, 1]]
texts with unseen words:
[[1, 1, 1, 2, 1], [2, 1, 1, 2, 1]]


In [115]:
# `import urllib` alone does not load the `request` submodule; import it
# explicitly so this cell does not depend on another library having done so.
import urllib.request

# Local cache location and remote source of the sarcasm headlines dataset.
json_local = os.path.join('data', 'sarcasm.json')
json_url = 'https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json'

# Download only once; later runs reuse the cached file.
if not os.path.exists(json_local):
    # urlretrieve does not create missing parent directories,
    # so make sure the target folder exists first
    os.makedirs(os.path.dirname(json_local), exist_ok=True)
    urllib.request.urlretrieve(json_url, json_local)

In [36]:
import json

# Read with an explicit encoding: without it open() falls back to the
# platform's locale codec, which is not necessarily UTF-8.
with open(json_local, encoding='utf-8') as file:
    data = json.load(file)

# Quick look at the container type, the record count and two sample records.
print(type(data), len(data))
print(data[0])
print(data[20000])

<class 'list'> 26709
{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}
{'article_link': 'https://www.theonion.com/pediatricians-announce-2011-newborns-are-ugliest-babies-1819572977', 'headline': 'pediatricians announce 2011 newborns are ugliest babies in 30 years', 'is_sarcastic': 1}


In [108]:
%%time

# this one-liner is not as fast as the dumb way,
# most likely due to generating a large intermediate nested list
# urls, hls, labels = zip(*[(e['article_link'], e['headline'], e['is_sarcastic']) for e in data])

urls, hls, labels = [], [], []

for e in data:
    urls.append(e['article_link'])
    hls.append(e['headline'])
    labels.append(e['is_sarcastic'])

CPU times: total: 15.6 ms
Wall time: 16.1 ms


In [117]:
# Fit a fresh tokenizer on the headlines (no num_words limit this time).
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(hls)

vocab = tokenizer.word_index
print('number of words:', len(vocab))

# preview only the first 22 (word, index) entries of the vocabulary
samp = dict(list(vocab.items())[:22])
print(samp)

seqs = tokenizer.texts_to_sequences(hls)

# first two headlines next to their integer encodings
print(hls[:2])
print(seqs[:2])

# maxlen is not passed here; zeros are appended after each sequence
seqs = pad_sequences(seqs, padding='post')

print(seqs.shape)
print(seqs[:2])

number of words: 29657
{'<OOV>': 1, 'to': 2, 'of': 3, 'the': 4, 'in': 5, 'for': 6, 'a': 7, 'on': 8, 'and': 9, 'with': 10, 'is': 11, 'new': 12, 'trump': 13, 'man': 14, 'from': 15, 'at': 16, 'about': 17, 'you': 18, 'this': 19, 'by': 20, 'after': 21, 'up': 22}
["former versace store clerk sues over secret 'black code' for minority shoppers", "the 'roseanne' revival catches up to our thorny political mood, for better and worse"]
[[308, 15115, 679, 3337, 2298, 48, 382, 2576, 15116, 6, 2577, 8434], [4, 8435, 3338, 2746, 22, 2, 166, 8436, 416, 3112, 6, 258, 9, 1002]]
(26709, 40)
[[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [    4  8435  3338  2746    22     2   166  8436   416  3112     6   258
      9  1002     0     0     0     0     0     0     0     0     0     0
      0     0     0     