# Lab 1: Tokenizer Basics

When working with text, we need to represent it numerically so the computer can process it. We do this by tokenizing the sentences, mapping each word to a number (creating a vocabulary / word_index).

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Sample corpus: two short sentences; the second one contains a comma
sentences = ["I love my dog",
             "I, love my cat"]

# Create the tokenizer, configured with num_words=100
tokenizer = Tokenizer(num_words=100)

# Build the vocabulary from the sample sentences
tokenizer.fit_on_texts(sentences)

# Inspect the word -> integer mapping learned from the corpus
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}


The `num_words` parameter is used to limit the size of the vocabulary that is used, but it doesn't affect the generated word index for this data yet.

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Extended corpus: a third sentence is added, and "." / "!" are included
# to show how the tokenizer treats punctuation and capitalization
sentences = ["I love my dog",
             "I. love my cat",
             "You love my dog!"]

# Create a fresh tokenizer and fit it on the extended corpus
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# The printed mapping has lowercase keys only and no punctuation entries
word_index = tokenizer.word_index
print(word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


Punctuation is ignored, and all text is converted to lowercase by default.

# Lab 2: Generating Sequences and Padding
After generating the word_index, we apply it to **convert text to sequences** (sentences to sequences of numbers), because a model can only understand numbers. The sequences are then **padded** so they all have the same length, since machine learning models require all inputs to be the same size.

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample corpus; the last sentence is longer than the others, which matters for padding
sentences = ["I love my dog",
             "I, love my cat",
             "You love my dog!",
             "Do you think my dog is amazing?"]

# Initialize the tokenizer with an out-of-vocabulary token;
# in the printed word index "<OOV>" is assigned index 1
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")

# Use the tokenizer to create the vocabulary of the corpus
tokenizer.fit_on_texts(sentences)

# Show the vocabulary (word -> index)
print(f'\nWord index: {tokenizer.word_index}')

# Convert each sentence into a list of integer tokens using the word index
sequences = tokenizer.texts_to_sequences(sentences)

print(f"\nSequences: {sequences}")


Word index: {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

Sequences: [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]


In [17]:
# Pad the sequences to the same length (maxlen=5).
# In the printed result the 4-token sentences get a leading 0, and the
# 7-token sentence keeps only its last 5 tokens.
padded_sequences = pad_sequences(sequences, maxlen=5)

print('\nPadded sequence:')
print(padded_sequences)


Padded sequence:
[[ 0  5  3  2  4]
 [ 0  5  3  2  7]
 [ 0  6  3  2  4]
 [ 9  2  4 10 11]]


We set the oov_token earlier so that words the tokenizer has never seen before can still be tokenized, as shown below.

In [18]:
# New sentences containing words ("really", "loves", "manatee") that are not
# in the word index built from the earlier corpus
sentences2 = ["i really love my dog",
             "my dog loves my manatee"]

# Reuse the already-fitted tokenizer; in the printed sequences the unseen
# words appear as 1, the index of "<OOV>"
sequences2 = tokenizer.texts_to_sequences(sentences2)

print("\nSequences:")
print(sequences2)

# Pad to length 10; the printed rows are filled with leading zeros
padded_sequences2 = pad_sequences(sequences2, maxlen=10)
print("\nPadded sequences:")
print(padded_sequences2)


Sequences:
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]

Padded sequences:
[[0 0 0 0 0 5 1 3 2 4]
 [0 0 0 0 0 2 4 1 2 1]]


# Lab 3: Tokenizing the Sarcasm Dataset

In [43]:
# Download the dataset
!wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

--2023-10-21 00:50:10--  https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.126.207, 172.217.218.207, 2a00:1450:4013:c01::cf, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.126.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘sarcasm.json.1’


2023-10-21 00:50:11 (9.02 MB/s) - ‘sarcasm.json.1’ saved [5643545/5643545]



In [44]:
# Load the downloaded dataset into memory
import json

# JSON text is UTF-8 by specification; state the encoding explicitly so the
# read does not depend on the platform's default text encoding
with open('sarcasm.json', encoding='utf-8') as f:
    dataset = json.load(f)

In [45]:
# look at the data

# example record with is_sarcastic == 0 (not sarcastic)
print(dataset[0])

# example record with is_sarcastic == 1 (sarcastic)
print(dataset[20000])

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}
{'article_link': 'https://www.theonion.com/pediatricians-announce-2011-newborns-are-ugliest-babies-1819572977', 'headline': 'pediatricians announce 2011 newborns are ugliest babies in 30 years', 'is_sarcastic': 1}


In [46]:
# Split the records into three parallel lists (one entry per dataset item)
# so the tokenizer and other tools can consume each field directly
articles = [record['article_link'] for record in dataset]
sentences = [record['headline'] for record in dataset]
labels = [record['is_sarcastic'] for record in dataset]

So, let's process it: tokenize the headlines and pad the resulting sequences.

In [47]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the tokenizer with an out-of-vocabulary token
tokenizer = Tokenizer(oov_token="<OOV>")

# Build the vocabulary from all headlines
tokenizer.fit_on_texts(sentences)

# Report the vocabulary size
print(f'\nTotal generated word index: {len(tokenizer.word_index)}')

# NOTE(review): prints the same value as the line above; kept so the recorded output still matches
print('Total of word index: ', len(tokenizer.word_index))

# Convert every headline into a list of integer tokens
sequences = tokenizer.texts_to_sequences(sentences)

# Pad the sequences; with padding='post' the zeros come after the tokens,
# as the printed sample shows
padded = pad_sequences(sequences, padding='post')

# Arbitrary sample position used to inspect one headline and its padded sequence
index=2002
print('\nsample headline: ', sentences[index])
print('\npadded Sequence:', padded[index])

# Shape of the padded array (the recorded run printed (26709, 40))
print(f'\nShape of sentence after padded: {padded.shape}')


Total generated word index: 29657
Total of word index:  29657

sample headline:  what being a christian means to me: don't worry about the rules; just love

padded Sequence: [  33  115    7 1034 1304    2  268  184 2597   17    4  621   36  144
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]

Shape of sentence after padded: (26709, 40)


# Assignment C3 W1

We use the BBC dataset ([BBC News Classification Dataset](https://www.kaggle.com/c/learn-ai-bbc/overview)), which contains 2225 examples of news articles with their respective categories (labels).

In [24]:
!wget --no-check-certificate \
    https://storage.googleapis.com/learning-datasets/bbc-text.csv \
    -O /tmp/bbc-text.csv

--2023-10-21 00:26:53--  https://storage.googleapis.com/learning-datasets/bbc-text.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.126.207, 108.177.127.207, 172.217.218.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.126.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5057493 (4.8M) [text/csv]
Saving to: ‘/tmp/bbc-text.csv’


2023-10-21 00:26:54 (7.76 MB/s) - ‘/tmp/bbc-text.csv’ saved [5057493/5057493]



In [25]:
# Peek at the raw file: read the header line and the first record, then show both
with open('/tmp/bbc-text.csv', 'r') as csvfile:
    header_line = csvfile.readline()
    first_record = csvfile.readline()

print(f'The first line (header) looks like this\n {header_line}')
print(f'Each data point looks like this\n {first_record}')

The first line (header) looks like this
 category,text

Each data point looks like this
 tech,tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essential

Next, we want to remove stopwords from the sentences to get better predictions.

In [26]:
# Stopwords list from https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js
# Built once at definition time and stored as a frozenset so that each
# membership test is a hash lookup instead of a scan over the whole list.
_STOPWORDS = frozenset(["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"])


def remove_stopwords(sentence):
    '''
    Remove stopwords from a sentence.

    The sentence is lowercased and split on single space characters; every
    piece that exactly matches a stopword is dropped and the remaining
    pieces are joined back with single spaces. Because the split is on a
    single space, runs of spaces in the input survive in the output, and a
    word with attached punctuation (e.g. "the,") is not treated as a stopword.

    Args:
    sentence (string): sentence to remove stopwords from

    Returns:
    sentence (string): sentence in lowercase without stopwords
    '''
    sentence = sentence.lower()

    # Split on ' ' (not on arbitrary whitespace) to keep the original spacing behaviour
    words = sentence.split(' ')
    cleaned_words = [word for word in words if word not in _STOPWORDS]

    return " ".join(cleaned_words)

Parse the data so it can be processed further.


In [27]:
import csv

# Parallel lists: one cleaned article text and one category label per CSV row
sentences = []
labels = []

# newline='' is what the csv module asks for when it is given a file object,
# so that line endings inside quoted fields are parsed correctly
with open('/tmp/bbc-text.csv', 'r', newline='') as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=',')

    # skip the header row (category,text)
    next(csv_reader)

    for row in csv_reader:
        # column 0 is the category, column 1 is the article text
        labels.append(row[0])
        cleaned_sentences = remove_stopwords(row[1])
        sentences.append(cleaned_sentences)

In [41]:
# tokenize sentences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer with an out-of-vocabulary token
tokenizer = Tokenizer(oov_token="<OOV>")

# Build the vocabulary from the stopword-free articles
tokenizer.fit_on_texts(sentences)

# Report the vocabulary size
word_index = tokenizer.word_index
print("Total of word index: ", len(word_index))

# Convert each article to integer tokens, then pad with padding='post'
# (the printed first row ends in zeros)
sequences = tokenizer.texts_to_sequences(sentences)
padded_sentences = pad_sequences(sequences, padding='post')

# Show the padded form of the first article
print(f'\nPadded the first sentence \n', padded_sentences[0])

Total of word index:  29714

Padded the first sentence 
 [  96  176 1157 ...    0    0    0]


After tokenizing the sentences, we also need to tokenize the labels, because the labels are text too.

In [40]:
# Separate tokenizer for the labels, created without an OOV token
label_tokenizer = Tokenizer()
# Build the label vocabulary; the recorded output shows five categories indexed 1-5
label_tokenizer.fit_on_texts(labels)
# Each label becomes a one-element list holding its index (see the printed sample)
labels_sequences = label_tokenizer.texts_to_sequences(labels)

print(f'Word index of labels: {label_tokenizer.word_index}')
print(f'\nThe first ten labels: {labels_sequences[:10]}')

Word index of labels: {'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}

The first ten labels: [[4], [2], [1], [1], [5], [3], [3], [1], [1], [5]]
