## Tokenize

In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import csv

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Toy corpus with mixed case and punctuation; the printed word_index shows
# that "i" / "I," and "passion!" end up as lowercase, punctuation-free entries.
sentences = [
    'i love statistics',
    'I, love to code',
    'Learning Data Science is my passion!'
]

tokenizer = Tokenizer(num_words = 100)        # num_words caps how many of the most frequent words are used when encoding texts (see Keras Tokenizer docs)
tokenizer.fit_on_texts(sentences)             # builds the vocabulary from the sentences; no sequences are produced yet
word_index = tokenizer.word_index             # dict mapping each word to its integer index (indices start at 1)
print(word_index)

{'i': 1, 'love': 2, 'statistics': 3, 'to': 4, 'code': 5, 'learning': 6, 'data': 7, 'science': 8, 'is': 9, 'my': 10, 'passion': 11}


## OOV, Sequences, Padding

In [None]:
# Training corpus for the OOV / sequence / padding demo.
sentences = [
    'i love statistics',
    'I, love to code',
    'Learning Data Science is my passion!',
    'I am a Computer Science engineer, and I am studying Data Science.'
]

# oov_token adds a placeholder entry to the vocabulary (index 1 in the printed
# word_index) that stands in for any word the tokenizer was not fitted on.
tokenizer = Tokenizer(num_words = 100, oov_token="<OOV>")       # words present in test but missing in train will be marked OOV
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)             # replaces each word with its integer index from word_index

padded = pad_sequences(sequences, maxlen=5)                     # every row becomes length 5; with the defaults, zeros are added at the front and longer sequences keep only their last 5 tokens

print("\nWord Index = " , word_index)
print("\nSequences = " , sequences)
print("\nPadded Sequences:")
print(padded)


# Try with words that the tokenizer wasn't fit to
test_data = [
    'i really love statistics',                               # "really" is not in the vocabulary -> encoded as the OOV index
    'Learning Data Science and NLP is my passion'             # "NLP" is not in the vocabulary -> encoded as the OOV index
]

test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)

# NOTE: this overwrites `padded` from above and uses a different maxlen (10 vs 5).
padded = pad_sequences(test_seq, maxlen=10)
print("\nPadded Test Sequence: ")
print(padded)


Word Index =  {'<OOV>': 1, 'i': 2, 'science': 3, 'love': 4, 'data': 5, 'am': 6, 'statistics': 7, 'to': 8, 'code': 9, 'learning': 10, 'is': 11, 'my': 12, 'passion': 13, 'a': 14, 'computer': 15, 'engineer': 16, 'and': 17, 'studying': 18}

Sequences =  [[2, 4, 7], [2, 4, 8, 9], [10, 5, 3, 11, 12, 13], [2, 6, 14, 15, 3, 16, 17, 2, 6, 18, 5, 3]]

Padded Sequences:
[[ 0  0  2  4  7]
 [ 0  2  4  8  9]
 [ 5  3 11 12 13]
 [ 2  6 18  5  3]]

Test Sequence =  [[2, 1, 4, 7], [10, 5, 3, 17, 1, 11, 12, 13]]

Padded Test Sequence: 
[[ 0  0  0  0  0  0  2  1  4  7]
 [ 0  0 10  5  3 17  1 11 12 13]]


## Padded sequences for a larger dataset (JSON file)

In [None]:
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /content/sarcasm.json
  
import json

with open("/content/sarcasm.json", 'r') as f:
    datastore = json.load(f)

--2021-08-05 03:33:48--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.135.128, 74.125.142.128, 74.125.195.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.135.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/content/sarcasm.json’


2021-08-05 03:33:48 (63.2 MB/s) - ‘/content/sarcasm.json’ saved [5643545/5643545]



In [None]:
# Peek at the first five records; each has 'article_link', 'headline' and 'is_sarcastic'.
datastore[:5]

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
  'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-w

In [None]:
# Number of records in the dataset.
len(datastore)

26709

In [None]:
# Headline text of the first record.
datastore[0]['headline']

"former versace store clerk sues over secret 'black code' for minority shoppers"

In [None]:
# Split the records into three parallel lists, in the same order as datastore:
# headline text, sarcasm label and source URL.
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]
urls = [record['article_link'] for record in datastore]

In [None]:
# First five extracted headlines.
sentences[:5]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way']

In [None]:
# Fit a tokenizer on all headlines. No num_words argument is passed here;
# "<OOV>" is the placeholder for words not seen during fitting.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

# word -> integer index mapping, and every headline encoded with it.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
# Number of entries in the fitted vocabulary.
len(word_index)

29657

In [None]:
word_index['to']        # integer index assigned to 'to'; word_index is a dict mapping word -> index

2

In [None]:
# First headline as raw text.
sentences[0]

"former versace store clerk sues over secret 'black code' for minority shoppers"

In [None]:
# padding='post' puts the zeros after each sequence instead of before it.
# No maxlen is passed, and the printed shape shows 40 columns.
padded = pad_sequences(sequences, padding='post')
print(padded[0])                    # padded sequence of the 1st sentence
print(padded.shape)                 # (number of sentences, padded length)

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


**Pipeline**

Sentences -> Tokenizer (with OOV token) -> fit_on_texts -> word_index -> texts_to_sequences -> pad_sequences

## Exercise 9

In [None]:
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv \
    -O /content/bbc-text.csv

--2021-08-05 03:53:12--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.142.128, 74.125.195.128, 74.125.199.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.142.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5057493 (4.8M) [text/csv]
Saving to: ‘/content/bbc-text.csv’


2021-08-05 03:53:12 (205 MB/s) - ‘/content/bbc-text.csv’ saved [5057493/5057493]



In [None]:
# Read from the absolute path the download cell saved to, so this cell does
# not depend on the kernel's working directory happening to be /content.
df = pd.read_csv('/content/bbc-text.csv', delimiter=',')
df.shape

(2225, 2)

In [None]:
# Preview the first rows: a 'category' label and the article 'text'.
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [None]:
# Stopword list taken from https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js
# All entries are lowercase; contractions such as "he'd" are kept with their apostrophe.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [None]:
# Read the BBC CSV row by row, collecting the category label and the article
# text with stopwords stripped out.
sentences = []
labels = []

# newline='' is what the csv module requires so that line breaks inside quoted
# fields are parsed correctly; the explicit encoding avoids relying on the
# platform's default text encoding.
with open("/content/bbc-text.csv", 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)                          # skip the header row
    for row in reader:
        labels.append(row[0])             # column 0: category label
        sentence = row[1]                 # column 1: article text
        # Remove each stopword by replacing " word " with a single space, then
        # collapse any double space.
        # NOTE(review): a stopword at the very start or end of the text has no
        # surrounding spaces on both sides and is therefore left in place.
        for word in stopwords:
            token = " " + word + " "
            sentence = sentence.replace(token, " ")
            sentence = sentence.replace("  ", " ")
        sentences.append(sentence)

print(len(sentences))
print(sentences[0])

2225
tv future hands viewers home theatre systems plasma high-definition tvs digital video recorders moving living room way people watch tv will radically different five years time. according expert panel gathered annual consumer electronics show las vegas discuss new technologies will impact one favourite pastimes. us leading trend programmes content will delivered viewers via home networks cable satellite telecoms companies broadband service providers front rooms portable devices. one talked-about technologies ces digital personal video recorders (dvr pvr). set-top boxes like us s tivo uk s sky+ system allow people record store play pause forward wind tv programmes want. essentially technology allows much personalised tv. also built-in high-definition tv sets big business japan us slower take off europe lack high-definition programming. not can people forward wind adverts can also forget abiding network channel schedules putting together a-la-carte entertainment. us networks cable sa

In [None]:
# Fit a fresh tokenizer (with an OOV placeholder) on the stopword-stripped articles.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Vocabulary size.
print(len(word_index))

29714


In [None]:
# Encode every article as integer indices, then pad with zeros at the front
# (padding='pre'). No maxlen is passed.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='pre')

print(padded[0])        # padded sequence of the first article
print(padded.shape)     # (number of articles, padded length)

[  0   0   0 ... 949  87  87]
(2225, 2442)


In [None]:
# Separate tokenizer for the category labels (no OOV token is configured).
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

# Label -> integer mapping; the printed mapping starts at 1.
# texts_to_sequences returns one list per label, hence the nested [[4], [2], ...] output.
label_word_index = label_tokenizer.word_index
label_seq = label_tokenizer.texts_to_sequences(labels)

print(label_seq[:5])          # labels of 1st 5 sentences
print(label_word_index)

[[4], [2], [1], [1], [5]]
{'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}
