## Notes from NLP in TensorFlow, Coursera

### Week 1

In [1]:
import tensorflow 
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Toy corpus used below to demonstrate tokenizing and padding.
sentences = [
    "I love my dog",
    "I love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

In [4]:
# Quantify the text: every distinct word in the sentences gets an integer id.
#   - oov_token is the placeholder used for out-of-vocabulary words.
#   - id 0 is never given to a word; it is reserved for padding.
#   - num_words caps how many of the most frequent words are used when encoding.
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(sentences)

# word -> integer id mapping learned from the sentences
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [5]:
# Encode each sentence as the list of its word ids.
sequences = tokenizer.texts_to_sequences(sentences)

# Bring every sequence to exactly 5 ids. With pad_sequences' defaults the zeros
# go in front of short sequences, and sequences longer than maxlen lose their
# leading ids.
padded = pad_sequences(sequences, maxlen=5)

print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:")
print(padded)


Word Index =  {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

Sequences =  [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

Padded Sequences:
[[ 0  5  3  2  4]
 [ 0  5  3  2  7]
 [ 0  6  3  2  4]
 [ 9  2  4 10 11]]


In [6]:
# Encode sentences containing words the tokenizer was never fit on; those
# words come back as the <OOV> id instead of being dropped.
test_data = [
    "i really love my dog",
    "my dog loves my manatee",
]
test_seq = tokenizer.texts_to_sequences(test_data)

# Front-pad the encoded test sentences with zeros to a length of 10.
padded = pad_sequences(test_seq, maxlen=10)

print("\nTest Sequence = ", test_seq)
print("\nPadded Test Sequence: ")
print(padded)


Test Sequence =  [[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]

Padded Test Sequence: 
[[0 0 0 0 0 5 1 3 2 4]
 [0 0 0 0 0 2 4 1 2 1]]


### Headline Sarcasm Detection 
#### Dataset: https://rishabhmisra.github.io/publications/#datasets
#### JSON file loaded below: https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json

In [9]:
import json
import os

# Location of the sarcasm headline dataset. Set the SARCASM_JSON environment
# variable to point at your own copy; when it is unset (or empty) the original
# local path is used. The file can be downloaded from
# https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
sarcasm_path = (
    os.environ.get("SARCASM_JSON")
    or "/Users/fujinhuizi/Documents/GitHub/data/sarcasm.json"
)

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(sarcasm_path, "r", encoding="utf-8") as f:
    datastore = json.load(f)

# Split the records into parallel lists: headline text, sarcasm label, and
# source article link. Index i refers to the same record in all three lists.
sentences = [item["headline"] for item in datastore]
labels = [item["is_sarcastic"] for item in datastore]
urls = [item["article_link"] for item in datastore]

In [10]:
# New tokenizer for the headline corpus; "<OOV>" stands in for any word that
# was not seen while fitting. No num_words cap is passed here.
tokenizer = Tokenizer(oov_token="<OOV>")

# Build the vocabulary index from the headlines. Ids follow word frequency:
# the smaller the id, the more frequent the word.
tokenizer.fit_on_texts(sentences)

In [21]:
# Vocabulary learned from the headlines: report its size and its first 20
# entries (the OOV token followed by the most frequent words).
word_index = tokenizer.word_index
print(len(word_index))
print(list(word_index)[:20])

# Encode every headline, then pad with zeros at the end ('post') so that all
# rows of the resulting matrix have the same length.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding="post")

# Spot-check one headline next to its padded encoding, then show the overall
# matrix shape.
print(sentences[3])
print(padded[3])
print(padded.shape)

29657
['<OOV>', 'to', 'of', 'the', 'in', 'for', 'a', 'on', 'and', 'with', 'is', 'new', 'trump', 'man', 'from', 'at', 'about', 'you', 'this', 'by']
boehner just wants wife to listen, not come up with alternative debt-reduction ideas
[1485   36  224  400    2 1832   29  319   22   10 2924 1393 6969  968
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
(26709, 40)
