_TODO: Split Data into Train & Test_  
_TODO: Review Word Embeddings_  


- Build models that identify the category of a piece of text using binary categorization ✔️  
- Use word embeddings in your TensorFlow model. ✔️  
- Use LSTMs in your model to classify text for either binary or multi-class categorization.    
- Add RNN and GRU layers to your model.
- Use RNNs, LSTMs, GRUs and CNNs in models that work with text.
- Train LSTMs on existing text to generate text (such as songs and poetry)

## Dataset: News Headlines with Sarcasm

In [4]:
# data wrangling
import os
import pandas as pd
import numpy as np
# Array display settings: show up to 10 items at each edge of a summarized
# axis, allow rows up to 1000 characters wide, and render every float with
# three significant digits.
np.set_printoptions(
    edgeitems=10,
    linewidth=1000,
    formatter={"float": lambda value: "%.3g" % value},
)

# preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# modeling
import tensorflow as tf

## load data
import os
from pathlib import Path

# The dataset is expected under the current user's home directory:
#   ~/datasets/news-headlines-sarcasm/Sarcasm_Headlines_Dataset.json
home = str(Path.home())
base_dir = os.path.join(home, 'datasets/news-headlines-sarcasm')
file_name = 'Sarcasm_Headlines_Dataset.json'
json_file = os.path.join(base_dir, file_name)

# lines=True: the file is parsed as one JSON object per line.
df = pd.read_json(json_file, lines=True)
headlines = df.headline.to_list()
# Labels are taken directly as a NumPy array so `true_values` has a single
# type for the whole notebook (it is the target passed to model.fit).
true_values = df.is_sarcastic.to_numpy()

## preprocessing settings
# Declared before first use so the tokenizer, the padding step and the model's
# Embedding layer (which is built with input_length=max_length) cannot drift
# apart.
vocab_size = 10000    # keep only the most frequent words in the tokenizer
max_length = 40       # fixed sequence length fed to the model
trunc_type = 'post'   # over-long headlines lose tokens from the end
oov_tok = '<UNK>'     # placeholder token for out-of-vocabulary words

## tokenize
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(headlines)
seq = tokenizer.texts_to_sequences(headlines)
# Pad/truncate explicitly to max_length; without maxlen the width would be
# whatever the longest headline happens to be, which only matches the model's
# expected input length by coincidence.
padded_sequences = pad_sequences(seq, maxlen=max_length, truncating=trunc_type)

# Sanity checks: one fixed-width row per headline, one label per row.
assert padded_sequences.shape == (len(headlines), max_length)
assert len(true_values) == len(padded_sequences)

In [5]:
# Inspect the padded token-id matrix produced by pad_sequences: one row per
# headline, zeros on the left where a headline is shorter than the row width.
padded_sequences

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, ...,  679, 3337, 2298,   48,  382, 2576,    1,    6, 2577, 8434],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, ...,   22,    2,  166, 8436,  416, 3112,    6,  258,    9, 1002],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, ..., 1749, 2093,  582, 4719,  221,  143,   39,   46,    2,    1],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, ...,    2, 1832,   29,  319,   22,   10, 2924, 1393, 6969,  968],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, ..., 4720,  908,    1,  623,  594,    5,    4,   95, 1309,   92],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, ...,    0,    0,    0,    0,    0,    0,    1,    4,  365,   73],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, ...,    0,    0,    4, 6970,  351,    6,  461, 4274, 2195, 1486],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0, 

In [6]:
# Embedding-layer dimensions used by the model cell:
#   vocab_size    - number of rows in the embedding table
#   embedding_dim - size of each word vector
#   max_length    - number of tokens per input sequence
vocab_size, embedding_dim, max_length = 10000, 16, 40

# Text-preprocessing options.
# NOTE(review): presumably intended for pad_sequences(truncating=...) and
# Tokenizer(oov_token=...) respectively - confirm they are wired through.
trunc_type, oov_tok = 'post', '<UNK>'

In [7]:
# Baseline binary classifier, assembled layer by layer:
#   token ids -> embeddings -> flattened vector -> small ReLU layer -> sigmoid
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(units=6, activation='relu'))
# A single sigmoid unit pairs with the binary cross-entropy loss below.
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 16)            160000    
_________________________________________________________________
flatten (Flatten)            (None, 640)               0         
_________________________________________________________________
dense (Dense)                (None, 6)                 3846      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 163,853
Trainable params: 163,853
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Number of passes over the data. Set to 4 to match the recorded training run
# below, and passed to fit() so the constant and the call cannot disagree.
num_epoch = 4

# Keep the returned History object so the per-epoch metrics can be inspected
# or plotted later instead of being discarded as a bare cell output.
# NOTE(review): training uses the full dataset; no train/test split or
# validation data is provided here (see the TODO at the top of the notebook).
history = model.fit(
    padded_sequences,
    true_values,
    epochs=num_epoch,
)

Train on 26709 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f96625d5b90>