<a href="https://colab.research.google.com/github/ilopezro/cse143/blob/jen-assg2/Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **CSE 143: Assignment 2** <br>
Professor Jeffrey Flannagan <br>
1/30/2020 
<br><br>
Isai Lopez Rodas <br>
ilopezro 
<br><br>
Jennifer Dutra <br>
jrdutra
<br><br>
Khang Tran <br>
khvitran

# Setup <br>
Taken directly from Professor's Github

In [0]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -q -U tensorflow-addons
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [0]:
tf.random.set_seed(42)

In [0]:
import tensorflow_datasets as tfds

#splitting of training and dev data
train_data, dev_data, test_data = tfds.load("imdb_reviews", split=('train[:60%]', 'train[60%:]', 'test'), as_supervised=True)

In [0]:
train_size = len(list(train_data))

In [0]:
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

In [0]:
from collections import Counter

# preprocessing training data 
vocabulary = Counter()
for X_batch, y_batch in train_data.batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))
        
#preprocessing dev data
devVocabulary = Counter()
for X_batch, y_batch in dev_data.batch(32).map(preprocess):
    for review in X_batch:
        devVocabulary.update(list(review.numpy()))

In [0]:
print(vocabulary.most_common()[:3])
print(devVocabulary.most_common()[:3])

[(b'<pad>', 128536), (b'the', 36691), (b'a', 22997)]
[(b'<pad>', 85653), (b'the', 24446), (b'a', 15567)]


In [0]:
print(len(vocabulary))
print(len(devVocabulary))

41624
34138


In [0]:
vocab_size = 10000
truncated_vocabulary = [
    word for word, count in vocabulary.most_common()[:vocab_size]]
truncated_dev_vocabulary = [
    word for word, count in devVocabulary.most_common()[:vocab_size]]

In [0]:
word_to_id_train = {word: index for index, word in enumerate(truncated_vocabulary)}
word_to_id_dev = {word: index for index, word in enumerate(truncated_dev_vocabulary)}

In [0]:
words = tf.constant(truncated_vocabulary)
dev_words = tf.constant(truncated_dev_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
word_ids_dev = tf.range(len(truncated_dev_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
vocab_init_dev = tf.lookup.KeyValueTensorInitializer(dev_words, word_ids_dev)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)
dev_table = tf.lookup.StaticVocabularyTable(vocab_init_dev, num_oov_buckets)

In [0]:
def encode_words(X_batch, y_batch):
    return table.lookup(X_batch), y_batch
  
def encode_words_dev(X_batch, y_batch):
    return dev_table.lookup(X_batch), y_batch

train_set = train_data.repeat().batch(512).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)
dev_set = dev_data.batch(512).map(preprocess)
dev_set = dev_set.map(encode_words_dev).prefetch(1)


In [0]:
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)
for X_batch, y_batch in dev_set.take(1):
    print(X_batch)
    print(y_batch)

In [0]:
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.SimpleRNN(128),
    # keras.layers.Dropout(0.1),
    # we can also call dropout inside simpleRNN and change activation of 
    # simple rnn inside. the default = tanh
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])
history = model.fit(train_set, steps_per_epoch=train_size // 32, epochs=5, validation_data=dev_set)

Train for 468 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
