In [1]:
from packages import *
import pandas as pd
import tensorflow as tf
from datetime import datetime, timedelta
import time
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import *
from tensorflow.data.experimental import AUTOTUNE
from sklearn.metrics import f1_score, r2_score
from sklearn.model_selection import train_test_split
from collections import Counter
%load_ext autoreload
%autoreload 2

# Model
We will be building our models in this notebook.

Some configuration required for tensorflow.

In [2]:
epsilon = 1e-7
fp = 'float32'
tf.config.gpu.set_per_process_memory_growth(True)
tf.config.gpu.set_per_process_memory_fraction(.85)
tf.keras.backend.set_floatx(fp)
tf.keras.backend.set_epsilon(epsilon)
tf.keras.backend.set_image_data_format('channels_last')

## Dataset

In [3]:
batch_size = 128
glove_dim = 50
input_shape = (300, glove_dim, 1)
max_length = 300
shuffle_buffer_size = batch_size*4
prefetch_buffer_size = 1
chunk_count = 100 
splits = ['train', 'val', 'test'] #80:10:10

In [4]:
tf_filenames = [os.path.join(DATASET_DIR, 'preprocessed', 'tfrecord', 'review-text-{:02d}'.format(i)) for i in range(chunk_count)] #100 files
np_filenames = [os.path.join(DATASET_DIR, 'preprocessed', 'review-text-{:02d}.npy'.format(i)) for i in range(chunk_count)] #100 files

In [5]:
glove_lookup = load_pickle(os.path.join(GLOVE_DIR, 'glove-{}D-byte-{}.pkl'.format(glove_dim, fp)))

In [6]:
unk_vector = np.mean(np.array(list(glove_lookup.values())), axis=0)
null_vector = np.zeros(glove_dim)
glove_lookup[UNK_KEY.encode('ascii')] = unk_vector #if using byte glove dict
glove_lookup[NULL_KEY.encode('ascii')] = null_vector

In [7]:
stars = (load_data({'review': ['stars']})['review']['stars'] - 1).astype(np.int32) # this is so that stars are 0-indexed

In [9]:
feature_description = {
    'review': tf.io.FixedLenFeature([], tf.string),
}
@tf.function
def _parse_function(example_proto):
    return tf.io.parse_single_example(example_proto, feature_description)

In [14]:
x = list(glove_lookup.values())

In [31]:
keys = list(glove_lookup.keys())

In [32]:
keys.sort()

In [34]:
key_to_ix = {key: i for i, key in enumerate(keys)}

In [42]:
save_pickle(os.path.join(GLOVE_DIR, 'glove-byte-float32_to_ix.pkl'), key_to_ix)

In [None]:
with open(path, 'wb+') as f:
    pickle.dump(data, f)

In [29]:
for dim in [50, 100, 200, 300]:
    glove = load_pickle(os.path.join(GLOVE_DIR, 'glove-{:02d}D-byte-{}.pkl'.format(dim, fp)))
    keys_in_look = np.array(list(glove.keys()))
    assert np.array_equal(keys_in_look, np.array(keys))
    break

In [None]:
393592

In [18]:
tf_embedding = tf.Variable(
    tf.constant(x),
    trainable=False,
    name="Embedding"
)

In [12]:
# def get_glove_embeddings()

In [13]:
def get_glove_embeddings(tensor): # bu mu yavas
    tokens = tensor.numpy()
    embeddings = np.array([glove_lookup[token] for token in tokens])
    return embeddings

def embed(tensor):
    tf_embeddings = tf.py_function(get_glove_embeddings,
                                   inp=[tensor],
                                   Tout=tf.float32
    )
    return tf_embeddings

@tf.function
def channelize(tensor):
    return tf.expand_dims(tensor, -1)

In [14]:
raw_dataset = tf.data.TFRecordDataset(tf_filenames)
parsed_dataset = raw_dataset.map(lambda x: tf.io.parse_tensor(_parse_function(x)['review'], out_type=tf.string), num_parallel_calls=AUTOTUNE)
embedded_dataset = parsed_dataset.map(lambda x: embed(x), num_parallel_calls=AUTOTUNE)
channelized_dataset = embedded_dataset.map(channelize, num_parallel_calls=AUTOTUNE)
stars_dataset = tf.data.Dataset.from_tensor_slices(stars)
dataset = tf.data.Dataset.zip((channelized_dataset, stars_dataset))
dataset = dataset.map(lambda x, y: ({'review': x}, {'stars': y}))
dataset = dataset.batch(batch_size)
# dataset = dataset.prefetch(prefetch_buffer_size)

In [15]:
%%time
for i in dataset: break

CPU times: user 401 ms, sys: 545 ms, total: 946 ms
Wall time: 398 ms


## Models

### JingleBell

In [32]:
class JingleBell:
    
    def build(star_units=5):
        inputs = Input(shape=(max_length, glove_dim), name='review')
        x = Flatten(name='flatten')(inputs)
        out = Dense(5, activation='softmax', name='stars')(x)
        return tf.keras.models.Model(inputs={'review':inputs}, outputs={'stars': out}, name='JingleBell')


In [41]:
bell = JingleBell.build()

In [42]:
bell.compile(optimizer='sgd', loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])
log_dir = os.path.join(SRC_DIR, 'logs', '{}'.format(datetime.fromtimestamp(time.time()).strftime('%H-%M-%S_%Y-%m-%d')))
tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True, update_freq='batch')

In [45]:
bell.fit(dataset, callbacks=[tb])

W0605 21:22:44.855589 140700156294976 training_utils.py:1353] Expected a shuffled dataset but input dataset `x` is not shuffled. Please invoke `shuffle()` on input dataset.
W0605 21:22:44.884484 140700156294976 callbacks.py:1218] TensorBoard Callback will ignore `write_graph=True`when `Model.run_eagerly=True`.`


   1531/Unknown - 481s 314ms/step - loss: 1.3616 - accuracy: 0.4547

KeyboardInterrupt: 

### BabyBlueberry

In [None]:
embedding_layer = layers.Embedding(1000, 32)

In [15]:
class BabyBlueberry:

    def build(input_shape = input_shape, star_units=5):
        inputs = Input(shape=input_shape, name='review')
        x = Conv2D(16, (3, 3), activation='relu', name='block1_conv1', padding='same')(inputs)
        x = Conv2D(16, (3, 3), activation='relu', name='block1_conv2', padding='same')(x)
        x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
        x = Conv2D(32, (3, 3), activation='relu', name='block2_conv1', padding='same')(x)
        x = Conv2D(32, (3, 3), activation='relu', name='block2_conv2', padding='same')(x)
        x = Conv2D(32, (3, 3), activation='relu', name='block2_conv3', padding='same')(x)
        x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
        x = Conv2D(64, (3, 3), activation='relu', name='block3_conv1', padding='same')(x)
        x = Conv2D(64, (3, 3), activation='relu', name='block3_conv2', padding='same')(x)
        x = Conv2D(64, (3, 3), activation='relu', name='block3_conv3', padding='same')(x)
        x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x) 
        x = Conv2D(128, (3, 3), activation='relu', name='block4_conv1', padding='same')(x)
        x = Conv2D(128, (3, 3), activation='relu', name='block4_conv2', padding='same')(x)
        x = Conv2D(128, (3, 3), activation='relu', name='block4_conv3', padding='same')(x)
        x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x) 
        x = Flatten(name='flatten')(x)
        x = Dropout(rate=.2, name='dropout')(x)
        out = Dense(star_units, activation='softmax', name='stars')(x)
        return tf.keras.models.Model(inputs={'review':inputs}, outputs={'stars': out}, name='Blueberry')
        
    @tf.function
    def loss_fn(truth, logits):
        loss = tf.keras.losses.sparse_categorical_crossentropy(truth, logits)
        return loss

In [16]:
baby = BabyBlueberry.build()

In [17]:
adam = tf.keras.optimizers.Adam()

In [18]:
baby.compile(optimizer=adam, metrics={'stars': ['accuracy']} ,loss={'stars': BabyBlueberry.loss_fn})

In [20]:
%%time
for i in dataset.take(100):
    pass

CPU times: user 35.3 s, sys: 40.9 s, total: 1min 16s
Wall time: 30.9 s
