In [1]:
from packages import *
import pandas as pd
import tensorflow as tf
from datetime import datetime, timedelta
import time
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import *
from tensorflow.data.experimental import AUTOTUNE
from sklearn.metrics import f1_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
import tensorflow.keras.backend as K
%load_ext autoreload
%autoreload 2

# Model
We will be building our models in this notebook.

Some configuration required for tensorflow.

In [2]:
tf.config.gpu.set_per_process_memory_growth(True)
tf.config.gpu.set_per_process_memory_fraction(.40)
tf.keras.backend.set_image_data_format('channels_last')

## Dataset

In [3]:
glove_dim = 300
max_length = 300
chunk_count = 100
random_seed = np.random.randint(0, 1000)
splits = ['train', 'val', 'test'] #80:10:10

In [4]:
tf_text_filenames = [os.path.join(DATASET_DIR, 'preprocessed', 'tfrecord', 'xext',  #xext temporarily
                                  'review-text-{:02d}.tf'.format(i)) for i in range(chunk_count)]
tf_ix_filenames = [os.path.join(DATASET_DIR, 'preprocessed', 'tfrecord', 'ix', 
                                  'review-ix-{:02d}.tf'.format(i)) for i in range(chunk_count)]
all_stars = (load_data({'review': ['stars']})['review']['stars'] - 1).astype(np.int32)# this is so that stars are 0-indexed
N = {}
stars_chunked = all_stars.reshape((chunk_count, 6685900//chunk_count))

In [5]:
class_weights = compute_class_weight('balanced', [0, 1, 2, 3, 4], all_stars)

In [6]:
tf_text, tf_ix, stars= {}, {}, {}

In [7]:
tf_text['train'], tf_text_filenames_val_test, tf_ix['train'], tf_ix_filenames_val_test, stars['train'], stars_val_test \
= train_test_split(tf_text_filenames, tf_ix_filenames, stars_chunked, random_state=random_seed, test_size = .4)

In [8]:
tf_text['val'], tf_text['test'], tf_ix['val'], tf_ix['test'], stars['val'], stars['test'] \
= train_test_split(tf_text_filenames_val_test, tf_ix_filenames_val_test, stars_val_test, random_state=random_seed, test_size = .5)

In [9]:
for split in splits:
    stars[split] = stars[split].reshape(np.product(stars[split].shape))
    N[split] = len(stars[split])

In [10]:
print('({}:{}:{})'.format(len(tf_text['train']), len(tf_text['val']), len(tf_text['test'])))

(60:20:20)


In [11]:
glove_lookup = load_pickle(os.path.join(GLOVE_DIR, 'glove-{}D-byte-float32.pkl'.format(glove_dim)))
keys_to_ix = load_pickle(os.path.join(GLOVE_DIR, 'glove-byte-keys_to_ix.pkl'))
ix_to_key = {value: key for key, value in keys_to_ix.items()}

In [12]:
unk_vector = np.mean(np.array(list(glove_lookup.values())), axis=0)
null_vector = np.zeros(glove_dim)
glove_lookup[UNK_KEY.encode('ascii')] = unk_vector #if using byte glove dict
glove_lookup[NULL_KEY.encode('ascii')] = null_vector

In [13]:
oops = 0
glove_lookup_array = []
for i in range(len(ix_to_key)):
    if ix_to_key[i] not in glove_lookup:
        oops+=1
    glove_lookup_array.append(glove_lookup.get(ix_to_key[i], null_vector))
glove_lookup_array = np.array(glove_lookup_array, dtype=np.float32)
print('oops {} times.'.format(oops))

oops 1 times.


In [14]:
@tf.function
def _parse_function(proto, to_ix):
    # define your tfrecord again. Remember that you saved your image as a string.
    keys_to_features = {'review': tf.io.FixedLenFeature([300,], tf.int64) if to_ix else tf.io.FixedLenFeature([300,], tf.string),}
    
    # Load one example
    parsed_features = tf.io.parse_single_example(proto, keys_to_features)
    
    return parsed_features 

In [15]:
@tf.function
def embed(tensor):
    return tf.gather(glove_lookup_array, tensor)

In [16]:
@tf.function
def add_channel(tensor):
    return tf.expand_dims(tensor, -1)

In [17]:
def make_dataset(batch_size, embedded, to_ix=False, channelize=True):      
    shuffle_buffer_size = batch_size * 10
    prefetch_buffer_size = 1
    if embedded or to_ix: 
        files = tf_ix
        to_ix = True
    else:
        files = tf_text
    dataset = {}
    num_batches = {}
    for split in splits:
        dataset[split] = tf.data.TFRecordDataset(files[split])
        stars_dataset = tf.data.Dataset.from_tensor_slices(stars[split])
        dataset[split] = dataset[split].map(lambda x: _parse_function(x, to_ix)['review'], num_parallel_calls=AUTOTUNE)
        if embedded:
            dataset[split] = dataset[split].map(embed, num_parallel_calls=AUTOTUNE)
        if channelize:
            dataset[split] = dataset[split].map(add_channel, num_parallel_calls=AUTOTUNE)
        dataset[split] = tf.data.Dataset.zip((dataset[split], stars_dataset))
        dataset[split] = dataset[split].shuffle(shuffle_buffer_size)
        dataset[split] = dataset[split].map(lambda x, y: ({'review': x}, {'stars': y}))
        dataset[split] = dataset[split].batch(batch_size)
        dataset[split] = dataset[split].prefetch(prefetch_buffer_size)
        num_batches[split] = N[split] // batch_size
    print('num_batches: {}, batch_size: {}, shuffle_buffer_size: {}, prefetch_buffer_size: {}'.format(num_batches, batch_size, 
                                                                                 shuffle_buffer_size, prefetch_buffer_size))
    return dataset, num_batches

# Models

### BabyBlueberry
Convolutional Model with GLoVe embeddings.

In [29]:
class BabyBlueberry:

    def build(input_shape=(max_length, glove_dim, 1), star_units=5):
        
        inputs = Input(shape=input_shape, name='review')
        
        lstm = LSTM(10, name='lstm_1')(tf.squeeze(inputs,axis=-1))
        
        x_3 = Conv2D(300, (3, glove_dim), activation='relu', name='conv_3', padding='valid', kernel_regularizer='l2')(inputs)
#         x_3 = GlobalMaxPooling2D(name='pool_3')(x_3)
        x_3 = MaxPooling2D(pool_size=(max_length - 2, 1), name='pool_3')(x_3)

        
        
        x_4 = Conv2D(300, (4, 300), activation='relu', name='conv_4', padding='valid', kernel_regularizer='l2')(inputs)
#         x_4 = GlobalMaxPooling2D(name='pool_4')(x_4)
        x_4 = MaxPooling2D(pool_size=(max_length - 3, 1), name='pool_4')(x_4)

        
        x_5 = Conv2D(300, (5, 300), activation='relu', name='conv_5', padding='valid', kernel_regularizer='l2')(inputs)
#         x_5 = GlobalMaxPooling2D(name='pool_5')(x_5) 
        x_5 = MaxPooling2D(pool_size=(max_length - 4, 1), name='pool_5')(x_5)
    
            
        x_6 = Conv2D(300, (6, 300), activation='relu', name='conv_6', padding='valid', kernel_regularizer='l2')(inputs)
#         x_5 = GlobalMaxPooling2D(name='pool_5')(x_6) 
        x_6 = MaxPooling2D(pool_size=(max_length - 5, 1), name='pool_6')(x_6)

        x = Concatenate(name='concat')([x_3, x_4, x_5, x_6, lstm])
        x = Flatten(name='flatten')(x)
        x = Dropout(rate=.4, name='dropout')(x)
        x = Dense(500, activation='relu', name='dense')(x)
#         x = Dropout(rate=.2, name='dropout')(x)
        out = Dense(star_units, activation='softmax', name='stars')(x)
        return tf.keras.models.Model(inputs={'review':inputs}, outputs={'stars': out}, name='Blueberry')
        
    @tf.function
    def loss_fn(truth, logits):
        loss = tf.keras.losses.sparse_categorical_crossentropy(truth, logits)
        return loss

In [30]:
batch_size = 64
dataset, num_batches = make_dataset(batch_size=batch_size, embedded=True, to_ix=True, channelize=True)

num_batches: {'train': 62680, 'val': 20893, 'test': 20893}, batch_size: 64, shuffle_buffer_size: 640, prefetch_buffer_size: 1


In [31]:
def rename(newname):
    def decorator(f):
        f.__name__ = newname
        return f
    return decorator

In [32]:
def one_class_accuracy(class_ix):
    @rename('{}_acc'.format(class_ix+1))
    def single_class_accuracy(y_true, y_pred):
        class_id_true = K.cast(y_true, 'int64')
        class_id_preds = K.argmax(y_pred, axis=-1)
        # Replace class_id_preds with class_id_true for recall here
        accuracy_mask = K.cast(K.equal(class_id_preds, class_ix), 'int32')
        class_acc_tensor = K.cast(K.equal(class_id_true, class_id_preds), 'int32') * accuracy_mask
        class_acc = K.sum(class_acc_tensor) / K.maximum(K.sum(accuracy_mask), 1)
        return K.cast(K.ones(batch_size, dtype='float64') * (class_acc / batch_size), dtype='float32')
    return single_class_accuracy

In [33]:
baby = BabyBlueberry.build(input_shape=(300,300,1))

W0606 17:17:51.333639 139862612498240 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f33f8319358>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


ValueError: A `Concatenate` layer requires inputs with matching shapes except for the concat axis. Got inputs shapes: [(None, 1, 1, 300), (None, 1, 1, 300), (None, 1, 1, 300), (None, 1, 1, 300), (None, 10)]

In [None]:
adam = tf.keras.optimizers.Adam()

In [28]:
?tf.squeeze

[0;31mSignature:[0m [0mtf[0m[0;34m.[0m[0msqueeze[0m[0;34m([0m[0minput[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mname[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/anaconda3/envs/yelp-nlu_3.6/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py
[0;31mType:[0m      function


In [None]:
star_metrics = [one_class_accuracy(i) for i in range(5)]
star_metrics.append(tf.keras.metrics.sparse_categorical_accuracy)

In [None]:
# plot_model(baby, show_shapes=False, show_layer_names=True, to_file='conv-model.png',rankdir='LR' )

In [None]:
baby.summary()

In [None]:
baby.compile(optimizer=adam, metrics={'stars': ['sparse_categorical_accuracy']} ,loss={'stars': BabyBlueberry.loss_fn})
log_dir = os.path.join(SRC_DIR, 'logs', 'SuperBlueberry-{}'.format(datetime.fromtimestamp(time.time()).strftime('%H-%M-%S_%m-%d')))
tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=False, update_freq='batch')

In [27]:
for x, y in dataset['train']:break
truth = y['stars']
pred = baby(x['review'])['stars']

In [41]:
x = np.arange(12)

In [None]:
x.fldd

In [42]:
predictions, truth = [], []
for x, y in dataset['val'].take(100):
    pred = baby.predict_on_batch(x)
    truth.append(y['stars'].numpy())
    predictions.append(tf.argmax(pred['stars'], axis=-1).numpy())
predictions=np.array(predictions).flatten()
truth = np.array(truth).flatten()

In [44]:
print(classification_report(truth, predictions))

              precision    recall  f1-score   support

           0       0.67      0.78      0.72        85
           1       0.53      0.20      0.29        46
           2       0.31      0.34      0.32        77
           3       0.38      0.29      0.33       133
           4       0.74      0.83      0.78       299

    accuracy                           0.60       640
   macro avg       0.52      0.49      0.49       640
weighted avg       0.59      0.60      0.59       640



In [28]:
baby.fit(dataset['train'].take(100),
         epochs=5,
         validation_data=dataset['val'].take(100),
         callbacks=[tb], class_weight=class_weights)

Epoch 1/10
  49291/Unknown - 7687s 156ms/step - loss: 1.0619 - 1_acc: 0.1504 - 2_acc: 0.0721 - 3_acc: 0.1093 - 4_acc: 0.2192 - 5_acc: 0.4379 - sparse_categorical_accuracy: 0.6167

KeyboardInterrupt: 

In [None]:
%%time
for i in dataset.take(100):
    pass

## Models

### JingleBell

In [18]:
class JingleBell:
    
    def build(star_units=5):
        inputs = Input(shape=(max_length, glove_dim), name='review')
        x = Flatten(name='flatten')(inputs)
        out = Dense(5, activation='softmax', name='stars', kernel_regularizer='l1')(x)
        return tf.keras.models.Model(inputs={'review':inputs}, outputs={'stars': out}, name='JingleBell')


In [19]:
dataset = make_dataset(embedded=True)

In [143]:
bell = JingleBell.build()

In [23]:
Z=[]
for x in dataset['train'].take(1000):
    Z.append(x)

In [32]:
for i in range(len(X)):
    assert(np.array_equal(X[i][1]['stars'], Z[i][1]['stars']))

In [21]:
bell.compile(optimizer='adam', loss='sparse_categorical_crossentropy', class_weights=class_weights,
             metrics=['accuracy'])
log_dir = os.path.join(SRC_DIR, 'logs', '{}'.format(datetime.fromtimestamp(time.time()).strftime('%H-%M-%S_%Y-%m-%d')))
tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=False, update_freq='batch')

NameError: name 'bell' is not defined

In [145]:
bell.fit(dataset['train'].take(num_batches//100), epochs=10, validation_data= dataset['val'].take(100), callbacks=[tb])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f74fa4e6390>