In [1]:
from packages import *
import tensorflow as tf
from datetime import datetime, timedelta
import time
import pandas as pd
import tensorflow.keras.backend as K
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import *
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import shuffle
%load_ext autoreload
%autoreload 2

# Model
We will be building our models in this notebook.

TensorFlow needs some configuration before we build anything: GPU memory settings and the Keras image data format.

In [2]:
# GPU / Keras session configuration; run before any model or tensor is created.
# NOTE(review): `tf.config.gpu.*` looks like a pre-release TF 2.0 API — confirm it exists
# in the installed TensorFlow version before re-running this notebook.
tf.config.gpu.set_per_process_memory_growth(True)
# presumably limits this process to a quarter of the GPU memory — TODO confirm semantics
tf.config.gpu.set_per_process_memory_fraction(.25)
# Channel axis goes last; the model below feeds (max_length, glove_dim, 1) inputs to Conv2D.
tf.keras.backend.set_image_data_format('channels_last')

## Dataset

In [3]:
# Notebook-wide constants.
# glove_dim: size of each GloVe vector; also selects which GloVe pickle is loaded.
glove_dim = 50
# max_length: number of token slots per review (model input length).
max_length = 300
# chunk_count: number of TFRecord shard files per encoding.
chunk_count = 100
# Fixed seed so the shard split and the dataset shuffling are repeatable.
random_seed = 100 #np.random.randint(0, 1000000000)
print('using random_seed: {}'.format(random_seed))
# Split names used as dictionary keys throughout the notebook.
splits = ['train', 'val', 'test'] #60:20:20

using random_seed: 100


In [4]:
# Shard paths: one TFRecord file per chunk, for the raw-text and the token-index encodings.
tf_text_filenames = [os.path.join(DATASET_DIR, 'preprocessed', 'tfrecord', 'xext',  #xext temporarily
                                  'review-text-{:02d}.tf'.format(i)) for i in range(chunk_count)]
tf_ix_filenames = [os.path.join(DATASET_DIR, 'preprocessed', 'tfrecord', 'ix',
                                'review-ix-{:02d}.tf'.format(i)) for i in range(chunk_count)]
all_stars = (load_data({'review': ['stars']})['review']['stars'] - 1).astype(np.int32)# this is so that stars are 0-indexed
N = {}
# One row of labels per shard. The row width is derived from the data instead of a
# hard-coded review count; reshape still raises ValueError if the number of reviews
# is not divisible by chunk_count.
# assumes every shard stores the same number of reviews, in the order of all_stars — TODO confirm
stars_chunked = all_stars.reshape((chunk_count, -1))

# Keyword arguments: `classes` and `y` are keyword-only in current scikit-learn releases.
class_weights = compute_class_weight(class_weight='balanced', classes=np.arange(5), y=all_stars)
# NOTE(review): the middle ternary weight is the SUM of three five-class weights, which is
# not the 'balanced' weight of the merged class — confirm this is intended before using it.
class_weights = {'five': class_weights, 'ternary': np.array([class_weights[0], class_weights[1:4].sum() , class_weights[4]])}

tf_text, tf_ix, stars= {}, {}, {}

# 60% of the shards go to train; text paths, index paths and label rows are split together
# so that they stay aligned.
tf_text['train'], tf_text_filenames_val_test, tf_ix['train'], tf_ix_filenames_val_test, stars['train'], stars_val_test \
= train_test_split(tf_text_filenames, tf_ix_filenames, stars_chunked, random_state=random_seed, test_size = .4)

# The remaining 40% is halved into validation and test.
tf_text['val'], tf_text['test'], tf_ix['val'], tf_ix['test'], stars['val'], stars['test'] \
= train_test_split(tf_text_filenames_val_test, tf_ix_filenames_val_test, stars_val_test, random_state=random_seed, test_size = .5)

for split in splits:
    # Flatten (shards, reviews_per_shard) back to one label per review.
    stars[split] = stars[split].reshape(-1)
    N[split] = len(stars[split])

print('({}:{}:{})'.format(len(tf_text['train']), len(tf_text['val']), len(tf_text['test'])))

# Pretrained GloVe table (key -> vector) and the vocabulary (key -> row index).
glove_lookup = load_pickle(os.path.join(GLOVE_DIR, 'glove-{}D-byte-float32.pkl'.format(glove_dim)))
keys_to_ix = load_pickle(os.path.join(GLOVE_DIR, 'glove-byte-keys_to_ix.pkl'))
ix_to_key = {ix: key for key, ix in keys_to_ix.items()}

# Special entries: the unknown key gets the mean of all vectors, the null key gets zeros.
unk_vector = np.array(list(glove_lookup.values())).mean(axis=0)
null_vector = np.zeros(glove_dim)
glove_lookup[UNK_KEY.encode('ascii')] = unk_vector #if using byte glove dict
glove_lookup[NULL_KEY.encode('ascii')] = null_vector

# Lay the vectors out in index order; indices whose key has no vector fall back to
# the zero vector and are counted in `oops`.
oops = 0
glove_rows = []
for i in range(len(ix_to_key)):
    key = ix_to_key[i]
    if key in glove_lookup:
        glove_rows.append(glove_lookup[key])
    else:
        oops += 1
        glove_rows.append(null_vector)
glove_lookup_array = np.array(glove_rows, dtype=np.float32)
print('oops {} times.'.format(oops))

@tf.function
def get_review_length(review):
    """Count the non-zero entries of `review` and return the count as an int32 tensor.

    Note: a later cell of this notebook defines `get_review_length` again (returning
    float32); once that cell has run, it replaces this definition.
    """
    # Accumulate in int32. Summing an int8 mask overflows as soon as more than 127
    # entries are non-zero, and reviews are parsed as 300-element vectors.
    return tf.reduce_sum(tf.cast(tf.not_equal(review, 0), tf.int32))

@tf.function
def _parse_function(proto, to_ix):
    """Parse one serialized example into a dict holding its 'review' feature.

    Args:
        proto: a serialized example, as read from a TFRecord file.
        to_ix: Python bool. True parses 'review' as int64 values, False as strings.

    Returns:
        Dict with the single key 'review', a fixed-length vector of `max_length` entries.
    """
    # The stored feature length must equal the notebook-level `max_length` (300),
    # which is also the model's input length.
    review_dtype = tf.int64 if to_ix else tf.string
    keys_to_features = {'review': tf.io.FixedLenFeature([max_length], review_dtype)}

    # Load one example
    return tf.io.parse_single_example(proto, keys_to_features)

# Constant lookup table with one GloVe row per vocabulary index.
embedding = tf.constant(glove_lookup_array)

@tf.function
def embed(tensor):
    """Replace every index in `tensor` with its row of the `embedding` table."""
    return tf.gather(params=embedding, indices=tensor)

@tf.function
def add_channel(tensor):
    """Append a trailing axis of size 1 to `tensor`."""
    return tf.expand_dims(tensor, axis=-1)

# 11 GB in memory
# X = []
# x = tf.data.TFRecordDataset(tf_text['train'])
# for y in x :
#     X.append(y)

(60:20:20)
oops 1 times.


In [5]:
@tf.function
def get_review_length(review):
    """Count the non-zero entries of `review` and return the count as a float32 tensor.

    This redefinition replaces the `get_review_length` declared in an earlier cell.
    """
    nonzero_mask = tf.not_equal(review, 0)
    token_count = tf.reduce_sum(tf.cast(nonzero_mask, tf.int32))
    return tf.cast(token_count, tf.float32)

In [6]:
def make_dataset(batch_size, to_embed=False, to_ix=False, channelize=True, to_ternary_task=False, features=None): 
    '''
    Build one shuffled, repeated, batched and prefetched tf.data pipeline per split.

    Args:
        batch_size: examples per batch; the shuffle buffer is 100 batches large.
        to_embed: add 'review-embed', the GloVe vectors gathered for the review indices.
        to_ix: add 'review-ix', the review indices cast to int32.
        channelize: when embedding, append a trailing channel axis to 'review-embed'.
        to_ternary_task: map the star labels through [0, 1, 1, 1, 2].
        features: optional dict name -> function; each function is applied to the parsed
            review indices and its result is stored under that name.

    Returns:
        (dataset, num_batches): two dicts keyed by split name. Each dataset element is
        (inputs dict, {'stars': label}); num_batches[split] is N[split] // batch_size.

    Note: the shuffle is seeded with `random_seed` and the datasets repeat indefinitely.
    In Keras.fit with steps_per_epoch the epochs therefore do not see the same data,
    while rebuilding the dataset (re-running the cell) is expected to restart the same order.
    '''
    star_mapping = tf.constant([0, 1, 1, 1, 2], dtype=tf.int32) if to_ternary_task else None

    def _prepare_star(star):
        label = tf.gather(star_mapping, star) if to_ternary_task else star
        return {'stars': label}

    def _parse_transform(x):
        parsed = _parse_function(x, True)['review']
        out = {key: f(parsed) for key, f in features.items()} if features else {}
        if to_ix:
            out['review-ix'] = tf.cast(parsed, tf.int32)
        if to_embed:
            embedded = embed(parsed)
            out['review-embed'] = add_channel(embedded) if channelize else embedded
        return out

    shuffle_buffer_size = batch_size * 100
    prefetch_buffer_size = 4
    files = tf_ix
    dataset = {}
    num_batches = {}
    for split in splits:
        # Records and labels are zipped positionally, so the file order must match stars[split].
        records = tf.data.TFRecordDataset(files[split])
        labels = tf.data.Dataset.from_tensor_slices(stars[split])
        dataset[split] = (
            tf.data.Dataset.zip((records, labels))
            .shuffle(shuffle_buffer_size, seed=random_seed)
            .repeat()
            .map(lambda x, y: (_parse_transform(x), _prepare_star(y)), num_parallel_calls=12)
            .batch(batch_size)
            .prefetch(prefetch_buffer_size)
        )
        num_batches[split] = N[split] // batch_size
    print('num_batches: {}, batch_size: {}, shuffle_buffer_size: {}, prefetch_buffer_size: {}'.format(
        num_batches, batch_size, shuffle_buffer_size, prefetch_buffer_size))
    return dataset, num_batches

In [7]:
# %%time
# for i, (x, y) in enumerate(dataset['train']):
#     if i == 500:
#         break
# for i, (x, y) in enumerate(dataset['train']):
#     if i == 500:
#         break
# # Wall time: 30.1 --> 14.2 --> 4.3 s wow (64 batchsize, glovedim = 300)

In [8]:
# for x_init, y_init in dataset['train']:break

# Models

### BabyBlueberry
Convolutional model (`BabyBerry`) that combines a trainable embedding with pretrained GloVe embeddings.

In [9]:
# `rename` comes from the project's `packages` module; presumably it sets the metric's
# reported name to 'all_acc' — verify against its definition.
@rename('all_acc')
def all_class_accuracy(y_true, y_pred):
    """Sparse categorical accuracy of `y_pred` against the integer labels `y_true`."""
    per_example_hits = tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    return per_example_hits

def one_class_recall(label):
    """Build a Keras metric that reports the per-batch recall of class `label`.

    The returned function takes `y_true` (integer labels) and `y_pred` (class scores)
    and returns one value per example, all equal to the batch recall, so that Keras'
    per-batch averaging yields that recall. Batches with no example of `label` report 0.

    Note: Keras averages the value over batches; that is an approximation of, not the
    same as, the recall over the whole dataset.
    """
    # The metric name used to be misspelled ('..._recalll').
    @rename('{}_recall'.format(label))
    def single_class_recall(y_true, y_pred): # (64, 1), (64, 5) should return (64,)
        truth = K.flatten(K.cast(y_true, 'int32'))
        preds = K.cast(K.argmax(y_pred, axis=-1), 'int32')
        is_label = K.cast(K.equal(truth, label), 'float32')
        true_positives = K.cast(K.equal(preds, truth), 'float32') * is_label
        # max(..., 1) avoids dividing by zero when the class is absent from the batch.
        recall = K.sum(true_positives) / K.maximum(K.sum(is_label), 1.0)
        # Size the output from the batch itself rather than from a global `batch_size`,
        # so the metric also works for a different or partial batch.
        return K.ones_like(truth, dtype='float32') * recall
    return single_class_recall

def one_class_precision(label):
    """Build a Keras metric that reports the per-batch precision of class `label`.

    The returned function takes `y_true` (integer labels) and `y_pred` (class scores)
    and returns one value per example, all equal to the batch precision, so that Keras'
    per-batch averaging yields that precision. Batches in which `label` is never
    predicted report 0.

    Note: Keras averages the value over batches; that is an approximation of, not the
    same as, the precision over the whole dataset.
    """
    @rename('{}_precision'.format(label))
    def single_class_precision(y_true, y_pred): # (64, 1), (64, 5) should return (64,)
        truth = K.flatten(K.cast(y_true, 'int32'))
        preds = K.cast(K.argmax(y_pred, axis=-1), 'int32')
        is_predicted = K.cast(K.equal(preds, label), 'float32')
        true_positives = K.cast(K.equal(preds, truth), 'float32') * is_predicted
        # max(..., 1) avoids dividing by zero when the class is never predicted in the batch.
        precision = K.sum(true_positives) / K.maximum(K.sum(is_predicted), 1.0)
        # Size the output from the batch itself rather than from a global `batch_size`,
        # so the metric also works for a different or partial batch.
        return K.ones_like(truth, dtype='float32') * precision
    return single_class_precision

In [10]:
# Metric order matters: accuracy first, then recall for classes 0-4, then precision for
# classes 0-4. `get_f1_scores` reads the evaluation results by position.
star_metrics = (
    [all_class_accuracy]
    + [one_class_recall(label) for label in range(5)]
    + [one_class_precision(label) for label in range(5)]
)

In [11]:
def get_kernel_spec(filters, height, width):
    """Bundle one convolution branch's settings into a dict with keys 'filters', 'height', 'width'."""
    return dict(filters=filters, height=height, width=width)

In [12]:
class BabyBerry:
    """Namespace for the BabyBerry convolutional star-rating classifier.

    The class is never instantiated in this notebook; `build` and `loss_fn` are static
    helpers that are called on the class itself.
    """

    @staticmethod
    def build(dropout_rate, kernels, out_units, input_shape=(max_length, glove_dim, 1)):
        """Assemble the Keras model.

        Args:
            dropout_rate: rate of the dropout applied to the concatenated features.
            kernels: iterable of dicts with keys 'filters', 'height', 'width' (see
                `get_kernel_spec`); each one becomes a Conv2D + global-max-pool branch.
            out_units: number of output classes of the softmax layer.
            input_shape: shape of the pretrained-embedding input 'review-embed'.

        Returns:
            A `tf.keras.models.Model` named 'BabyBerry' with inputs 'review-embed',
            'review-ix' and 'review-length' and the single output 'stars'.
        """
        ix_input = Input(shape=(max_length,), name='review-ix')
        glove_input = Input(shape=input_shape, name='review-embed')
        length_input = Input(shape=(1,), name='review-length')
        # Named `trained_embedding` so it does not shadow the module-level `embedding` constant.
        trained_embedding = Embedding(input_dim=len(glove_lookup_array), output_dim=glove_dim,
                                      input_length=max_length, name='trained_embed')(ix_input)

        channel_embedding = Reshape(target_shape=(max_length, glove_dim, 1),
                                    name='reshape_embed')(trained_embedding)

        # Stack the trainable and the pretrained embeddings along the last axis.
        combined_embedding = Concatenate(name='concat_embed')([channel_embedding, glove_input])

        branches = []
        size_counts = {}  # (height, width) -> number of branches seen with that kernel size
        for kernel in kernels:
            size = (kernel['height'], kernel['width'])
            # Running counter per kernel size keeps layer names unique when a size repeats.
            size_counts[size] = i = size_counts.get(size, 0) + 1
            suffix = '{}_{}_{}'.format(size[0], size[1], i)
            branch = Conv2D(kernel['filters'], size, activation='relu',
                            name='conv_' + suffix, padding='valid')(combined_embedding)
            branch = GlobalMaxPool2D(name='pool_' + suffix)(branch)
            branches.append(branch)

        x = Concatenate(name='concat_conv')(branches + [length_input])
        x = Dropout(rate=dropout_rate, name='dropout_1')(x)
        x = Dense(100, activation='relu', name='dense_1')(x)
        out = Dense(out_units, activation='softmax', name='stars')(x)

        return tf.keras.models.Model(inputs={'review-embed': glove_input, 'review-ix': ix_input,
                                             'review-length': length_input},
                                     outputs={'stars': out}, name='BabyBerry')

    @staticmethod
    @tf.function
    def loss_fn(truth, logits):
        """Sparse categorical cross-entropy between integer labels and model outputs.

        Despite its name, `logits` receives the softmax output of the 'stars' layer; the
        loss is called with its default settings, i.e. it is given probabilities.
        """
        loss = tf.keras.losses.sparse_categorical_crossentropy(truth, logits)
        return loss

In [13]:
# Build the pipelines for all splits: each example provides the review indices, their
# GloVe embedding with a channel axis, and the review length as an extra feature.
batch_size = 256
dataset, num_batches = make_dataset(
    batch_size=batch_size,
    to_embed=True,
    to_ix=True,
    channelize=True,
    features={'review-length': get_review_length},
)

num_batches: {'train': 15670, 'val': 5223, 'test': 5223}, batch_size: 256, shuffle_buffer_size: 25600, prefetch_buffer_size: 4


In [14]:
# Five convolution branches of 100 filters each; every kernel spans the full embedding
# width and covers 2, 3, 4, 5 or 10 consecutive tokens.
kernels = [
    get_kernel_spec(filters=100, height=height, width=glove_dim)
    for height in [2, 3, 4, 5, 10]
]
baby = BabyBerry.build(kernels=kernels, dropout_rate=.1, out_units=5)
adam = tf.keras.optimizers.Adam()
baby.compile(
    optimizer=adam,
    loss={'stars': BabyBerry.loss_fn},
    metrics={'stars': star_metrics},
)

In [15]:
# TensorBoard run directory, stamped with the current local time (day-month_hour-minute-second).
log_dir = os.path.join(
    SRC_DIR, 'logs',
    '{}-CONV-BabyBerry'.format(datetime.now().strftime('%d-%m_%H-%M-%S')),
)
tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, write_graph=False, update_freq='batch')

In [17]:
# Keras documents `class_weight` as a dict mapping class index -> weight, so the array of
# balanced weights is converted instead of being passed as-is.
five_star_class_weight = {label: float(weight) for label, weight in enumerate(class_weights['five'])}
# The training dataset repeats forever, so an "epoch" here is 100 batches, followed by
# 20 validation batches. `shuffle` is left at False: shuffling is done by the dataset.
_=baby.fit(dataset['train'], steps_per_epoch=100, epochs=150,
           validation_data=dataset['val'], validation_steps=20,
           callbacks=[tb], class_weight=five_star_class_weight, verbose=1, shuffle=False)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [18]:
# Persist the trained model to MODELS_DIR as an HDF5 file.
baby.save(os.path.join(MODELS_DIR, 'baby_conv_length.h5'))

In [None]:
# Evaluate on 5000 batches of the (repeating) test dataset. With batch_size 256 the test
# split holds 5223 full batches, so this covers most but not all of it.
results = baby.evaluate(dataset['test'], steps=5000)



In [None]:
def get_f1_scores(results, num_classes=5):
    """Compute per-class and macro F1 from the list returned by `model.evaluate`.

    `results` is read by position: index 0 and 1 are skipped (loss and overall accuracy
    when the model is compiled with `star_metrics`), followed by `num_classes` recall
    values and then `num_classes` precision values.

    Args:
        results: sequence of at least 2 + 2 * num_classes numbers.
        num_classes: number of classes; defaults to the five star ratings.

    Returns:
        (f, m): `m` maps the 1-based class number to (recall, precision); `f` maps the
        same keys to the F1 score and additionally holds the unweighted mean under 'macro'.

    Raises:
        ValueError: if `num_classes` is smaller than 1.
    """
    if num_classes < 1:
        raise ValueError('num_classes must be at least 1')
    first_recall = 2
    first_precision = first_recall + num_classes
    m = {}
    for offset in range(num_classes):
        m[offset + 1] = (results[first_recall + offset], results[first_precision + offset])
    f = {}
    total = 0
    for star, (recall, precision) in m.items():
        denominator = recall + precision
        # F1 is defined as 0 when both precision and recall are 0 (instead of 0/0 = NaN).
        f[star] = 2 * recall * precision / denominator if denominator > 0 else 0.0
        total += f[star]
    f['macro'] = total / num_classes
    return f, m

In [52]:
# Fetch the trainable embedding matrix by its layer name ('trained_embed', set when the
# model is built) instead of relying on the layer's position in `baby.layers`.
baby_embedding = baby.get_layer('trained_embed').get_weights()[0]
# Map every vocabulary key to its learned embedding row.
baby_dict = {key: baby_embedding[val] for key, val in keys_to_ix.items()}

In [90]:
# Query the learned embedding around the word 'food'.
# `neighbors` comes from the project's `packages` module; presumably it returns the
# distances to every key and an ordering of them — verify against its definition.
dists, ix = neighbors(baby_dict['food'.encode()], baby_dict)

In [91]:
# Reorder the columns by `ix`, transpose, and display the first 10 rows
# (presumably the 10 closest keys — confirm the ordering `neighbors` returns).
dists[dists.columns[ix]].T.head(10)

Unnamed: 0,0
b'cold',0.152718
b'coding',0.154795
b'irons',0.195102
b'one-inch',0.205606
b'velveeta',0.209315
b'frays',0.21371
b'swallowed',0.213735
b'connective',0.219325
b'polymer',0.22103
b'pepsico',0.224535


In [92]:
# Same query against the pretrained GloVe table, using the midpoint of the 'cold' and
# 'food' vectors as the query point.
dists_glove, ix_glove = neighbors((glove_lookup['cold'.encode()] +glove_lookup['food'.encode()])/2, glove_lookup)

In [93]:
# Reorder the columns by `ix_glove`, transpose, and display the first 10 rows
# (presumably the 10 closest keys — confirm the ordering `neighbors` returns).
dists_glove[dists_glove.columns[ix_glove]].T.head(10)

Unnamed: 0,0
b'food',0.094483
b'cold',0.121795
b'hot',0.215604
b'dry',0.217672
b'water',0.220291
b'fresh',0.224757
b'eating',0.227453
b'cooking',0.229176
b'especially',0.241382
b'keeping',0.254835
