In [None]:
from packages import *
import tensorflow as tf
from datetime import datetime, timedelta
import time
import pandas as pd
import tensorflow.keras.backend as K
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import *
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import shuffle
%load_ext autoreload
%autoreload 2

# Model
We will be building our models in this notebook.

Some configuration is required for TensorFlow before any model is built.

In [2]:
# GPU memory settings for this process, followed by the Keras image data format.
# NOTE(review): `tf.config.gpu.*` looks like a pre-release TF 2.0 API — confirm it
# exists in the installed TensorFlow version.
tf.config.gpu.set_per_process_memory_growth(True)
tf.config.gpu.set_per_process_memory_fraction(.3)
tf.keras.backend.set_image_data_format('channels_last')

## Dataset

In [148]:
# Notebook-wide constants used by the dataset and model cells.
glove_dim = 50      # embedding width; also selects which GloVe pickle is loaded
max_length = 300    # sequence length used for the model inputs
chunk_count = 100   # number of TFRecord shard files
# The seed is drawn anew on every run and only printed, so the train/val/test
# split differs between runs unless the printed value is pinned by hand.
random_seed = np.random.randint(0, 1000000000)
print('using random_seed: {}'.format(random_seed))
splits = ['train', 'val', 'test'] #60:20:20

using random_seed: 937717800


In [149]:
# Paths of the TFRecord shards (text and index encodings), one file per chunk.
tf_text_filenames = [os.path.join(DATASET_DIR, 'preprocessed', 'tfrecord', 'xext',  #xext temporarily
                                  'review-text-{:02d}.tf'.format(i)) for i in range(chunk_count)]
tf_ix_filenames = [os.path.join(DATASET_DIR, 'preprocessed', 'tfrecord', 'ix', 
                                  'review-ix-{:02d}.tf'.format(i)) for i in range(chunk_count)]
# Subtract 1 so that stars are 0-indexed.
all_stars = (load_data({'review': ['stars']})['review']['stars'] - 1).astype(np.int32)
N = {}
# One row of labels per shard. -1 lets NumPy infer the per-shard size instead of
# hard-coding the dataset length; the reshape still fails loudly if the number of
# reviews is not a multiple of chunk_count.
stars_chunked = all_stars.reshape((chunk_count, -1))

# Keyword arguments: recent scikit-learn releases reject positional classes/y.
class_weights = compute_class_weight('balanced', classes=np.arange(5), y=all_stars)
# NOTE(review): the ternary weight of the merged middle class is the SUM of the three
# per-class balanced weights — confirm this is intended rather than a balanced weight
# recomputed on the merged labels.
class_weights = {'five': class_weights, 'ternary': np.array([class_weights[0], class_weights[1:4].sum() , class_weights[4]])}

tf_text, tf_ix, stars= {}, {}, {}

# Split at shard granularity so file lists and label rows stay aligned: 60% train first...
tf_text['train'], tf_text_filenames_val_test, tf_ix['train'], tf_ix_filenames_val_test, stars['train'], stars_val_test \
= train_test_split(tf_text_filenames, tf_ix_filenames, stars_chunked, random_state=random_seed, test_size = .4)

# ...then the remaining 40% is halved into val and test.
tf_text['val'], tf_text['test'], tf_ix['val'], tf_ix['test'], stars['val'], stars['test'] \
= train_test_split(tf_text_filenames_val_test, tf_ix_filenames_val_test, stars_val_test, random_state=random_seed, test_size = .5)

# Flatten the per-shard label rows back to one label per review, in shard order.
for split in splits:
    stars[split] = stars[split].reshape(-1)
    N[split] = len(stars[split])

print('({}:{}:{})'.format(len(tf_text['train']), len(tf_text['val']), len(tf_text['test'])))

glove_lookup = load_pickle(os.path.join(GLOVE_DIR, 'glove-{}D-byte-float32.pkl'.format(glove_dim)))
keys_to_ix = load_pickle(os.path.join(GLOVE_DIR, 'glove-byte-keys_to_ix.pkl'))
ix_to_key = {value: key for key, value in keys_to_ix.items()}

# Unknown tokens get the mean GloVe vector; the null (padding) token gets zeros.
unk_vector = np.mean(np.array(list(glove_lookup.values())), axis=0)
null_vector = np.zeros(glove_dim)
glove_lookup[UNK_KEY.encode('ascii')] = unk_vector #if using byte glove dict
glove_lookup[NULL_KEY.encode('ascii')] = null_vector

# Dense matrix whose row i is the vector of ix_to_key[i]; keys missing from the
# lookup fall back to the null vector and are counted in `oops`.
oops = 0
glove_lookup_array = []
for i in range(len(ix_to_key)):
    key = ix_to_key[i]
    if key not in glove_lookup:
        oops += 1
    glove_lookup_array.append(glove_lookup.get(key, null_vector))
glove_lookup_array = np.array(glove_lookup_array, dtype=np.float32)
print('oops {} times.'.format(oops))

@tf.function
def _parse_function(proto, to_ix):
    """Parse one serialized example into a dict holding its 'review' feature.

    The 'review' feature is declared as a fixed-length vector of 300 entries:
    int64 when `to_ix` is truthy, string otherwise.
    """
    review_dtype = tf.int64 if to_ix else tf.string
    feature_spec = {'review': tf.io.FixedLenFeature([300,], review_dtype)}
    return tf.io.parse_single_example(proto, feature_spec)

# Constant tensor holding the GloVe lookup matrix; `embed` indexes into it.
embedding = tf.constant(glove_lookup_array)
@tf.function
def embed(tensor):
    """Return the rows of `embedding` selected by the indices in `tensor`."""
    return tf.gather(params=embedding, indices=tensor)

@tf.function
def add_channel(tensor):
    """Append a trailing axis of size 1 to `tensor`."""
    return tf.expand_dims(tensor, axis=-1)

# 11 GB in memory
# X = []
# x = tf.data.TFRecordDataset(tf_text['train'])
# for y in x :
#     X.append(y)

(60:20:20)
oops 1 times.


## Featurizers

In [3]:
# @tf.function
# def 

In [150]:
@tf.function
def get_review_length(review):
    """Count the entries of `review` that differ from 0, as an int32 tensor."""
    is_nonzero = tf.not_equal(review, 0)
    nonzero_count = tf.reduce_sum(tf.cast(is_nonzero, tf.int32))
    return tf.cast(nonzero_count, tf.int32)

In [151]:
def make_dataset(batch_size, to_embed=False, to_ix=False, channelize=True, to_ternary_task=False, features=None): 
    '''
    Build one shuffled, repeated, batched and prefetched tf.data pipeline per split.

    Each element is a pair (inputs, {'stars': label}). `inputs` is a dict that may hold:
      - one entry per item of `features` (name -> callable applied to the parsed review),
      - 'review-ix' (int32 indices) when `to_ix` is set,
      - 'review-embed' (embedded review, with a trailing channel axis if `channelize`)
        when `to_embed` is set.
    With `to_ternary_task`, labels 0..4 are remapped through [0, 1, 1, 1, 2].

    Returns (dataset, num_batches), both dicts keyed by split name.

    Note: once created dataset remains the same from iteration to iteration. 
    In Keras.fit, once given steps_per_epoch the data for epochs are not the same.
    Validation data is the same.
    If cell is rerun, then the same order of data will be fed.
    '''
    if to_ternary_task:
        star_mapping = tf.constant([0, 1, 1, 1, 2], dtype=tf.int32)

    def _prepare_star(star):
        if not to_ternary_task:
            return {'stars': star}
        return {'stars': tf.gather(star_mapping, star)}

    def _parse_transform(x):
        parsed = _parse_function(x, True)['review']
        out = {}
        if features:
            out = {key: f(parsed) for key, f in features.items()}
        if to_ix:
            out['review-ix'] = tf.cast(parsed, tf.int32)
        if to_embed:
            embedded = embed(parsed)
            if channelize:
                embedded = add_channel(embedded)
            out['review-embed'] = embedded
        return out

    shuffle_buffer_size = batch_size * 100
    prefetch_buffer_size = 4
    pipelines = {}
    batches_per_split = {}
    for split in splits:
        # Records and labels are zipped positionally, so both must be in shard order.
        records = tf.data.TFRecordDataset(tf_ix[split])
        labels = tf.data.Dataset.from_tensor_slices(stars[split])
        pipelines[split] = (
            tf.data.Dataset.zip((records, labels))
            .shuffle(shuffle_buffer_size)
            .repeat()
            .map(lambda x, y: (_parse_transform(x), _prepare_star(y)), num_parallel_calls=12)
            .batch(batch_size)
            .prefetch(prefetch_buffer_size)
        )
        batches_per_split[split] = N[split] // batch_size
    print('num_batches: {}, batch_size: {}, shuffle_buffer_size: {}, prefetch_buffer_size: {}'.format(
        batches_per_split, batch_size, shuffle_buffer_size, prefetch_buffer_size))
    return pipelines, batches_per_split

In [152]:
# Build the pipelines with index inputs plus a per-review length feature.
batch_size = 64
dataset, num_batches = make_dataset(batch_size=batch_size, to_ix=True, features={'review-length': get_review_length})

num_batches: {'train': 62680, 'val': 20893, 'test': 20893}, batch_size: 64, shuffle_buffer_size: 6400, prefetch_buffer_size: 4


In [153]:
%%time
# Throughput check: iterate the training pipeline until batch index 500, twice in a row.
for i, (x, y) in enumerate(dataset['train']):
    if i == 500:
        break
for i, (x, y) in enumerate(dataset['train']):
    if i == 500:
        break
# Wall time: 30.1 --> 14.2 --> 4.3 s wow (64 batchsize, glovedim = 300)

CPU times: user 9.11 s, sys: 2.12 s, total: 11.2 s
Wall time: 2.08 s


In [141]:
# Grab the first training batch into x_init / y_init for interactive inspection.
for x_init, y_init in dataset['train']:break

# Models

### BabyBlueberry
Convolutional model with GloVe embeddings.

In [7]:
@rename('all_acc')
def all_class_accuracy(y_true, y_pred):
    """Sparse categorical accuracy over all classes (delegates to tf.keras.metrics)."""
    accuracy = tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    return accuracy

def one_class_recall(label):
    """Build a Keras metric reporting the per-batch recall of class `label`.

    The returned metric takes (y_true, y_pred) and yields one value per example,
    each equal to the batch recall: correct predictions among examples whose true
    class is `label`, divided by the number of such examples (at least 1, so a
    batch without the class reports 0).
    """
    @rename('{}_recall'.format(label))  # was misspelled '{}_recalll'
    def single_class_recall(y_true, y_pred): # (64, 1), (64, 5) should return (64,)
        truth = K.flatten(K.cast(y_true, 'int32'))
        preds = K.cast(K.argmax(y_pred, axis=-1), 'int32')
        recall_mask = K.cast(K.equal(truth, label), 'int32')
        hits = K.cast(K.equal(preds, truth), 'int32') * recall_mask
        recall = K.cast(K.sum(hits) / K.maximum(K.sum(recall_mask), 1), 'float32')
        # Size the output from the actual batch rather than the notebook-global
        # `batch_size`, which is reassigned in later cells.
        return K.ones_like(truth, dtype='float32') * recall
    return single_class_recall

def one_class_precision(label):
    """Build a Keras metric reporting the per-batch precision of class `label`.

    The returned metric takes (y_true, y_pred) and yields one value per example,
    each equal to the batch precision: correct predictions among examples predicted
    as `label`, divided by the number of such predictions (at least 1, so a batch
    with no such prediction reports 0).
    """
    @rename('{}_precision'.format(label))
    def single_class_precision(y_true, y_pred): # (64, 1), (64, 5) should return (64,)
        truth = K.flatten(K.cast(y_true, 'int32'))
        preds = K.cast(K.argmax(y_pred, axis=-1), 'int32')
        precision_mask = K.cast(K.equal(preds, label), 'int32')
        hits = K.cast(K.equal(preds, truth), 'int32') * precision_mask
        precision = K.cast(K.sum(hits) / K.maximum(K.sum(precision_mask), 1), 'float32')
        # Size the output from the actual batch rather than the notebook-global
        # `batch_size`, which is reassigned in later cells.
        return K.ones_like(truth, dtype='float32') * precision
    return single_class_precision

In [8]:
# Overall accuracy, then recall for classes 0..4, then precision for classes 0..4.
star_metrics = (
    [all_class_accuracy]
    + [one_class_recall(i) for i in range(5)]
    + [one_class_precision(i) for i in range(5)]
)

In [9]:
def get_kernel_spec(filters, height, width):
    """Bundle a convolution kernel description into a dict with keys 'filters', 'height', 'width'."""
    return dict(filters=filters, height=height, width=width)

In [10]:
class BabyBerry:
    """LSTM classifier over the average of a trainable embedding and supplied GloVe vectors.

    Used as a namespace only: `build` and `loss_fn` are static and never need an instance.
    """

    @staticmethod
    def build(dropout_rate, out_units):
        """Assemble the Keras model.

        Args:
            dropout_rate: rate of the Dropout layer placed after the LSTM.
            out_units: number of units of the softmax 'stars' output.

        Returns:
            A tf.keras Model with inputs 'review-embed' and 'review-ix' and output 'stars'.
        """
        ix_input = Input(shape=(max_length,), name='review-ix')
        glove_input = Input(shape=(max_length, glove_dim), name='review-embed')

        # Embedding learned from scratch (no initial weights are passed) ...
        embedding = Embedding(input_dim=len(glove_lookup_array), output_dim=glove_dim,
                              input_length=max_length, name='trained_embed')(ix_input)
        # ... averaged element-wise with the embedded vectors fed in as an input.
        average_embedding = Average(name='average_embed')([embedding, glove_input])

        x = LSTM(units=500, go_backwards=True, name='lstm')(average_embedding)
        x = Dropout(rate=dropout_rate, name='dropout_1')(x)
        x = Dense(100, activation='relu', name='dense_1')(x)
        out = Dense(out_units, activation='softmax', name='stars')(x)

        return tf.keras.models.Model(inputs={'review-embed': glove_input, 'review-ix': ix_input},
                                     outputs={'stars': out}, name='BabyBerry')

    @staticmethod
    @tf.function
    def loss_fn(truth, logits):
        """Sparse categorical cross-entropy between integer labels and predictions.

        Despite its name, `logits` receives the softmax output of the 'stars' layer
        when this is used as the model loss; `from_logits` is left at its default.
        """
        return tf.keras.losses.sparse_categorical_crossentropy(truth, logits)

In [None]:
# Rebuild the pipelines with batches of 128 carrying both index and embedded inputs
# (no channel axis), on the five-class task.
batch_size = 128
dataset, num_batches = make_dataset(batch_size=batch_size, to_embed=True, to_ix=True, channelize=False, to_ternary_task=False)

In [None]:
# Five-class BabyBerry, compiled with Adam at default settings and the star metrics.
baby = BabyBerry.build(dropout_rate=.1, out_units=5)
adam = tf.keras.optimizers.Adam()
baby.compile(optimizer=adam, metrics={'stars': star_metrics} ,loss={'stars': BabyBerry.loss_fn})

In [None]:
# Write a left-to-right diagram of the model, including tensor shapes, to a PNG file.
plot_model(baby, show_shapes=True, rankdir='LR', to_file='images/lstm_baby_berry_model.png')

In [None]:
# One TensorBoard log directory per run, named by the current day-month_hour-minute-second.
log_dir = os.path.join(SRC_DIR, 'logs', '{}-LSTM-BabyBerry'.format(datetime.fromtimestamp(time.time()).strftime('%d-%m_%H-%M-%S')))
tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, write_graph=False, update_freq='batch')

In [None]:
# 50 epochs of 200 training steps each, validating on 20 batches per epoch.
# NOTE(review): Keras documents `class_weight` as a dict {class_index: weight};
# class_weights['five'] is an ndarray here — confirm the installed version accepts it.
_=baby.fit(dataset['train'], steps_per_epoch=200, epochs=50, 
                             validation_data= dataset['val'], validation_steps=20,
                             callbacks=[tb], class_weight=class_weights['five'], verbose=1)

In [None]:
# Score the model on 100 validation batches and print a per-class report.
truth, predictions = [], []
for x, y in dataset['val'].take(100):
    pred = baby(x)
    truth.append(y['stars'].numpy())
    # Predicted class = index of the largest output along the last axis.
    predictions.append(tf.argmax(pred['stars'], axis=-1).numpy())
# Collapse the per-batch arrays into flat vectors.
predictions=np.array(predictions).flatten()
truth = np.array(truth).flatten()
print(classification_report(truth, predictions))

In [None]:
# Map every vocabulary key to its row of the first weight array of layer 1.
# NOTE(review): assumes baby.layers[1] is the 'trained_embed' layer — confirm the layer order.
baby_embedding = baby.layers[1].get_weights()[0]
baby_dict = {key: baby_embedding[val] for key, val in keys_to_ix.items()}

In [None]:
# Inspect what the learned embedding places near the vector for 'money'.
# NOTE(review): `neighbors` is not defined in this notebook; presumably it returns a
# distance table and an ordering of its columns — verify against its definition.
dists, ix = neighbors(baby_dict['money'.encode()], baby_dict)
dists[dists.columns[ix]]

In [None]:
class TeenBerry:
    """Parallel Conv2D branches plus an LSTM over a trainable, pre-initialised embedding.

    Used as a namespace only: `build` and `loss_fn` are static and never need an instance.
    """

    @staticmethod
    def build(dropout_rate, lstm_units, kernels, out_units, embedding_array):
        """Assemble the Keras model.

        Args:
            dropout_rate: rate of the Dropout applied to the concatenated conv features.
            lstm_units: number of units of the LSTM branch.
            kernels: iterable of dicts with keys 'filters', 'height', 'width'; one
                Conv2D + GlobalMaxPool2D branch is created per entry.
            out_units: number of units of the softmax 'stars' output.
            embedding_array: initial weights of the embedding layer.

        Returns:
            A tf.keras Model with input 'review-ix' and output 'stars'.
        """
        ix_input = Input(shape=(max_length,), name='review-ix')

        embedding = Embedding(input_dim=len(glove_lookup_array), output_dim=glove_dim,
                              input_length=max_length, name='trained_embed', weights=[embedding_array])(ix_input)
        # Conv2D needs a channel axis: (max_length, glove_dim) -> (max_length, glove_dim, 1).
        channel_embedding = Reshape(target_shape=(max_length, glove_dim, 1), name='reshape_embed')(embedding)

        branches = []
        names = {}  # (height, width) -> branches created so far, to keep layer names unique
        for kernel in kernels:
            shape_key = (kernel['height'], kernel['width'])
            i = names[shape_key] = names.get(shape_key, 0) + 1
            branch = Conv2D(kernel['filters'], shape_key, activation='relu',
                            name='conv_{}_{}_{}'.format(kernel['height'], kernel['width'], i),
                            padding='valid')(channel_embedding)
            branch = GlobalMaxPool2D(name='pool_{}_{}_{}'.format(kernel['height'], kernel['width'], i))(branch)
            branches.append(branch)

        lstm_out = LSTM(units=lstm_units, name='lstm')(embedding)
        x = Concatenate(name='concat_conv')(branches)

        # Dropout covers the conv features only; the LSTM output is concatenated afterwards.
        x = Dropout(rate=dropout_rate, name='dropout')(x)
        x = Concatenate(name='concat_conv_lstm')([x, lstm_out])
        x = Dense(500, activation='relu', name='dense')(x)
        out = Dense(out_units, activation='softmax', name='stars')(x)

        return tf.keras.models.Model(inputs={'review-ix': ix_input},
                                     outputs={'stars': out}, name='TeenBerry')

    @staticmethod
    @tf.function
    def loss_fn(truth, logits):
        """Sparse categorical cross-entropy between integer labels and predictions.

        Despite its name, `logits` receives the softmax output of the 'stars' layer
        when this is used as the model loss; `from_logits` is left at its default.
        """
        return tf.keras.losses.sparse_categorical_crossentropy(truth, logits)

In [None]:
# Three-output TeenBerry, compiled with Adam at default settings and the star metrics.
# NOTE(review): `kernels` and `learned_embedding` are not assigned in any earlier visible
# cell — this cell appears to rely on leftover kernel state.
# NOTE(review): out_units=3 suggests the ternary task, but the most recent visible
# make_dataset call uses to_ternary_task=False — confirm the dataset was rebuilt.
teen = TeenBerry.build(kernels=kernels, dropout_rate=.3, out_units=3, lstm_units=600, embedding_array=learned_embedding)
adam = tf.keras.optimizers.Adam()
teen.compile(optimizer=adam, metrics={'stars': star_metrics} ,loss={'stars': TeenBerry.loss_fn})

In [None]:
# Write a left-to-right diagram of the model, including tensor shapes, to a PNG file.
plot_model(teen, show_shapes=True, rankdir='LR', to_file='teen_berry_model.png')

In [None]:
# One TensorBoard log directory per run, named by the current day-month_hour-minute-second.
log_dir = os.path.join(SRC_DIR, 'logs', 'TeenBerry-{}'.format(datetime.fromtimestamp(time.time()).strftime('%d-%m_%H-%M-%S')))
tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, write_graph=False, update_freq='batch')

In [None]:
# 50 epochs of 200 training steps each, validating on 20 batches per epoch,
# weighted with the three-entry 'ternary' class weights.
_=teen.fit(dataset['train'], steps_per_epoch=200, epochs=50, 
                             validation_data= dataset['val'], validation_steps=20,
                             callbacks=[tb], class_weight=class_weights['ternary'], verbose=1)

In [None]:
# Score the model on 100 validation batches and print a per-class report.
truth, predictions = [], []
for x, y in dataset['val'].take(100):
    # Only the index input is passed to the model.
    pred = teen(x['review-ix'])
    truth.append(y['stars'].numpy())
    predictions.append(tf.argmax(pred['stars'], axis=-1).numpy())
# Collapse the per-batch arrays into flat vectors.
predictions=np.array(predictions).flatten()
truth = np.array(truth).flatten()
print(classification_report(truth, predictions))

In [None]:
# Map every vocabulary key to its row of the first weight array of layer 1.
# NOTE(review): assumes teen.layers[1] is the 'trained_embed' layer — confirm the layer order.
teen_embedding = teen.layers[1].get_weights()[0]
teen_dict = {key: teen_embedding[val] for key, val in keys_to_ix.items()}

In [None]:
# Inspect what the learned embedding places near the vector for 'pizza'.
# NOTE(review): `neighbors` is not defined in this notebook — verify its return values.
teen_dists, teen_ix = neighbors(teen_dict['pizza'.encode()], teen_dict)
teen_dists[teen_dists.columns[teen_ix]]

In [None]:
class MatureBerry:
    """Parallel Conv2D branches over a frozen, pre-initialised embedding.

    Used as a namespace only: `build` and `loss_fn` are static and never need an instance.
    """

    @staticmethod
    def build(dropout_rate, lstm_units, kernels, out_units, embedding_array):
        """Assemble the Keras model.

        Args:
            dropout_rate: rate of the Dropout applied to the concatenated conv features.
            lstm_units: unused; this model has no LSTM branch. Kept so the signature
                stays parallel to TeenBerry.build.
            kernels: iterable of dicts with keys 'filters', 'height', 'width'; one
                Conv2D + GlobalMaxPool2D branch is created per entry.
            out_units: number of units of the softmax 'stars' output.
            embedding_array: weights of the embedding layer (created with trainable=False).

        Returns:
            A tf.keras Model with input 'review-ix' and output 'stars'.
        """
        ix_input = Input(shape=(max_length,), name='review-ix')

        embedding = Embedding(input_dim=len(glove_lookup_array), output_dim=glove_dim,
                              input_length=max_length, name='trained_embed', weights=[embedding_array], trainable=False)(ix_input)

        # Conv2D needs a channel axis: (max_length, glove_dim) -> (max_length, glove_dim, 1).
        channel_embedding = Reshape(target_shape=(max_length, glove_dim, 1), name='reshape_embed')(embedding)

        branches = []
        names = {}  # (height, width) -> branches created so far, to keep layer names unique
        for kernel in kernels:
            shape_key = (kernel['height'], kernel['width'])
            i = names[shape_key] = names.get(shape_key, 0) + 1
            branch = Conv2D(kernel['filters'], shape_key, activation='relu',
                            name='conv_{}_{}_{}'.format(kernel['height'], kernel['width'], i),
                            padding='valid')(channel_embedding)
            branch = GlobalMaxPool2D(name='pool_{}_{}_{}'.format(kernel['height'], kernel['width'], i))(branch)
            branches.append(branch)

        x = Concatenate(name='concat_conv')(branches)

        x = Dropout(rate=dropout_rate, name='dropout')(x)
        x = Dense(500, activation='relu', name='dense')(x)
        out = Dense(out_units, activation='softmax', name='stars')(x)

        return tf.keras.models.Model(inputs={'review-ix': ix_input},
                                     outputs={'stars': out}, name='MatureBerry')

    @staticmethod
    @tf.function
    def loss_fn(truth, logits):
        """Sparse categorical cross-entropy between integer labels and predictions.

        Despite its name, `logits` receives the softmax output of the 'stars' layer
        when this is used as the model loss; `from_logits` is left at its default.
        """
        return tf.keras.losses.sparse_categorical_crossentropy(truth, logits)

In [None]:
# Five-class MatureBerry whose frozen embedding is initialised from TeenBerry's weights.
# NOTE(review): `kernels` is not assigned in any earlier visible cell — this cell appears
# to rely on leftover kernel state.
mature = MatureBerry.build(kernels=kernels, dropout_rate=.2, out_units=5, lstm_units=600, embedding_array=teen_embedding)
adam = tf.keras.optimizers.Adam()
mature.compile(optimizer=adam, metrics={'stars': star_metrics} ,loss={'stars': MatureBerry.loss_fn})

In [None]:
# Write a left-to-right diagram of the model, including tensor shapes, to a PNG file.
plot_model(mature, show_shapes=True, rankdir='LR', to_file='mature_berry_model.png')

In [None]:
# One TensorBoard log directory per run, named by the current day-month_hour-minute-second.
log_dir = os.path.join(SRC_DIR, 'logs', 'MatureBerry-{}'.format(datetime.fromtimestamp(time.time()).strftime('%d-%m_%H-%M-%S')))
tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, write_graph=False, update_freq='batch')

In [None]:
# 50 epochs of 200 training steps each, validating on 20 batches per epoch,
# weighted with the five-entry class weights.
_=mature.fit(dataset['train'], steps_per_epoch=200, epochs=50, 
                             validation_data= dataset['val'], validation_steps=20,
                             callbacks=[tb], class_weight=class_weights['five'], verbose=1)

In [None]:
# Score the model on 100 validation batches and print a per-class report.
truth, predictions = [], []
for x, y in dataset['val'].take(100):
    # Only the index input is passed to the model.
    pred = mature(x['review-ix'])
    truth.append(y['stars'].numpy())
    predictions.append(tf.argmax(pred['stars'], axis=-1).numpy())
# Collapse the per-batch arrays into flat vectors.
predictions=np.array(predictions).flatten()
truth = np.array(truth).flatten()
print(classification_report(truth, predictions))

In [None]:
# Map every vocabulary key to its row of the first weight array of layer 1.
# NOTE(review): assumes mature.layers[1] is the 'trained_embed' layer — confirm the layer order.
mature_embedding = mature.layers[1].get_weights()[0]
mature_dict = {key: mature_embedding[val] for key, val in keys_to_ix.items()}

In [None]:
# Inspect what the embedding places near the vector for 'loved'.
# NOTE(review): `neighbors` is not defined in this notebook — verify its return values.
mature_dists, mature_ix = neighbors(mature_dict['loved'.encode()], mature_dict)
mature_dists[mature_dists.columns[mature_ix]]

## Automated Testing

In [None]:
# Create the result stores only if they do not exist yet, so re-running this cell
# keeps results already collected in the current kernel session.
try:
    histories
except NameError:  # only "not defined yet" should trigger a reset
    histories, reports = { }, { }

In [None]:
# Hyper-parameter grid for the convolutional search.
# Kernel heights: 2..6 in steps of 1, then 10..45 in steps of 5.
possible_heights = list(np.arange(5, step=1)+2) + list(np.arange(10, 50, step=5)) #+ list(np.arange(50, 101, step=10))
# Number of parallel conv branches: 4, 6, 8, 10.
possible_num_branches = list(np.arange(4, 11, step=2))
# Filters per branch: 200, 300, 400, 500.
possible_filters = list(np.arange(200, 501, step=100))
# Dropout rates from .1 to roughly .5; the values are floats produced by np.arange
# and are later used inside dict keys, so they are not exactly round numbers.
possible_dropouts = list(np.arange(.1, .51, step=.1))

In [None]:
# Echo the grid so the saved notebook records what was searched.
print('possible_heights: {}'.format(possible_heights))
print('possible_num_branches: {}'.format(possible_num_branches))
print('possible_filters: {}'.format(possible_filters))
print('possible_dropouts: {}'.format(possible_dropouts))

In [None]:
def save_report_histories():
    """Persist the notebook-level `histories` and `reports` dicts to their pickle files.

    `histories` goes to 'histories.pkl' and `reports` to 'reports.pkl'. Previously
    `reports` was written to both files, so the histories were never saved.
    """
    save_pickle('histories.pkl', histories)
    save_pickle('reports.pkl', reports)


In [None]:
def load_report_histories():
    """Load and return (histories, reports) from 'histories.pkl' and 'reports.pkl'."""
    saved_histories = load_pickle('histories.pkl')
    saved_reports = load_pickle('reports.pkl')
    return saved_histories, saved_reports

In [None]:
# Randomised grid search: for every branch count, every subset of kernel heights of
# that size, every filter count and every dropout rate, train a model briefly, score it
# on validation batches, and persist the results after each configuration.
possible_heights = set(possible_heights) #+ list(np.arange(50, 101, step=10)))
possible_num_branches = shuffle(possible_num_branches)
possible_filters = shuffle(possible_filters)
possible_dropouts = shuffle(possible_dropouts)
# Replaces any in-memory histories/reports with the pickled ones (the files must exist).
histories, reports = load_report_histories()
i = 0
for num_branches in possible_num_branches:
    subsets = find_subsets_of_n(possible_heights, num_branches)
    subsets = shuffle(subsets)
    for heights in subsets:
        for filters in possible_filters:
            for dropout_rate in possible_dropouts:
                
                # Configurations that already have a report are skipped, which makes the search resumable.
                params = (tuple(heights), filters, dropout_rate)
                if params in reports:
                    continue
                print(params)
                kernels = [get_kernel_spec(filters=filters, height=h, width=glove_dim) for h in heights]
                # NOTE(review): `BabyBlueberry` is not defined in the visible notebook (only
                # BabyBerry, whose build() takes no `kernels`) — confirm where it comes from.
                baby = BabyBlueberry.build(kernels=kernels, dropout_rate=dropout_rate)
                adam = tf.keras.optimizers.Adam()
                baby.compile(optimizer=adam, metrics={'stars': star_metrics} ,loss={'stars': BabyBlueberry.loss_fn})
                log_dir = os.path.join(SRC_DIR, 'logs', 'CONV-{}'.format(datetime.fromtimestamp(time.time()).strftime('%H-%M-%S_%m-%d')))
                tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, write_graph=False, update_freq='batch')
                # NOTE(review): `class_weights` is the whole {'five': ..., 'ternary': ...} dict here,
                # whereas the other fit calls pass class_weights['five'] / ['ternary'] — confirm.
                # NOTE(review): the object returned by fit() is stored and later pickled — confirm
                # it pickles cleanly.
                histories[params] = baby.fit(dataset['train'], steps_per_epoch=100, epochs=5, 
                                             validation_data= dataset['val'], validation_steps=10,
                                             callbacks=[tb], class_weight=class_weights, verbose=1)
                predictions, truth = [], []
                for x, y in dataset['val'].take(100):
                    pred = baby.predict_on_batch(x)
                    truth.append(y['stars'].numpy())
                    predictions.append(tf.argmax(pred['stars'], axis=-1).numpy())
                predictions=np.array(predictions).flatten()
                truth = np.array(truth).flatten()
                reports[params] = classification_report(truth, predictions)
                print(reports[params])
                save_report_histories()

In [None]:
# Persist each result dict to its own pickle file.
save_pickle('reports.pkl', reports)
save_pickle('histories.pkl', histories)

In [None]:
# Bare attribute access: this only displays the bound `shuffle` method; nothing is shuffled.
dataset['train'].shuffle

In [None]:
# Take one training batch and run the current `baby` model on it.
for x, y in dataset['train']:break
truth = y['stars']
# NOTE(review): make_dataset only emits 'review-ix', 'review-embed' and the custom feature
# keys; a 'review' key is not among them — confirm which dataset this cell was run against.
pred = baby(x['review'])['stars']

In [None]:
# NOTE(review): uses whatever `truth` / `predictions` were last assigned in the kernel;
# the previous cell rebinds `truth` but not `predictions` — confirm they still match.
print(classification_report(truth, predictions))