In [None]:
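#Prerequisite: download the GloVe vectors from http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip,
#unzip the archive, and upload glove.840B.300d.txt to the GCS bucket the embedding loader reads from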

In [1]:
#set random seeds
from numpy.random import seed
seed(1)
from tensorflow.random import set_seed
set_seed(1)

import pandas as pd
import numpy as np

#machine learning
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras import layers 
from tensorflow import keras
from sklearn.model_selection import train_test_split

#accessing files
from google.cloud import storage
import os

#display charts/images
import matplotlib.pyplot as plt

#don't need
# from tensorflow.python.keras.preprocessing import sequence
# from tensorflow.python.keras.preprocessing import text
# import tensorflow_hub as hub

import time
import json


2.3.0


In [2]:
# Shared configuration for the dataset, model, training loop and inference.
params = dict(
    image_size=[256, 256],        # target [height, width] used when resizing raw images
    vocab_size=10000,             # embedding input size and width of the softmax output
    text_input_length=49,         # length of the token sequence fed to the model
    nodes=256,                    # units of the Dense/LSTM layers
    tokenizer_start_index=58,     # index of tokenizer to signal sequence start
    tokenizer_end_index=57,       # index of tokenizer to signal sequence end
    epochs=15,                    # number of passes over the training data
    version=1,                    # suffix of the saved model file name
    embedding_dim=300,            # output dimension of the embedding layer
)

In [3]:
try:
    # In Colab: authenticate interactively; the storage client then uses the
    # default credentials, so none are passed explicitly.
    from google.colab import auth
    auth.authenticate_user()
    credentials = None

except ModuleNotFoundError:
    # Outside Colab: load a service-account key from disk. The location can be
    # overridden with the standard GOOGLE_APPLICATION_CREDENTIALS environment
    # variable; the original local path is kept as the fallback.
    from google.oauth2 import service_account

    key_path = os.environ.get(
        'GOOGLE_APPLICATION_CREDENTIALS',
        '/Users/jeremiahherberg/Downloads/hateful-memes-af65c70c1b79.json')
    credentials = service_account.Credentials.from_service_account_file(key_path)

client = storage.Client(project='hateful-memes', credentials=credentials)

In [4]:
bucket = 'jh_coco_2014'
client = storage.Client(project='hateful-memes', credentials=credentials)

# Build the gs:// URI of every object stored under the 'coco2014' prefix.
# The object name is read from the blob's `name` attribute instead of being
# parsed out of its repr, and the URI is formatted explicitly because
# os.path.join uses the platform path separator, which is not valid for URIs
# on every OS.
tfrecords = ['gs://{}/{}'.format(bucket, blob.name)
             for blob in client.list_blobs(bucket, prefix='coco2014')]

In [5]:
tfrecords

['gs://jh_coco_2014/coco2014_1_of_7.tfrecord',
 'gs://jh_coco_2014/coco2014_2_of_7.tfrecord',
 'gs://jh_coco_2014/coco2014_3_of_7.tfrecord',
 'gs://jh_coco_2014/coco2014_4_of_7.tfrecord',
 'gs://jh_coco_2014/coco2014_5_of_7.tfrecord',
 'gs://jh_coco_2014/coco2014_6_of_7.tfrecord',
 'gs://jh_coco_2014/coco2014_7_of_7.tfrecord']

In [6]:
# Pick a distribution strategy: TPU when one can be resolved, otherwise the
# default strategy.
try:
    # No arguments are needed when the TPU_NAME environment variable is set.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raises ValueError when no TPU can be found.
    tpu = None

if not tpu:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [7]:
def decode_example(example):
    '''
    decodes single tfexample from TFrecord file

    Only the features that are actually returned are parsed. The raw JPEG
    stored under 'raw_image' used to be decoded, resized and normalised here
    and then discarded; that dead work (and its dependency on the global
    params dict) has been removed.

    args:
        example: scalar string tensor holding one serialized tf.train.Example
    returns:
        image_features: float32 tensor deserialized from the 'image' feature
            (presumably shaped (64, 2048) to match the model's features
            input - TODO confirm against the TFRecord writer)
        text: float32 tensor, the int32 token sequence stored in the 'text'
            feature cast to float32
    '''
    features = {'text': tf.io.FixedLenFeature([], tf.string),
                'image': tf.io.FixedLenFeature([], tf.string)}
    single_example = tf.io.parse_single_example(example, features)

    # Both features were written as serialized tensors, hence parse_tensor.
    text = tf.io.parse_tensor(single_example['text'], out_type=tf.int32)
    text = tf.cast(text, tf.float32)
    image_features = tf.io.parse_tensor(single_example['image'], out_type=tf.float32)
    return image_features, text


    
    

In [8]:
def combine(image, text):
    '''
    expands one (image, caption) pair into one training row per predicted word

    Row k of the returned text holds the first k+1 caption tokens followed by
    zeros, and row k of y holds token k+1, i.e. the word to predict from that
    prefix. The image tensor is repeated along axis 0 once per row.

    args:
        image: tensor whose axis 0 is the batch axis; copies of it are
            concatenated along that axis
        text: token tensor, assumed to be shaped (1, 49) with the non-zero
            tokens first and zero padding after them - TODO confirm
    returns:
        image: the input image concatenated with itself once per loop iteration
        txt_out: the growing caption prefixes, one row per iteration plus the
            initial one-token row
        y: text[:, 1:WORDS] reshaped to a single column
        words: int32 scalar, number of non-zero entries in text

    NOTE(review): when text has fewer than two non-zero tokens the loop does
    not run, so txt_out keeps one row while y is empty - confirm that every
    caption carries at least a start and an end token.
    NOTE(review): img and text_out change shape on every iteration; the
    notebook calls this function eagerly from the training loop. Running it
    inside a graph would presumably require shape_invariants - verify before
    moving it into ds.map.
    '''
    WORDS = tf.math.count_nonzero(text, dtype=tf.int32)
    COUNTER = tf.constant(0, dtype=tf.int32)
    y  = tf.reshape(text[:,1:WORDS], (-1,1)) # targets: every non-zero token after the first, as a column

    # Start from an all-zero row and copy in the first token of the caption.
    initial_Xtext = tf.zeros((1, 49))
    initial_Xtext = tf.concat([initial_Xtext[:, :COUNTER], 
                               text[:, COUNTER:COUNTER+1], 
                               initial_Xtext[:, COUNTER+1:]], axis=-1)
    
    def condition(counter, img, img2, txt, ini_text, text_out, words ):
        # Keep looping while counter is below the non-zero token count minus 2.
        return tf.less(counter, words - 2)
    
    def body(counter, img, img2, txt, ini_text, text_out, words):
        
        # img accumulates copies; img2 is the untouched original being appended.
        img = tf.concat([img, img2], axis=0)

        counter = tf.add(counter, 1) # advance to the next token position


        # Copy token `counter` of the caption into the running prefix row ...
        ini_text = tf.concat([ini_text[:, :counter], 
                              txt[:, counter:counter+1], 
                              ini_text[:, counter+1:]], axis=-1)
        # ... and append that prefix as a new row of the output.
        text_out = tf.concat([text_out, ini_text], axis=0)
    
    

        
        return counter, img, img2, txt, ini_text, text_out, words
    _, image, _, _, _, txt_out, words= tf.while_loop(condition, 
                                                     body, 
                                                     [COUNTER, image, 
                                                      image, text, 
                                                      initial_Xtext, initial_Xtext,
                                                      WORDS])
    return image, txt_out, y, words
    
    

In [9]:
def create_ds(files, params):
    '''
    builds the tf.data input pipeline over a list of TFRecord files

    args:
        files: list of str, filepaths of TFrecord files to be used in DS
        params: dict, configuration dictionary; accepted for a uniform call
            signature but not read by this function
    returns:
        ds: tf.data.Dataset that yields whatever decode_example returns for
            each record, batched with a batch size of 1 (remainder dropped)
    '''
    return (
        tf.data.TFRecordDataset(filenames=files)
        .map(decode_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .batch(1, drop_remainder=True)
    )
    
    

In [10]:
def download_file(client, bucket, file_name):
    '''
    copies one object out of a GCS bucket into the working directory,
    keeping the object's name as the local file name

    args:
        client: google.cloud.storage.Client object
        bucket: str, name of bucket to download file from
        file_name: str, name of the object and of the local file written
    returns: None
    '''
    client.bucket(bucket).blob(file_name).download_to_filename(file_name)

def create_tokenizer_from_filename(file_name,
                                  client=None,
                                  bucket=None):
    '''
    creates tf.keras.preprocessing.text.tokenizer from a 
    json config file in current working directory

    The file may contain either the tokenizer JSON itself or that JSON
    encoded once more as a JSON string; both layouts are accepted.

    args:
        file_name: str, filename where config json file is located
        client, google.cloud.storage.Client object, default None, if an arg
            is passed, function will first check if file_name exists in current
            directory, and if not, will download an object named file_name
            from the bucket passed into bucket arg
        bucket, str, default None, name of GCS bucket with an object with the
            same name as file_name
    returns:
        tokenizer object
    '''
    if client and not os.path.isfile(file_name):
        download_file(client, bucket, file_name)

    with open(file_name, encoding='utf-8') as file:
        tokenizer_config = json.load(file)

    # tokenizer_from_json expects a JSON *string*. If the file held the plain
    # tokenizer JSON, json.load has already turned it into a dict, so
    # serialize it back before handing it over.
    if not isinstance(tokenizer_config, str):
        tokenizer_config = json.dumps(tokenizer_config)

    return tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_config)

def get_embedding_weights_from_tokenizer_glove(glove_file,
                                              tokenizer,
                                              embedding_dim,
                                              client=None,
                                              bucket=None,
                                              ):
    '''
    gets the weights to use in an embedding layer from a pretained
    model based on the tokenizer used to create sequences that will
    be passed into embedding layer

    Only the vectors of words the tokenizer knows are kept, so the full
    pretrained table is never held in memory.
    
    args:
        glove_file: str, path of pretrained model from current directory;
            one entry per line: the word followed by embedding_dim numbers,
            separated by whitespace
        tokenizer: tf.keras.preprocessing.text.tokenizer object, tokenizer
            that was used to create sequences (only its num_words and
            word_index attributes are read)
        embedding_dim: int, output_dim of embedding layer of pre-trained model
        client, google.cloud.storage.Client object, default None, if an arg
            is passed, function will first check if glove_file exists in current
            directory, and if not, will download an object located at glove_file
            in the bucket passed into bucket arg
        bucket, str, default None, name of GCS bucket with an object with the
            same file name as glove_file
    returns: 
        embedding_weights: numpy array, shaped* (vocab_size, embedding_dim)
            weights that can be used for embedding layer; rows of words that
            are missing from glove_file stay all-zero
            *vocab_size = tokenizer.num_words which is the number of words in
            the tokenizer vocabulary; when num_words is None, 
            len(tokenizer.word_index) + 1 is used instead
    raises:
        ValueError: if a vector of a wanted word cannot be parsed as floats
    '''
    if client and not os.path.isfile(glove_file):
        download_file(client, bucket, glove_file)

    vocab_size = tokenizer.num_words
    if vocab_size is None:
        # Tokenizer built without a word limit: index 0 is reserved, so the
        # table needs one more row than there are words.
        vocab_size = len(tokenizer.word_index) + 1

    # Words whose index fits into the embedding table.
    wanted = {word: idx
              for word, idx in tokenizer.word_index.items()
              if idx < vocab_size}

    embedding_weights = np.zeros((vocab_size, embedding_dim))
    with open(glove_file, encoding='utf-8') as file:
        for line in file:
            fields = line.split()
            if len(fields) <= embedding_dim:
                continue  # blank or malformed line: no room for word + vector
            # Some entries (e.g. ". . .") contain spaces, so the word is
            # everything before the last embedding_dim fields. Taking only
            # the first field would let such entries overwrite the vector of
            # an unrelated word (e.g. ".").
            word = ' '.join(fields[:-embedding_dim])
            idx = wanted.get(word)
            if idx is not None:
                embedding_weights[idx] = np.asarray(fields[-embedding_dim:],
                                                    dtype='float32')

    return embedding_weights
        

In [11]:
# Build the input pipeline over the TFRecord paths collected in `tfrecords`.
ds = create_ds(tfrecords, params)

In [12]:
def create_model(params, embedding_weights):
    '''
    creates model to caption images

    The model merges a dense projection of the image features with an LSTM
    encoding of the caption prefix and predicts the next word with a softmax
    over the vocabulary. The embedding layer is initialised with the given
    pretrained weights and frozen.

    args:
        params: dict with the following keys:
            vocab_size: int, embedding input size and softmax output width
            text_input_length: int, length of the text input sequence
            nodes: int, units of the Dense and LSTM layers
            embedding_dim: int, output dimension of the embedding layer
        embedding_weights: numpy array shaped (vocab_size, embedding_dim),
            weights loaded into the embedding layer
    returns:
        model: keras functional model taking [image features, text] and
            returning a probability distribution over the vocabulary
    '''
    vocab_size = params['vocab_size']
    txt_input_length = params['text_input_length']
    nodes = params['nodes']
    embedding_dim = params['embedding_dim']

    # Image branch: flatten the feature map and project it to `nodes` units.
    image_feature_inp = layers.Input((64, 2048), name='features_input')
    features = layers.Flatten()(image_feature_inp)
    features = layers.Dense(nodes, activation='relu')(features)

    # Text branch: embed the caption prefix and summarise it with an LSTM.
    # A reference to the embedding layer is kept so its weights can be set
    # directly instead of looking the layer up by position in model.layers.
    txt_inp = layers.Input((txt_input_length,), name='text_input')
    embedding_layer = layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
    embedding = embedding_layer(txt_inp)
    sequences = layers.LSTM(nodes)(embedding)

    # Decoder: merge both branches and predict the next word.
    decoder = layers.Add()([features, sequences])
    decoder = layers.Dense(nodes, activation='relu')(decoder)
    output = layers.Dense(vocab_size, activation='softmax')(decoder)
    model = keras.Model([image_feature_inp, txt_inp], output)

    # Load the pretrained vectors and keep them fixed during training.
    embedding_layer.set_weights([embedding_weights])
    embedding_layer.trainable = False

    return model

In [13]:
# Load the caption tokenizer (downloaded from the COCO bucket when missing).
tokenizer = create_tokenizer_from_filename('coco_tokenizer.json', 
                                           client,
                                           'jh_coco_2014')
# Build the embedding matrix from the pretrained GloVe vectors. The dimension
# comes from params so it cannot drift from the one used to build the model.
embedding_weights = get_embedding_weights_from_tokenizer_glove('glove.840B.300d.txt',
                                                               tokenizer,
                                                               params['embedding_dim'],
                                                               client,
                                                               'jh_hateful_memes')


In [14]:
# Create the model, the optimizer and the running-loss metric inside the
# distribution strategy's scope; train_step and loss_function read these
# three names as globals.
with strategy.scope():
    model = create_model(params, embedding_weights)
    optimizer = tf.keras.optimizers.Adam()
    loss_tracker = tf.keras.metrics.Mean(name='loss')

In [15]:
# model.layers

In [16]:
# Per-example cross-entropy on probabilities (the model ends in a softmax);
# reduction is left to loss_function so padding can be masked out first.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction='none')

def loss_function(real, pred):
    '''
    masked sparse categorical cross-entropy, averaged over all positions

    adapted from https://www.tensorflow.org/tutorials/text/image_captioning#model

    args:
        real: tensor of target token indices; entries equal to 0 are treated
            as padding and contribute zero loss
        pred: tensor of predicted probabilities over the vocabulary
    returns:
        loss: scalar tensor, mean of the masked per-position losses. As a
            side effect the value is also added to the global loss_tracker.
    '''
    loss_ = loss_object(real, pred)

    mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), dtype=loss_.dtype)
    # real can carry a trailing singleton axis (e.g. (N, 1)) that the
    # per-example loss (N,) does not have. Multiplying the two as-is would
    # broadcast to (N, N), so align the mask with the loss first.
    mask = tf.reshape(mask, tf.shape(loss_))

    loss = tf.reduce_mean(loss_ * mask)
    #update loss tracker
    loss_tracker.update_state(loss)

    return loss

In [17]:
@tf.function(experimental_relax_shapes=True)
def train_step(image_, text_, y, y_len): # params
    '''
    runs one optimisation step on the global model through the strategy

    args:
        image_: image features passed to the model as its first input
        text_: caption prefixes passed to the model as its second input
        y: target tokens handed to loss_function
        y_len: forwarded to the per-replica step, which does not use it
    returns: None
    '''
    def step(image, text, y_value, y_len_):
        # Forward pass and loss are recorded on the tape.
        with tf.GradientTape() as tape:
            preds = model((image, text), training=True)
            loss = loss_function(y_value, preds)
        variables = model.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

    strategy.run(step, args=(image_, text_, y, y_len))
    # return loss


In [18]:
# Number of training epochs, taken from the shared configuration.
epochs = params['epochs']

In [19]:
# Custom training loop: every dataset element is expanded into per-word
# training rows by combine() and fed to train_step(). The running mean loss
# is printed every 10000 steps and at the end of each epoch, then reset.
for epoch in range(epochs):
    epoch_start = time.time()
    step = 0  # stays 0 if the dataset yields nothing
    for step, (image, text) in enumerate(ds, start=1):
        img, txt, y, y_len = combine(image, text)
        _ = train_step(img, txt, y, y_len)
        if step % 10000 != 0:
            continue
        # Time reported here is the time elapsed since the epoch started.
        time_batch = time.time() - epoch_start
        print('epoch: {}, step:{}, loss: {:.5f}, batch time:{:.3f}'.format(
            epoch + 1, step, loss_tracker.result().numpy(), time_batch))

    time_batch = time.time() - epoch_start
    print('epoch:{}, loss:{:.5f}, time:{:.3f}, steps:{}'.format(
        epoch + 1, loss_tracker.result().numpy(), time_batch, step))
    loss_tracker.reset_states()
        

epoch: 1, step:10000, loss: 4.50011, batch time:359.496
epoch: 1, step:20000, loss: 4.19932, batch time:710.719
epoch: 1, step:30000, loss: 4.07132, batch time:1054.567
epoch: 1, step:40000, loss: 3.99824, batch time:1410.751
epoch: 1, step:50000, loss: 3.95309, batch time:1758.996
epoch: 1, step:60000, loss: 3.89708, batch time:2093.025
epoch:1, loss:3.87875, time:2447.732, steps:69962
epoch: 2, step:10000, loss: 6.84639, batch time:320.466
epoch: 2, step:20000, loss: 5.26412, batch time:665.019
epoch: 2, step:30000, loss: 4.72918, batch time:983.163
epoch: 2, step:40000, loss: 4.44133, batch time:1319.924
epoch: 2, step:50000, loss: 4.25317, batch time:1653.824
epoch: 2, step:60000, loss: 4.11401, batch time:1983.797
epoch:2, loss:4.03174, time:2317.720, steps:69962
epoch: 3, step:10000, loss: 3.74329, batch time:322.744
epoch: 3, step:20000, loss: 3.64490, batch time:646.677
epoch: 3, step:30000, loss: 3.61835, batch time:990.171
epoch: 3, step:40000, loss: 3.58385, batch time:1312.

In [20]:
# Save the trained model locally under a versioned file name, then upload
# that file to the GCS bucket under the same name.
model_path = 'image_caption_model_v{}.h5'.format(params['version'])
model.save(model_path)
client.bucket('jh_hateful_memes').blob(model_path).upload_from_filename(model_path)

In [None]:
def inference(params, image_feature, image, model, tokenizer):
    '''
    uses an image captioning model to generate a caption of an image

    The caption is built greedily: starting from the start token, the most
    probable next token is appended to the text input until the end token is
    predicted or the text input is full. The caption is printed and the image
    is displayed.
    
    args:
        params: dictionary with at least the following keys:
            tokenizer_start_index: int, tokenizer value that signals start
            of caption
            tokenizer_end_index: int, tokenizer value that signals end of
            caption
            text_input_length: int, len of text input of model
        image_feature: array, shaped (1, 64, 2048) output of an image being
            passed through InceptionV3 model without classification layer
        image: the image to display next to the caption, passed to plt.imshow
        model: tensorflow functional model, model to generate caption
        tokenizer: tf.keras.preprocessing.text.tokenizer object, used to turn
            the predicted token indices back into words
    returns: None
    '''
    text_len = params['text_input_length']
    end_index = params['tokenizer_end_index']
    text = np.zeros((1, text_len))
    results = list()
    result = params['tokenizer_start_index']
    for idx in range(text_len):
        # Append the latest token to the prefix and predict the next one.
        text[:, idx] = result
        # Use the image_feature argument; the previous code read an
        # undefined name (image_features) here.
        preds = model((image_feature, text))
        result = tf.argmax(preds[0]).numpy()
        if result == end_index:
            break
        results.append(result)
    results_converted = tokenizer.sequences_to_texts([results])[0]
    print(results_converted)
    plt.imshow(image)
    plt.show()
    
    