In [None]:
import pandas as pd
import numpy as np

#machine learning
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras import layers 
from tensorflow import keras

#accessing files
from google.cloud import storage
import os

#display charts/images
import matplotlib.pyplot as plt

#don't need
# from tensorflow.python.keras.preprocessing import sequence
# from tensorflow.python.keras.preprocessing import text
# import tensorflow_hub as hub


In [None]:
# Shared configuration values for this notebook.
params = dict(
    image_size=[225, 225],        # unpacked into tf.image.resize_with_pad after the image tensor
    text_input=(58,),             # presumably the token-sequence shape the model expects -- TODO confirm
    batch_size=128,               # default batch size of create_test_ds
    vocab_size=30000,
    examples_per_file=850,        # will not change
    test_examples_per_file=500,   # default per-file example count of create_test_ds
)

In [None]:
# Obtain GCS credentials and build the storage client.
# On Colab the user is authenticated interactively and `credentials` stays None;
# anywhere else a service-account key file is loaded.
try:
    from google.colab import auth
    auth.authenticate_user()
    credentials = None

except ModuleNotFoundError:
    # google.colab is not importable, so we are not running on Colab.
    from google.oauth2 import service_account

    # Location of the GCS private key: GOOGLE_APPLICATION_CREDENTIALS wins when it is
    # set and non-empty, otherwise fall back to the original hard-coded location so
    # the notebook still runs unchanged on the author's machine.
    key_path = (os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
                or '/Users/jeremiahherberg/Downloads/hateful-memes-af65c70c1b79.json')
    credentials = service_account.Credentials.from_service_account_file(key_path)

# Created after the try/except so `client` exists on both branches
# (previously it was only defined when not on Colab).
client = storage.Client(project='hateful-memes', credentials=credentials)

In [None]:
def list_gcs_tfrecords(client, bucket, prefix):
    '''
    lists the objects in a GCS bucket whose names start with a prefix

    args:
        client: google.cloud.storage.Client used to list the bucket
        bucket: str, name of the GCS bucket
        prefix: str, only objects whose name starts with this are returned
    returns:
        list of str, one 'gs://<bucket>/<object name>' URI per matching object,
        in the order client.list_blobs yields them
    '''
    # blob.name is the object's path inside the bucket. Reading it directly avoids
    # parsing str(blob), which breaks as soon as an object name contains ', '.
    # The URI is assembled with '/' explicitly because os.path.join would use the
    # local OS separator.
    return [f'gs://{bucket}/{blob.name}'
            for blob in client.list_blobs(bucket, prefix=prefix)]


bucket = 'jh_hateful_memes_test'
client = storage.Client(project='hateful-memes', credentials=credentials)
tfrecords = list_gcs_tfrecords(client, bucket, prefix='hatefulmemes_')

In [None]:
# TPU detection. TPUClusterResolver() needs no parameters when the TPU_NAME
# environment variable is set, which is always the case on Kaggle.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if not tpu:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
def decode_example_testds(example):
    '''
    parses one serialized tf.Example read from a test TFRecord file

    args:
        example: scalar string tensor, a single serialized tf.Example
    returns:
        text: int32 tensor parsed from the serialized 'text' feature
        image: 3-channel image decoded from the 'image' JPEG bytes, resized with
            padding to params['image_size'] and divided by 255.0
        label: the int64 'id' feature of the example (an identifier, not a class label)
    '''
    # Every feature is a scalar; all are strings except the integer id.
    string_keys = ('text', 'text_lemma', 'text_lemma_no_stopwords',
                   'text_no_stopwords', 'image')
    schema = {key: tf.io.FixedLenFeature([], tf.string) for key in string_keys}
    schema['id'] = tf.io.FixedLenFeature([], tf.int64)
    parsed = tf.io.parse_single_example(example, schema)

    # Image: JPEG bytes -> 3 channels -> padded resize -> divide by 255.
    pixels = tf.io.decode_jpeg(parsed['image'], channels=3)
    pixels = tf.image.resize_with_pad(pixels, *params['image_size']) / 255.0

    # Text was stored as a serialized tensor; only the plain 'text' variant is used.
    tokens = tf.io.parse_tensor(parsed['text'], out_type=tf.int32)

    return tokens, pixels, parsed['id']

def no_tpu_output(text, image, label): #needs to be called last
    '''
    regroups a flat dataset element into the (inputs, label) layout

    text, image, label -> (text, image), label

    args:
        text: text component of the dataset element
        image: image component of the dataset element
        label: label component of the dataset element
    returns:
        (text, image), label
        the three values themselves are passed through unchanged
    '''
    model_inputs = (text, image)
    return model_inputs, label
    
    

In [None]:
def create_test_ds(files, batch_size=params['batch_size'], tpu=tpu, file_size=params['test_examples_per_file']):
    '''
    builds the tf.data input pipeline for the test data

    args:
        files: list of str, filepaths of the TFRecord files to read
        batch_size: int, default params['batch_size'], number of examples per batch
        tpu: default is the global 'tpu' variable as it was when this function was
            defined; None means no TPU is in use (it is compared against None,
            it is not a bool)
        file_size: int, default params['test_examples_per_file'],
            number of examples assumed to be in each TFRecord file
    returns:
        ds: batched and prefetched dataset
            if tpu is not None, each element is: text, image, label
            if tpu is None, each element is: (text, image), label
        ds_batches: int, number of batches needed to cover
            len(files) * file_size examples, counting a final partial batch
    '''
    autotune = tf.data.experimental.AUTOTUNE

    ds = tf.data.TFRecordDataset(filenames=files).map(
        decode_example_testds, num_parallel_calls=autotune)
    # consider adding augmentation to image - can't flip(?)
    if tpu is None:
        ds = ds.map(no_tpu_output, num_parallel_calls=autotune)
    ds = ds.batch(batch_size, drop_remainder=False)
    ds = ds.prefetch(autotune)
    # ds = ds.cache() # -- confirm if dataset is small enough to be cached

    # Ceiling division: a non-zero remainder means one extra, partial batch.
    full_batches, leftover = divmod(len(files) * file_size, batch_size)
    ds_batches = full_batches + 1 if leftover > 0 else full_batches
    return ds, ds_batches
    
    

In [None]:
# Build the test pipeline from the TFRecord URIs; test_steps is the batch count.
test_ds, test_steps = create_test_ds(files=tfrecords)

In [None]:
# Load the saved Keras model inside the distribution strategy's scope.
# TODO: get this file to/from a gcs bucket
with strategy.scope():
    model = keras.models.load_model(
        filepath='/Users/jeremiahherberg/Downloads/hateful_memes_v1.h5')

In [None]:
def _as_inputs_and_id(*element):
    '''
    normalises a test_ds element to the (inputs, id) layout

    test_ds yields ((text, image), id) when no TPU is used and the flat
    (text, image, id) when one is; a fixed two-argument lambda raises on the
    flat three-component form.

    args:
        *element: the components of one (batched) test_ds element
    returns:
        (inputs, id): inputs is everything before the last component,
            id is the last component
    '''
    if len(element) == 2:
        # already ((text, image), id)
        return element
    return element[:-1], element[-1]


# The dataset is passed in (inputs, targets) form, which Keras unpacks and whose
# targets it ignores during predict. A bare (text, image) 2-tuple would instead
# be unpacked as (x, y), dropping the image from the model inputs.
# NOTE(review): assumes the model takes (text, image) as its inputs -- confirm.
predictions = model.predict(test_ds.map(_as_inputs_and_id), steps=test_steps)

In [None]:
# Collect the id of every test example, in dataset order, as strings.
# The id is the last component of a test_ds element in both layouts
# ((text, image), id) and (text, image, id), so taking element[-1] works with
# and without a TPU. Every batch is gathered instead of only the first 1000
# examples, so no id is silently dropped when the test set size changes.
prediction_ids = np.concatenate(
    [ids.numpy() for ids in test_ds.map(lambda *element: element[-1])]
).astype('str')

In [None]:
# Assemble the submission table, one row per test example.
# The number of rows is taken from prediction_ids rather than a hard-coded 1000,
# so the cell keeps working when the number of test examples changes.
prediction_dict = {
    'id': prediction_ids,
    # presumably `predictions` is (n_examples, 1); concatenating its rows then
    # gives a flat array of n_examples probabilities -- TODO confirm
    'proba': np.concatenate(predictions),
    # NOTE(review): every row gets label 1 regardless of proba -- confirm this
    # placeholder is intended.
    'label': np.ones(len(prediction_ids), int)
}
submission_ds = pd.DataFrame(prediction_dict)



In [None]:
# Write the submission table to the working directory, without the index column.
submission_ds.to_csv(path_or_buf='submission.csv', index=False)