In [1]:
#set random seeds
from numpy.random import seed
seed(1)
from tensorflow.random import set_seed
set_seed(1)

import pandas as pd
import numpy as np

#machine learning
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras import layers 
from tensorflow import keras
from sklearn.model_selection import train_test_split

#accessing files
from google.cloud import storage
import os

#display charts/images
import matplotlib.pyplot as plt

#don't need
from tensorflow.python.keras.preprocessing import sequence
# from tensorflow.python.keras.preprocessing import text
import tensorflow_hub as hub

import json




2.3.0


In [2]:
# Notebook-wide settings, passed to the helper functions below as `params`.
params = dict(
    image_size=[299, 299],             # target (height, width) handed to resize_with_pad
    text_input=(58,),
    batch_size=512,
    vocab_size=30000,
    examples_per_file=850,             # will not change
    test_examples_per_file=500,
    version=10,                        # model version number
    caption_text_input_length=49,      # length of each generated caption sequence
    caption_model_version=4,           # selects the saved caption model file
    meme_text_length=58,
    caption_embedding_dim=300,
    caption_vocab_size=10000,
    tokenizer_start_index=58,          # index of tokenizer to signal sequence start
    tokenizer_end_index=57,            # index of tokenizer to signal sequence end
)




In [3]:
# Build the Cloud Storage client.
# On Colab the user is authenticated interactively and default credentials are
# used (credentials=None); anywhere else a service-account key file is loaded.
try:
    from google.colab import auth
    auth.authenticate_user()
    credentials = None

except ModuleNotFoundError:

    from google.oauth2 import service_account

    # File location of the GCS private key. GOOGLE_APPLICATION_CREDENTIALS takes
    # precedence so the notebook is not tied to one machine's directory layout;
    # the original hard-coded path is kept as the fallback.
    _gcs_key_path = os.environ.get(
        'GOOGLE_APPLICATION_CREDENTIALS',
        '/Users/jeremiahherberg/Downloads/hateful-memes-af65c70c1b79.json')
    credentials = service_account.Credentials.from_service_account_file(
        _gcs_key_path)

client = storage.Client(project='hateful-memes', credentials=credentials)

In [4]:
def get_list_files_from_bucket(client_, bucket_,
                              prefix_='hatefulmemes_'):
    '''
    gets list of files from bucket with predefined prefix
    
    args:
        client_: google.cloud.storage.Client object
        bucket_: str, name of bucket
        prefix_: str, default 'hatefulmemes_' prefix of file names
    returns:
        list of str, 'gs://<bucket_>/<object name>' paths of every object in
        the bucket whose name starts with prefix_ (empty list if none match)
    '''
    # Read the object name from the blob itself rather than parsing its repr:
    # splitting str(blob) on ', ' truncates any name that contains ', '.
    # The URI is formatted explicitly because os.path.join is OS-dependent and
    # would drop the bucket for a name starting with '/'.
    return ['gs://{}/{}'.format(bucket_, blob.name)
            for blob in client_.list_blobs(bucket_, prefix=prefix_)]

In [5]:
# Paths of the training TFRecord files (default 'hatefulmemes_' prefix).
tfrecords = get_list_files_from_bucket(client_=client, bucket_='jh_hateful_memes')

In [6]:
# tfrecords

In [7]:
# Pick a distribution strategy: TPUStrategy when a TPU can be resolved,
# otherwise TensorFlow's default strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # no TPU could be resolved
    tpu = None

if not tpu:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # connect to the TPU cluster and initialise it before creating the strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [8]:
def _decode_example(example, target_key):
    '''
    shared decoder for a single serialized tfexample from a TFrecord file

    args:
        example: scalar string tensor, one serialized tf.train.Example
        target_key: str, name of the int64 feature returned as the third
        output ('label' for training files, 'id' for test files)
    returns:
        text: tf.int32 tensor parsed from the serialized 'text' feature
        image: float tensor, the JPEG decoded with 3 channels, resized with
        padding to params['image_size'] and scaled from [0, 255] to [-1, 1]
        target: tf.int64 scalar stored under target_key
    '''
    # every text variant stays in the spec so that records missing one of
    # them are still rejected, exactly as before
    features = {target_key: tf.io.FixedLenFeature([], tf.int64),
                'text': tf.io.FixedLenFeature([], tf.string),
                'text_lemma': tf.io.FixedLenFeature([], tf.string),
                'text_lemma_no_stopwords': tf.io.FixedLenFeature([], tf.string),
                'text_no_stopwords': tf.io.FixedLenFeature([], tf.string),
                'image': tf.io.FixedLenFeature([], tf.string)}
    single_example = tf.io.parse_single_example(example, features)

    # only 'text' is returned, so the other text variants are not parsed
    text = tf.io.parse_tensor(single_example['text'], out_type=tf.int32)
    image = tf.io.decode_jpeg(single_example['image'], 3)
    image = tf.image.resize_with_pad(image, *params['image_size'])
    # map pixel values from [0, 255] to [-1, 1]
    image = image / 127.5 - 1
    return text, image, single_example[target_key]


def decode_example_train(example):
    '''
    decodes single tfexample from a training TFrecord file

    returns:
        text, image, label (label is the int64 'label' feature)
    '''
    return _decode_example(example, 'label')


def decode_example_test(example):
    '''
    decodes single tfexample from a test TFrecord file

    returns:
        text, image, id (test files carry an int64 'id' feature instead of
        a label; it is returned in the label position)
    '''
    return _decode_example(example, 'id')



In [9]:
def create_ds(files, params, train=True, test_examples=1000):
    '''
    function to create dataset for training/validation or for test files
    
    args:
        files: list of str, filepaths of TFrecord files to be used in DS
        params: dict with the following key:
            examples_per_file: int, number of examples in each TFrecord file
            (only read when train is True)
        train, bool, default True, indicator if the DS is for training
        test_examples, int: default 1000, batch size used when train is False
    returns:
        ds: tensorflow input pipeline; each element is the 3-tuple
            text, image, label (for test files the third item is the
            example id rather than a label).
            train=True: batches hold examples_per_file * len(files) examples
            and an incomplete final batch is dropped.
            train=False: batches hold up to test_examples examples.
        ds_batches: int, currently always 10
    '''
    decode_fn = decode_example_train if train else decode_example_test

    ds = tf.data.TFRecordDataset(filenames=files)
    # decode in parallel on both paths; map keeps element order by default
    ds = ds.map(decode_fn,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if train:
        # one batch is sized to hold every training example
        batch_size = params['examples_per_file'] * len(files)
        ds = ds.batch(batch_size, drop_remainder=True)
    else:
        ds = ds.batch(test_examples)

    # NOTE(review): hard-coded; the original formula
    # (len(files) * file_size) // batch_size was commented out. Callers in
    # this notebook ignore the value - confirm before relying on it.
    ds_batches = 10
    return ds, ds_batches
    
    

In [10]:
def download_file(client, bucket, file_name):
    '''
    copies one object out of a GCS bucket into the working directory,
    saving it under the same name it has in the bucket

    args:
        client: google.cloud.storage.Client object
        bucket: str, name of the bucket that holds the file
        file_name: str, object name in the bucket and local destination name
    returns: None
    '''
    client.bucket(bucket).blob(file_name).download_to_filename(file_name)

In [11]:
def get_caption_model(params, client, bucket):
    '''
    loads the pretrained image caption model and builds an InceptionV3
    model with its last layer removed

    args:
        params: dict with the following key:
            caption_model_version: int, version of the saved caption model
        client: google.cloud.storage.Client object
        bucket: str, bucket the saved caption model is downloaded from
    returns:
        caption_model: keras model loaded from the saved .h5 file
        mdl: keras model from the InceptionV3 input to the output of its
        second-to-last layer
    '''
    saved_model_file = 'image_caption_model_v{}.h5'.format(
        params['caption_model_version'])

    # fetch the saved model only when there is no local copy yet
    if not os.path.isfile(saved_model_file):
        download_file(client, bucket, saved_model_file)
    caption_model = tf.keras.models.load_model(saved_model_file)

    # full InceptionV3 (classifier head included), then cut off the last layer
    inception = tf.keras.applications.InceptionV3(include_top=True, input_shape=(299, 299, 3))
    feature_extractor = tf.keras.Model(inception.input,
                                       inception.layers[-2].output)

    return caption_model, feature_extractor

In [12]:
def get_image_captions(params, images, image_texts):
    '''
    generates a caption sequence for every image, one image at a time

    For each image the features come from get_image_features; the caption is
    then built token by token: the sequence so far is passed to get_capt and
    the returned index is written into the next position. Generation stops
    once get_capt returns the end index (which is not stored) or the sequence
    is full; unused positions stay 0.

    args:
        params: dictionary with at least the following keys:
            caption_text_input_length: int, length of captions
            tokenizer_start_index: int, value to signal start of caption
            tokenizer_end_index: int, value to signal end of caption

        images: tensor of images to be captioned, indexed along its first
        axis (original notes: tf.float32, shaped (None, 299, 299, 3), pixel
        values normalized to -1 to 1)
        image_texts: tensor with the text of each image, indexed along its
        first axis in the same order as images

    returns:
        captions: list of lists of float, one caption sequence of length
        params['caption_text_input_length'] per image
        texts: list with one entry per image, the image text converted to
        (nested) python lists
    '''
    max_len = params['caption_text_input_length']
    end_token = params['tokenizer_end_index']

    captions, texts = [], []
    for i in range(len(images)):
        # add a leading batch axis of size 1 before extracting features
        features = get_image_features(tf.expand_dims(images[i], axis=0))

        tokens = np.zeros((max_len))
        next_token = params['tokenizer_start_index']
        for pos in range(max_len):
            tokens[pos] = next_token
            next_token = get_capt(features, tokens).numpy()[0] #.values[0]
            if next_token == end_token:
                break

        captions.append(tokens.tolist())
        texts.append(image_texts[i].numpy().tolist())
    return captions, texts 

@tf.function
def get_image_features(image):
    '''
    runs the global feature_model on image through strategy.run and
    returns whatever strategy.run returns
    '''
    def _extract(batch):
        return feature_model(batch)
    return strategy.run(_extract, args=(image,))
        
@tf.function
def get_capt(img, txt):
    '''
    runs one prediction step of the global caption_model through
    strategy.run and returns the argmax over the last axis of its output
    '''
    def _predict(image_, text_):
        '''
        evaluates the caption model on one image / text pair
        '''
        # the text gets a leading axis of size 1 before it is fed to the model
        batched_text = tf.expand_dims(text_, axis=0)
        scores = caption_model((image_, batched_text))
        return tf.argmax(scores, axis=-1, name='model_prediction')
    return strategy.run(_predict, args=(img, txt))
    

In [13]:
# Training input pipeline; the second return value is not used here.
dataset, _ = create_ds(files=tfrecords, params=params, train=True)

In [14]:
# Re-seed NumPy and TensorFlow, then create both models inside the
# distribution strategy's scope.
np.random.seed(1)
tf.random.set_seed(1)
with strategy.scope():
    caption_model, feature_model = get_caption_model(
        params=params, client=client, bucket='jh_hateful_memes')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


In [15]:
# Caption the first batch of the training dataset only (at most one batch
# is consumed).
for text, image, label in dataset.take(1):
    labels = label
    image_captions, texts = get_image_captions(params, image, text)

 

In [16]:
def upload_inference_csv(params, text_list, img_caption_list, label_list,
                        bucket_, client, prefix):
    '''
    writes meme texts, image captions and labels to three CSV files in the
    working directory and uploads each file to a bucket under the same name

    file names follow '<prefix>_caption_model_<text|captions|label>_v<version>.csv'

    args:
        params, dictionary with the following key:
            caption_model_version: int, image caption version number
        text_list: array of meme text sequences
        img_caption_list: array of image caption sequences
        label_list: list, image labels
        bucket_: str, bucket to upload the CSV files to
        client: google.cloud.storage.Client object
        prefix: str, prefix put at the beginning of each file name
    
    returns: None
    '''
    version = params['caption_model_version']
    outputs = [
        (text_list, '{}_caption_model_text_v{}.csv'),
        (img_caption_list, '{}_caption_model_captions_v{}.csv'),
        (label_list, '{}_caption_model_label_v{}.csv'),
    ]

    # write all three files locally before any upload starts
    csv_files = []
    for rows, name_template in outputs:
        csv_name = name_template.format(prefix, version)
        pd.DataFrame(rows).to_csv(csv_name, index=False)
        csv_files.append(csv_name)

    target_bucket = client.bucket(bucket_)
    for csv_name in csv_files:
        target_bucket.blob(csv_name).upload_from_filename(csv_name)
    
    

In [17]:
# Write the training texts, captions and labels to CSV and upload them.
upload_inference_csv(params=params,
                     text_list=texts,
                     img_caption_list=image_captions,
                     label_list=labels,
                     bucket_='jh_hateful_memes',
                     client=client,
                     prefix='training')

In [18]:
# NOTE: rebinds `tfrecords` - from here on it holds the files of the
# 'jh_hateful_memes_test' bucket, not the training files.
tfrecords = get_list_files_from_bucket(client_=client,
                                       bucket_='jh_hateful_memes_test')

In [19]:
# Input pipeline over the test files (train=False selects the test decoder).
test_ds, test_steps = create_ds(files=tfrecords, params=params, train=False)

In [20]:
# Caption every batch of test_ds. The dataset is batched in fixed-size chunks,
# so stopping after the first batch would silently drop any examples beyond
# that chunk; results are accumulated across all batches instead.
image_captions, texts, label_batches = [], [], []
for text, image, label in test_ds:
    batch_captions, batch_texts = get_image_captions(params, image, text)
    image_captions.extend(batch_captions)
    texts.extend(batch_texts)
    label_batches.append(label)

# Join the per-batch label tensors back into a single 1-D tensor. An empty
# dataset fails here instead of leaving stale results from an earlier cell
# in `labels` / `texts` / `image_captions`.
labels = tf.concat(label_batches, axis=0)



In [21]:
# Write the texts, captions and labels computed for test_ds to CSV and upload
# them under the 'test_seen' prefix.
upload_inference_csv(params=params,
                     text_list=texts,
                     img_caption_list=image_captions,
                     label_list=labels,
                     bucket_='jh_hateful_memes',
                     client=client,
                     prefix='test_seen')

In [22]:
# Caption the files in the 'jh_hateful_memes_test_unseen' bucket and upload
# the results under the 'test_unseen' prefix.
tfrecords = get_list_files_from_bucket(client, 
                                       bucket_='jh_hateful_memes_test_unseen')
test_ds_unseen, _ = create_ds(tfrecords, params, train=False)

# Consume every batch: the dataset is batched in fixed-size chunks, so
# stopping after the first batch would silently drop any examples beyond it.
image_captions, texts, label_batches = [], [], []
for text, image, label in test_ds_unseen:
    batch_captions, batch_texts = get_image_captions(params, image, text)
    image_captions.extend(batch_captions)
    texts.extend(batch_texts)
    label_batches.append(label)

# Join the per-batch label tensors into a single 1-D tensor; an empty dataset
# fails here rather than uploading stale results from an earlier cell.
labels = tf.concat(label_batches, axis=0)

upload_inference_csv(params, texts, image_captions, labels,
                    'jh_hateful_memes', client, 'test_unseen')

In [1]:
import requests

In [2]:
# Look up the public IP / location details of the machine running this kernel.
# NOTE: the response includes the host's IP address - clear this cell's output
# before sharing the notebook.
url = 'https://ipinfo.io/json'
# requests has no default timeout; without one the cell can hang indefinitely
response = requests.get(url, timeout=10)
response.text


'{\n  "ip": "104.198.217.94",\n  "hostname": "94.217.198.104.bc.googleusercontent.com",\n  "city": "Council Bluffs",\n  "region": "Iowa",\n  "country": "US",\n  "loc": "41.2619,-95.8608",\n  "org": "AS15169 Google LLC",\n  "postal": "51502",\n  "timezone": "America/Chicago",\n  "readme": "https://ipinfo.io/missingauth"\n}'