In [None]:
import pandas as pd
import numpy as np

#machine learning
import tensorflow as tf
from tensorflow.keras import layers 
from tensorflow import keras
from sklearn.model_selection import train_test_split

#accessing files
from google.cloud import storage
import os

#display charts/images
import matplotlib.pyplot as plt

#don't need
# from tensorflow.python.keras.preprocessing import sequence
# from tensorflow.python.keras.preprocessing import text
# import tensorflow_hub as hub


In [None]:
# Authenticate against Google Cloud.
# On Colab the interactive login is used and `credentials` stays None;
# anywhere else (google.colab is not importable) a service-account key is loaded.
try:
    from google.colab import auth
    auth.authenticate_user()
    credentials = None

except ModuleNotFoundError:
    from google.oauth2 import service_account

    # Take the key location from the environment so the notebook is not tied to
    # one machine; fall back to the original path when the variable is unset.
    key_path = os.environ.get(
        'GOOGLE_APPLICATION_CREDENTIALS',
        '/Users/jeremiahherberg/Downloads/hateful-memes-af65c70c1b79.json')
    credentials = service_account.Credentials.from_service_account_file(key_path)

    client = storage.Client(project='hateful-memes', credentials=credentials)

In [None]:
# Number of examples per TFRecord file (fixed - this will not change).
num_examples_per_tfrecordfile = 850

In [None]:
# List every object in the bucket whose name starts with 'hatefulmemes_'.
bucket = 'jh_hateful_memes_dev'
client = storage.Client(project='hateful-memes', credentials=credentials)
objects = client.list_blobs(bucket, prefix='hatefulmemes_')

# Read the object name from the Blob itself instead of splitting its repr,
# which breaks if a name contains ', ' or the repr format ever changes.
# NOTE(review): entries are bare object names (as before); prefix them with
# f'gs://{bucket}/' if the files are meant to be read straight from GCS - confirm.
tfrecords = [object_.name for object_ in objects]

In [None]:
# tfrecords

In [None]:
# Pick a distribution strategy: TPU when one can be resolved, otherwise the
# TensorFlow default.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable is
    # set (always the case on Kaggle). A ValueError is treated as "no TPU".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default strategy - works on CPU and a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
def preprocess(img):
    '''
    parse one serialized example into model inputs and a label

    args:
        img: serialized example, as passed to tf.io.parse_single_example

    returns:
        (text, image), label
        text: int32 tensor parsed from the 'text' feature
        image: JPEG decoded with 3 channels, resized/padded to 225x225 and
            divided by 255.0
        label: the int64 'label' feature
    '''
    # Every feature is a scalar; all of them are strings except the label.
    feature_spec = {'label': tf.io.FixedLenFeature([], tf.int64)}
    for key in ('text', 'text_lemma', 'text_lemma_no_stopwords',
                'text_no_stopwords', 'image'):
        feature_spec[key] = tf.io.FixedLenFeature([], tf.string)
    example = tf.io.parse_single_example(img, feature_spec)

    # The text was stored as a serialized tensor, the image as JPEG bytes.
    tokens = tf.io.parse_tensor(example['text'], out_type=tf.int32)
    pixels = tf.io.decode_jpeg(example['image'], 3)
    pixels = tf.image.resize_with_pad(pixels, 225, 225) / 255.0
    return (tokens, pixels), example['label']
    
    

In [None]:
def create_train_ds(files, batch_size, tpu=tpu):
    '''
    function to create dataset

    NOTE: not implemented yet - the body is only this docstring, so calling
    the function currently returns None. The contract below is the intended
    behaviour.

    args:
        files: list of str, filepaths of TFrecord files to be used in DS
        batch_size: int, batch size of training/validation step
        tpu: bool, default 'tpu' global variable, True if TPU is being used
            (the default is captured once, when this def statement runs)

    returns:
        ds: tensorflow input pipeline with images, text and labels
        if tpu is True, output of ds is: text, image, label
        if tpu is False, output of ds is: (text, image), label
    '''
    

In [None]:
# `tfrecords` is already a flat list of file names, so it is passed as-is
# instead of being wrapped in a second list.
ds = (
    tf.data.TFRecordDataset(filenames=tfrecords)
    .map(preprocess)  # each element becomes ((text, image), label)
    .batch(3)
)

In [None]:
def create_model(text_length=58, vocab_size=30000, image_size=225):
    '''
    creates model with two inputs and one output

    The layers are the same as in the inline model cell of this notebook;
    the defaults reproduce its hard-coded sizes.

    args:
        text_length: int, number of token ids in each text input
        vocab_size: int, input_dim of the embedding layer
        image_size: int, height and width of the square 3-channel image input

    returns:
        uncompiled keras.Model taking [text, image] and producing a single
        sigmoid output per example
    '''
    # --- text branch: embedding, then two (conv, conv, pool) stages ---
    input_text = layers.Input((text_length,))
    x_text = layers.Embedding(input_dim=vocab_size, output_dim=64,
                              input_length=text_length, mask_zero=True)(input_text)
    for _stage in range(2):
        for _conv in range(2):
            x_text = layers.SeparableConv1D(filters=64, kernel_size=4,
                                            activation='relu', padding='same')(x_text)
        x_text = layers.MaxPooling1D(4, padding='same')(x_text)
    x_text = layers.Flatten()(x_text)
    output_layer_text = layers.Dense(4, activation='sigmoid')(x_text)

    # --- image branch: two conv stages, 64 then 128 filters ---
    input_image = layers.Input((image_size, image_size, 3))
    x_img = input_image
    for n_filters in (64, 128):
        x_img = layers.Conv2D(filters=n_filters, kernel_size=5, padding='same')(x_img)
        x_img = layers.MaxPooling2D(2, 2)(x_img)
        x_img = layers.BatchNormalization()(x_img)
        x_img = layers.ReLU()(x_img)
        x_img = layers.Dense(16, activation='tanh')(x_img)
    x_img = layers.Flatten()(x_img)
    output_layer_image = layers.Dense(4, activation='sigmoid')(x_img)

    # --- merge both branches into one sigmoid output ---
    merged = layers.Concatenate()([output_layer_text, output_layer_image])
    output = layers.Dense(1, activation='sigmoid')(merged)

    return keras.Model([input_text, input_image], output)

In [None]:
#todo - make into function
# Build and compile inside the distribution strategy's scope so the model's
# variables, the metrics and the optimizer are created under the strategy
# selected earlier (needed for the TPU strategy to be used at all; with the
# default strategy this behaves as before).
with strategy.scope():
    # --- text branch ---
    input_text = layers.Input((58,))
    embedding = layers.Embedding(input_dim=30000, output_dim=64, input_length=58, mask_zero=True)(input_text)
    x_text = embedding
    x_text = layers.SeparableConv1D(filters=64, kernel_size=4, activation='relu', padding='same')(x_text)
    x_text = layers.SeparableConv1D(filters=64, kernel_size=4, activation='relu', padding='same')(x_text)
    x_text = layers.MaxPooling1D(4, padding='same')(x_text)
    x_text = layers.SeparableConv1D(filters=64, kernel_size=4, activation='relu', padding='same')(x_text)
    x_text = layers.SeparableConv1D(filters=64, kernel_size=4, activation='relu', padding='same')(x_text)
    x_text = layers.MaxPooling1D(4, padding='same')(x_text)
    x_text = layers.Flatten()(x_text)
    output_layer_text = layers.Dense(4, activation='sigmoid')(x_text)

    # --- image branch ---
    input_image = layers.Input((225, 225, 3))  #todo - make the 225 call a variable
    x_img = input_image
    x_img = layers.Conv2D(filters=64, kernel_size=5, padding='same')(x_img)
    x_img = layers.MaxPooling2D(2, 2)(x_img)
    x_img = layers.BatchNormalization()(x_img)
    x_img = layers.ReLU()(x_img)
    x_img = layers.Dense(16, activation='tanh')(x_img)
    x_img = layers.Conv2D(filters=128, kernel_size=5, padding='same')(x_img)
    x_img = layers.MaxPooling2D(2, 2)(x_img)
    x_img = layers.BatchNormalization()(x_img)
    x_img = layers.ReLU()(x_img)
    x_img = layers.Dense(16, activation='tanh')(x_img)
    x_img = layers.Flatten()(x_img)
    output_layer_image = layers.Dense(4, activation='sigmoid')(x_img)

    # --- merge both branches into a single sigmoid output ---
    x = layers.Concatenate()([output_layer_text, output_layer_image])
    x = layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model([input_text, input_image], x)
    model.summary()

    # Metrics tracked during training. FalsePositives ('fp'), TrueNegatives
    # ('tn'), Precision and Recall were previously listed but are disabled.
    metrics = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalseNegatives(name='fn'),
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.AUC(name='auc'),
    ]
    model.compile(
        optimizer=tf.keras.optimizers.Adam(0.0003),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.01),
        metrics=metrics,
    )

In [None]:
# Train for 4 epochs of 100 steps each; the result of fit() is kept in `history`.
history = model.fit(
    x=ds,
    epochs=4,
    steps_per_epoch=100,
)