In [1]:
import functools
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf
from google.cloud import storage
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text


In [2]:
from google.oauth2 import service_account

# Location of the GCS service-account private key.
# The GOOGLE_APPLICATION_CREDENTIALS environment variable takes precedence so
# the notebook is not tied to a single machine; the original local path is
# kept as the fallback so existing setups keep working unchanged.
gcs_key_path = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/jeremiahherberg/Downloads/hateful-memes-af65c70c1b79.json')

credentials = service_account.Credentials.from_service_account_file(gcs_key_path)

# client is passed explicitly to the upload helpers further down
client = storage.Client(project='hateful-memes', credentials=credentials)


In [3]:
def load_jsonl_file(file_path):
    '''
    loads jsonl file and creates a list of dicts

    args:
        file_path: str, path of jsonl file to load

    returns: list with one decoded object per non-blank line of the file
    located at file_path, in file order (an empty file gives an empty list)

    raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON
    '''
    # JSON text is UTF-8, so do not depend on the platform default encoding
    with open(file_path, encoding='utf-8') as file:
        # iterate the handle directly instead of copying every line into a
        # list first; whitespace-only lines (e.g. a trailing empty line) are
        # skipped because json.loads would reject them
        return [json.loads(json_line) for json_line in file if json_line.strip()]
        

In [4]:
# Load both splits for interactive inspection (dev first, then train).
# main() re-reads its dataset from disk, so it does not rely on these names.
dev_ds, train_ds = [load_jsonl_file(split_path)
                    for split_path in ('dev.jsonl', 'train.jsonl')]

In [5]:
def int_feature(int_):
    '''
    wraps a single integer in an int64 feature for use in a TFexample

    args:
        int_: int, value to store in the feature

    returns: tf.train.Feature whose int64_list holds exactly [int_]
    '''
    wrapped_value = tf.train.Int64List(value=[int_])
    return tf.train.Feature(int64_list=wrapped_value)
    

In [24]:
def text_feature(text):
    '''
    creates a bytes feature from the first row of a tokenized text to be
    used in a TFexample

    args:
        text: indexable whose first element is stored; in this notebook it is
        the array returned by tokenize(). Only text[0] is used.
        (inside this function the name hides the keras `text` module)

    returns: feature holding the serialized tensor of text[0], which can be
    used in a TFexample
    '''
    first_row = text[0]
    # serialize_tensor gives a string tensor; .numpy() unwraps it to raw bytes
    payload = tf.io.serialize_tensor(first_row).numpy()
    bytes_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=bytes_list)

In [25]:
def imageString_feature(img_string):
    '''
    wraps a bytestring in a bytes feature for use in a TFexample

    meant for storing encoded images in TFrecords, but nothing here is
    image specific: any bytestring is accepted

    args:
        img_string: bytestring, content to store in the feature

    returns: tf.train.Feature whose bytes_list holds exactly [img_string]
    '''
    wrapped_bytes = tf.train.BytesList(value=[img_string])
    return tf.train.Feature(bytes_list=wrapped_bytes)

In [26]:
def get_imgBytestring_from_filePath(path):
    '''
    converts an image file into a bytestring

    the file is read as-is in binary mode; no image decoding is done

    args:
        path: str, file path of image file that will be
        converted into bytestring

    returns: bytestring with the full content of the file
    '''
    # the context manager closes the handle as soon as the read is done; this
    # function runs once per meme, so handles must not be left to the
    # garbage collector
    with open(path, 'rb') as img_file:
        return img_file.read()
    

In [27]:
@functools.lru_cache(maxsize=None)
def _load_lemma_pipeline(language):
    '''loads the spaCy pipeline for `language` once and reuses it afterwards'''
    return spacy.load(language)


def transform_to_lemma(doc, remove_stop=False, language='en_core_web_sm'):
    '''
    transforms each word in a text to lemma words

    args:
        doc: str, text to be transformed

        remove_stop, Bool, default: False, if set to True, tokens flagged
        as stopwords are left out of the result

        language: str, default: 'en_core_web_sm', name of the spaCy
        pipeline handed to spacy.load

    returns:
        lemma_text: str, lemma of every kept token, each preceded by one
        space (so a non-empty result starts with a space; an input without
        kept tokens gives '')
    '''
    # loading a pipeline is far more expensive than running it, and this
    # function is called for every meme, so the loaded pipeline is cached
    nlp = _load_lemma_pipeline(language)
    kept_lemmas = (
        token.lemma_
        for token in nlp(doc)
        if not (remove_stop and token.is_stop)
    )
    # one join instead of rebuilding the whole string for every token
    return ''.join(' {}'.format(lemma) for lemma in kept_lemmas)
    

In [28]:
@functools.lru_cache(maxsize=None)
def _load_stopword_pipeline(language):
    '''loads the spaCy pipeline for `language` once and reuses it afterwards'''
    return spacy.load(language)


def remove_stopwords(doc, language='en_core_web_sm'):
    '''
    removes stopwords from a text

    args:
        doc: str, text to be transformed

        language: str, default: 'en_core_web_sm', name of the spaCy
        pipeline handed to spacy.load

    returns:
        no_stops: str, every token not flagged as a stopword, each preceded
        by one space (so a non-empty result starts with a space; an input
        without kept tokens gives '')
    '''
    # loading a pipeline is far more expensive than running it, and this
    # function is called for every meme, so the loaded pipeline is cached
    nlp = _load_stopword_pipeline(language)
    # one join instead of rebuilding the whole string for every token
    return ''.join(' {}'.format(word) for word in nlp(doc) if not word.is_stop)

In [29]:
def tokenize(string, tokenizer, padding):
    '''
    encodes one string with a tokenizer and pads/truncates it to a fixed length

    args:
        string: str, text to transform into a sequence
        tokenizer: keras.preprocessing.text.Tokenizer object
        padding: int, length of the output sequence, passed to pad_sequences
        as maxlen. pad_sequences is otherwise left on its defaults: shorter
        sequences get zeros added at the beginning, longer ones are truncated

    returns: output of sequence.pad_sequences for the one encoded string
    (the string is wrapped in a list, so the result holds a single sequence
    of length padding)
    '''
    return sequence.pad_sequences(
        tokenizer.texts_to_sequences([string]),
        maxlen=padding,
    )

In [30]:
def create_TFexample(dict_, tokenizer, padding):
    '''
    builds one TFexample for a meme, holding these features:
        image: raw bytes of the image file
        label: int
        id: int
        text: tokenized original text
        text_lemma: tokenized lemmatized text
        text_lemma_no_stopwords: tokenized lemmatized text, stopwords removed
        text_no_stopwords: tokenized text, stopwords removed

    args:
        dict_: dictionary with the following keys:
            id: int, id of image
            img: str, file path of image
            label: int, indicator if meme is hateful or not
            text: str, text on meme
        tokenizer: keras.preprocessing.text.Tokenizer object used to encode
        every text variant
        padding: int, length of each encoded text; shorter texts get zeros
        added at the beginning, longer ones are truncated

    returns: TFexample with above features
    '''
    def encode(string):
        # tokenize + pad one text variant, then wrap it as a bytes feature
        return text_feature(tokenize(string, tokenizer, padding))

    # filled in the same order as the feature list in the docstring
    features = {}
    features['image'] = imageString_feature(
        get_imgBytestring_from_filePath(dict_['img']))
    features['label'] = int_feature(dict_['label'])
    features['id'] = int_feature(dict_['id'])
    features['text'] = encode(dict_['text'])
    features['text_lemma'] = encode(transform_to_lemma(dict_['text']))
    features['text_lemma_no_stopwords'] = encode(
        transform_to_lemma(dict_['text'], remove_stop=True))
    features['text_no_stopwords'] = encode(remove_stopwords(dict_['text']))

    return tf.train.Example(features=tf.train.Features(feature=features))

In [31]:
def upload_TFrecord_gcs(filepath, client, bucket):
    '''
    uploads a local file to a gcs bucket

    written for TFrecord files, but nothing here depends on the file type

    args:
        filepath: str, path of file to be uploaded; also used as the name
        of the blob inside the bucket
        client: gcs google.storage.Client object
        bucket: str, existing gcs bucket to upload file to

    returns:
        None
    '''
    # blob name and local path are the same string on purpose
    client.bucket(bucket).blob(filepath).upload_from_filename(filepath)

In [32]:
def create_TFrecord(meme_list,
                    start_idx, end_idx,
                    tokenizer, padding,
                    tfr_file_num, ttl_tfr_files=10):
    '''
    writes one TFrecord file from a contiguous range of meme_list

    the file is created in the current working directory and named
    hatefulmemes_<tfr_file_num>_of_<ttl_tfr_files>.tfrecord

    args:
        meme_list: list, items are handed one by one to create_TFexample
        start_idx: int, index of the first item to write
        end_idx: int, index of the last item to write (inclusive)
        tokenizer: tokenizer forwarded to create_TFexample
        padding: int, text length forwarded to create_TFexample
        tfr_file_num: number of this file, only used in the file name
        ttl_tfr_files: default 10, total number of files, only used in
        the file name

    returns:
        TFrecord_filepath, str, file path of newly created tfrecord file
    '''
    name_template = 'hatefulmemes_{}_of_{}.tfrecord'
    TFrecord_filepath = name_template.format(tfr_file_num, ttl_tfr_files)

    with tf.io.TFRecordWriter(TFrecord_filepath) as writer:
        # end_idx is inclusive, hence the + 1
        for position in range(start_idx, end_idx + 1):
            example = create_TFexample(meme_list[position], tokenizer, padding)
            serialized = example.SerializeToString()
            writer.write(serialized)

    return TFrecord_filepath

In [33]:
def calc_idxs(meme_list, num_splits=10):
    '''
    calculate start and end index's of a list in order to split up a list
    evenly

    args:
        meme_list: list, list that needs to be split up
        num_splits, int, default 10, number of splits the list needs to be
        split up into

    returns:
        idxs: zip of (start, end) index pairs that evenly split up meme_list
        into num_splits consecutive chunks. end is inclusive. A zip can
        only be iterated once.

    raises:
        ValueError: if num_splits is not positive, or if length of meme_list
        is not evenly divisible by num_splits
    '''
    # 0 would fail in the modulo below with an unrelated error and a negative
    # value would silently produce no chunks, so reject both up front
    if num_splits <= 0:
        raise ValueError('num_splits must be a positive integer')

    len_ = len(meme_list)
    if len_ % num_splits > 0:
        raise ValueError('meme_list must be evenly divisible by num_splits')

    # integer arithmetic throughout: the division is exact (checked above),
    # so there is no need to go through floats and cast back
    chunk_len = len_ // num_splits
    start_idxs = [split * chunk_len for split in range(num_splits)]
    end_idxs = [start + chunk_len - 1 for start in start_idxs]

    idxs = zip(start_idxs, end_idxs)
    return idxs
        

In [34]:
def create_tokenizer(input_ds, top_words, preprocess_fn):
    '''
    builds and fits a keras.preprocessing.text.Tokenizer on a dataset

    for every item the tokenizer is fitted on two texts: the original text
    and the same text after preprocess_fn, so both forms end up in the
    vocabulary

    args:
        input_ds: list of dicts, each dict has the following key:
            'text': str, text that needs to be tokenized
        top_words: int, passed to the Tokenizer as num_words
        preprocess_fn: function, called with the text of every item; its
        result is added to the fitting texts next to the original text

    returns: fitted keras.preprocessing.text.Tokenizer object
    '''
    corpus = []
    for record in input_ds:
        raw_text = record['text']
        # original text first, then its preprocessed form
        corpus.extend((raw_text, preprocess_fn(raw_text)))

    fitted_tokenizer = text.Tokenizer(num_words=top_words)
    fitted_tokenizer.fit_on_texts(corpus)
    return fitted_tokenizer

    

In [35]:
def main(ds_path, client, bucket, num_splits=10, top_words=20000, padding=41, preprocess=transform_to_lemma):
    '''
    creates all TFrecord files for one dataset and uploads them to gcs

    steps:
        1. load the jsonl dataset
        2. fit a tokenizer on it, save it as tokenizer.json and upload it
        3. split the dataset into num_splits equal parts, write one TFrecord
           file per part and upload each file right after it is written

    args:
        ds_path: str, path of the jsonl dataset
        client: gcs google.storage.Client object
        bucket: str, existing gcs bucket that receives all files
        num_splits: int, default 10, number of TFrecord files to create
        top_words: int, default 20000, vocabulary size of the tokenizer
        padding: int, default 41, length of every encoded text
        preprocess: function, default transform_to_lemma, preprocessing
        function used when fitting the tokenizer

    returns:
        None

    raises:
        ValueError: from calc_idxs if the dataset cannot be split evenly;
        the tokenizer file has already been uploaded at that point
    '''
    ds = load_jsonl_file(ds_path)
    tokenizer = create_tokenizer(ds, top_words, preprocess)

    # to_json() output is passed through json.dump once more, so whatever
    # reads tokenizer.json has to json.load it before rebuilding the tokenizer
    serialized_tokenizer = tokenizer.to_json()
    json_file_name = 'tokenizer.json'
    with open(json_file_name, 'w') as json_file:
        json.dump(serialized_tokenizer, json_file)
    upload_TFrecord_gcs(json_file_name, client, bucket)

    # files are numbered from 1
    for file_num, (startIdx, endIdx) in enumerate(calc_idxs(ds, num_splits), start=1):
        TFrecord_path = create_TFrecord(ds, startIdx, endIdx,
                                        tokenizer, padding,
                                        file_num, num_splits)
        upload_TFrecord_gcs(TFrecord_path, client, bucket)
    

In [36]:
# padding: 41 for dev, 58 for train
main(
    ds_path='dev.jsonl',
    client=client,
    bucket='jh_hateful_memes_dev',
    top_words=30000,
    padding=41,
)