# Cleaning Data

Increase the sizes of train val and test sets for better accuracy

In [1]:
TRAIN_SIZE = 500
VAL_SIZE = 50

PATH_TO_DATASET = "./multimodal_data/"

In [2]:
import numpy as np
import pandas as pd

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')

# reading all training, validation, and testing files for the multimodal data
train = pd.read_csv(PATH_TO_DATASET + "multimodal_train.tsv", sep = '\t')[:TRAIN_SIZE]
val = pd.read_csv(PATH_TO_DATASET + "multimodal_validate.tsv", sep = '\t')[:VAL_SIZE]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saikr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\saikr\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saikr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saikr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
lemmatizer = WordNetLemmatizer()

def remove_stop_words(tokens):
    no_stop = []
    for token in tokens:
        if token not in stopwords.words('english'):
            no_stop.append(token)
    return no_stop

def lemmatize(tokens):
    lemmatized = []
    for token in tokens:
        lemmatized.append(lemmatizer.lemmatize(token))
    return " ".join(lemmatized)

def data_clean(data):
    print("Total no. of rows in data:", len(data))
    print("Total no. of NaNs in 'clean_title' column:", data['clean_title'].isnull().sum())
    
    data = data[data['clean_title'].notna()]
    data = data[data.image_url.notna()]
    print("Total no. of rows in data after removing NaNs:", len(data))
    
    data.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 
               'Unnamed: 0.1.1.1', 'author', 'created_utc', 'domain', 
               'id', 'linked_submission_id', 'num_comments', 'score', 
               'subreddit', 'title', 'upvote_ratio'], axis=1, inplace=True, errors='ignore')
    
    print("Count of true and false titles:", data['2_way_label'].value_counts())
    print("Ratio of true and false titles:", data['2_way_label'].value_counts(normalize=True))
    
    data['word_count'] = data['clean_title'].str.split().str.len()
    print("Average count of words in true and false titles:", data.groupby('2_way_label')['word_count'].mean())
    
    tokenized_messages = data['clean_title'].str.lower().apply(word_tokenize)
    tokenized_messages = tokenized_messages.apply(remove_stop_words)
    tokenized_messages = tokenized_messages.apply(lemmatize)
    
    data['clean_title'] = tokenized_messages
    return data

In [5]:
train = data_clean(train)
val = data_clean(val)

Total no. of rows in data: 500
Total no. of NaNs in 'clean_title' column: 0
Total no. of rows in data after removing NaNs: 499
Count of true and false titles: 0    290
1    209
Name: 2_way_label, dtype: int64
Ratio of true and false titles: 0    0.581162
1    0.418838
Name: 2_way_label, dtype: float64
Average count of words in true and false titles: 2_way_label
0    5.875862
1    9.483254
Name: word_count, dtype: float64
Total no. of rows in data: 50
Total no. of NaNs in 'clean_title' column: 0
Total no. of rows in data after removing NaNs: 50
Count of true and false titles: 0    29
1    21
Name: 2_way_label, dtype: int64
Ratio of true and false titles: 0    0.58
1    0.42
Name: 2_way_label, dtype: float64
Average count of words in true and false titles: 2_way_label
0     6.137931
1    11.476190
Name: word_count, dtype: float64


# Text Processing with Bert

In [6]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [7]:
bert_model_name = 'bert_en_uncased_L-12_H-768_A-12' 

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [8]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [9]:
text_test = ['this is such an amazing movie!', "HELLO WORLD"]
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_type_ids', 'input_mask', 'input_word_ids']
Shape      : (2, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


# Image Processing with Resnet 50

In [10]:
resnet = tf.keras.applications.resnet50.ResNet50()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5


In [11]:
import requests
import concurrent.futures
import threading

from PIL import Image

thread_local = threading.local()
error_urls = []

def get_session():
    if not hasattr(thread_local, "session"):
        thread_local.session = requests.Session()
    return thread_local.session

def extract_image(row):
    print(f"\r{row[0]}", end="")
    row = row[1]
    session = get_session()
    if row.get("hasImage", False):
        url = row.get("image_url", "")
        if url in ["nan", ""]:
            return
        try:
            with session.get(url, stream=True) as response:
                im = Image.open(response.raw)
                im = im.resize((224,224))
                img = np.asarray(im)
                if(len(img.shape) != 3 or img.shape[-1] != 3):
                    raise Exception("B/W image")
                img_features = tf.reshape(resnet(np.expand_dims(img,0)),(-1)).numpy()
                text = row.get("clean_title")
                text_tokens = bert_preprocess_model([text])['input_word_ids'][0]
                tokens = np.concatenate((text_tokens, img_features), axis=0)
                label = row.get("2_way_label")
                return tokens, label
        except Exception as e:
            error_urls.append(url)
            return
    return

In [12]:
import concurrent.futures
error_urls = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = executor.map(extract_image, train.iterrows())
print(f"\rFailed URLS: {len(error_urls)}")
df = pd.DataFrame(results, columns=["features", "labels"]).dropna()
train_ds = tf.data.Dataset.from_tensor_slices((df['features'].to_list(), df['labels']))

Failed URLS: 27


In [13]:
error_urls = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = executor.map(extract_image, val.iterrows())
print(f"\rFailed URLS: {len(error_urls)}")
df = pd.DataFrame(results, columns=["features", "labels"]).dropna()
val_ds = tf.data.Dataset.from_tensor_slices((df['features'].to_list(), df['labels']))

Failed URLS: 2


In [16]:
train_ds = train_ds.shuffle(TRAIN_SIZE).batch(32)
val_ds = val_ds.shuffle(VAL_SIZE).batch(32)

In [17]:
import tensorflow as tf
class MultiModal(tf.keras.Model):
    
    def __init__(self, combine_alg="max"):
        super().__init__()
        self.dropout = tf.keras.layers.Dropout(0.5)
        self.dense_text = tf.keras.layers.Dense(128)
        self.dense_img = tf.keras.layers.Dense(128)
        self.init_mm_dense()
        self.combine_alg = combine_alg
    
    def init_mm_dense(self):
        self.mm_dense_1 = tf.keras.layers.Dense(512, activation="relu")
        self.mm_dense_2 = tf.keras.layers.Dense(256, activation="relu")
        self.mm_dense_3 = tf.keras.layers.Dense(64, activation="relu")
        self.out = tf.keras.layers.Dense(1, activation="sigmoid")

    def mm_dense(self, inputs, training=False):
        out1 = self.mm_dense_1(inputs)
        if training:
            out1 = self.dropout(out1)
        out2 = self.mm_dense_2(out1)
        if training:
            out2 = self.dropout(out2)
        out3 = self.mm_dense_3(out2)
        if training:
            out3 = self.dropout(out3)
        outputs = self.out(out3)
        return outputs
    
    def combine(self, t1, t2):
        if self.combine_alg == "max":
            return tf.math.maximum(t1, t2)
        elif self.combine_alg == "sum":
            return tf.math.add(t1, t2)
        else:
            return tf.math.add(t1, t2)/2
    
    def call(self, inputs, training=False):
        text_tokens = inputs[:,:128]
        img_tokens = inputs[:,128:]
        text_features = self.dense_text(text_tokens)
        img_features = self.dense_img(img_tokens)
        features = self.combine(text_features, img_features)
        outputs = self.mm_dense(features)
        return outputs        
    

In [18]:
model = MultiModal() 

callbacks = [tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=1e-5,
    patience=2,
    mode='auto',
    restore_best_weights=True
)]
model.compile(
    optimizer="adam", 
    loss=tf.keras.losses.BinaryCrossentropy(), 
    metrics=[tf.keras.metrics.BinaryAccuracy()]
)

In [19]:
model.fit(train_ds, epochs=100, validation_data=val_ds, callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


<keras.callbacks.History at 0x1e40bac0400>

In [20]:
model.evaluate(val_ds)



[0.5226325988769531, 0.6666666865348816]