In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample,shuffle
from sklearn.model_selection import train_test_split
import requests

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input
from transformers import GPT2Tokenizer, TFGPT2Model
from transformers import BertTokenizer, TFBertForQuestionAnswering
from transformers import pipeline, AutoTokenizer
from tensorflow.keras.layers import Multiply
from tensorflow.keras.utils import to_categorical

In [4]:
# Ask TensorFlow which GPU device (if any) it can see and keep the name for inspection.
device_name = tf.test.gpu_device_name()

In [5]:
# Show the device name obtained from tf.test.gpu_device_name() as this cell's output.
device_name

In [6]:
# Use InceptionV3 model for image feature extraction
# Instantiate CV model feature extractor and freeze layers
# include_top=False drops the classification head and pooling='max' requests a
# globally max-pooled output, so predict() is expected to return one feature
# vector per image.
# NOTE(review): classifier_activation presumably has no effect while
# include_top=False — confirm against the Keras docs.
base_model = tf.keras.applications.InceptionV3(
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    input_shape=(None, None, 3),
    pooling='max',
    classifier_activation="softmax",
)
# Freeze the extractor; it is only used through predict() in this notebook.
base_model.trainable = False

# Use BERT question answering model from Hugging Face
# Download text feature extractor
# `tokenizer` and `bertmodel` are read as globals by the extract_pooled_* helpers.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", )
bertmodel = TFBertForQuestionAnswering.from_pretrained("bert-base-cased")
# NOTE(review): `feature_extraction` is not referenced anywhere else in the
# visible notebook — confirm whether it is still needed.
feature_extraction = pipeline('feature-extraction', model=bertmodel, tokenizer=tokenizer)

In [7]:
import tensorflow.keras.preprocessing.image as image
from skimage import io
 
# - Extract features from image
# - Extract features from question
# - Combine image + question features
imgsize = (600, 600)


def extract_image(image_url):
    """Read one image and return it preprocessed and resized to ``imgsize``.

    The image is read with ``skimage.io.imread``, converted with
    ``img_to_array``, passed through the InceptionV3 ``preprocess_input`` and
    finally resized with ``smart_resize`` so all photos share the same
    height and width.

    Args:
        image_url: Location handed to ``io.imread`` (callers in this notebook
            pass a full image URL).

    Returns:
        The resized, preprocessed image as produced by ``smart_resize``.
    """
    raw_pixels = image.img_to_array(io.imread(image_url))
    scaled = preprocess_input(raw_pixels)
    # Resize last so every photo ends up with the same spatial dimensions.
    return tf.keras.preprocessing.image.smart_resize(scaled, imgsize)


def extract_image_features(n):
    """Run a batch of preprocessed images through the frozen ``base_model``.

    Args:
        n: Batch of images accepted by ``base_model.predict``.

    Returns:
        Whatever ``base_model.predict`` returns for the batch (one pooled
        feature vector per image is expected, given how the model is built).
    """
    return base_model.predict(n)


# Gets the most common answer for a given sample
from scipy import stats as s
def compute_answers(ans):
    """Return the most frequent answer string among one sample's annotations.

    The mode is computed with ``np.unique`` instead of ``scipy.stats.mode``:
    recent SciPy releases reject non-numeric input, and the shape of
    ``mode(...)[0]`` depends on the SciPy version. Ties are resolved in favour
    of the lexicographically smallest answer (``np.unique`` sorts its values
    and ``np.argmax`` picks the first maximum), which matches how the old
    SciPy implementation behaved.

    Args:
        ans: Iterable of dicts, each carrying the answer text under the
            ``'answer'`` key.

    Returns:
        A 1-D NumPy array of length 1 holding the winning answer, so callers
        can keep stacking labels with ``np.vstack`` and reading ``row[0]``.
        An empty array is returned when ``ans`` is empty.
    """
    y = [entry['answer'] for entry in ans]
    if not y:
        return np.array([])

    values, counts = np.unique(y, return_counts=True)
    # Index with a list to keep the (1,) shape downstream code relies on.
    return values[[np.argmax(counts)]]
  

# Gets most common 3000 answers out of given dataset
from collections import Counter
def init_answer_info(data, max_answers=3000):
    """Build the answer bank: the most frequent answers across all samples.

    Args:
        data: Iterable of samples; each sample has an ``'answers'`` list whose
            items carry the answer text under the ``'answer'`` key.
        max_answers: Maximum size of the answer bank. Defaults to 3000, the
            value that used to be hard-coded here.

    Returns:
        ``(num_answers, most_common_words)`` where ``most_common_words`` is
        the list of answers ordered from most to least frequent and
        ``num_answers`` is its length (at most ``max_answers``).
    """
    answer_counts = Counter(
        entry['answer'] for sample in data for entry in sample['answers']
    )
    most_common_words = [
        answer for answer, _ in answer_counts.most_common(max_answers)
    ]
    return len(most_common_words), most_common_words


# function to get embeddings from questions
def _embed_padded_batch(texts, T, overflow_message):
    """Tokenize ``texts``, pad/truncate each to ``T`` tokens and feed the batch to ``bertmodel``.

    Shared implementation behind the three ``extract_pooled_*`` functions.

    Args:
        texts: Iterable of strings to encode.
        T: Fixed sequence length (including ``[CLS]`` and ``[SEP]``).
        overflow_message: Text printed whenever a sequence is longer than
            ``T`` and has to be cut.

    Returns:
        ``np.array`` of the second element of the tuple returned by
        ``bertmodel(..., return_dict=False)``, one row per input text.

    NOTE(review): ``bertmodel`` is created as ``TFBertForQuestionAnswering``,
    so that second element is presumably the end-logits tensor of shape
    (batch, T) rather than a pooled sentence embedding — confirm before
    swapping the model, since later cells depend on the resulting widths.
    """
    id_rows, mask_rows, segment_rows = [], [], []
    for text in texts:
        tokens = ['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]']

        if len(tokens) > T:
            # some are too long so just cut off the ends
            # TODO: fix code to just remove these samples from dataset
            print(overflow_message)
            tokens = tokens[:T]

        padded_tokens = tokens + ['[PAD]'] * (T - len(tokens))
        id_rows.append(tokenizer.convert_tokens_to_ids(padded_tokens))
        mask_rows.append([1 if token != '[PAD]' else 0 for token in padded_tokens])
        segment_rows.append([0] * len(padded_tokens))

    # Build each array once; reshape keeps a (0, T) shape for an empty batch.
    token_ids = np.array(id_rows, dtype=np.int64).reshape(-1, T)
    attn_mask = np.array(mask_rows, dtype=np.int64).reshape(-1, T)
    seg_ids = np.array(segment_rows, dtype=np.int64).reshape(-1, T)

    _, pooled = bertmodel(
        token_ids,
        attention_mask=attn_mask,
        token_type_ids=seg_ids,
        return_dict=False,
    )
    return np.array(pooled)


def extract_pooled_text_embeddings(data, start, stop, dictval='question', T = 45):
    """Embed the text stored under ``dictval`` for the samples ``data[start:stop]``.

    Args:
        data: Sequence of sample dicts.
        start: First sample index (inclusive).
        stop: Last sample index (exclusive).
        dictval: Key of the text field to embed.
        T: Fixed token length; longer sequences are cut to ``T`` and
            ``'except'`` is printed for each of them.

    Returns:
        NumPy array with one embedding row per sample in the slice.
    """
    texts = [vq[dictval] for vq in data[start:stop]]
    return _embed_padded_batch(texts, T, 'except')


# function to get embeddings from answers from the data dictionary. Largely the same as above.
def extract_pooled_answer_embeddings(data, start, stop, dictval='answer', T = 10):
    """Embed the first element of every row in ``data``.

    ``start``, ``stop`` and ``dictval`` are accepted for signature
    compatibility but are not used: every row of ``data`` is embedded and the
    text is always read from ``row[0]``.

    Args:
        data: Iterable of rows whose first element is the answer string.
        start: Unused.
        stop: Unused.
        dictval: Unused.
        T: Fixed token length; longer sequences are cut to ``T`` and
            ``'answer except'`` is printed for each of them.

    Returns:
        NumPy array with one embedding row per row of ``data``.
    """
    texts = [vq[0] for vq in data]
    return _embed_padded_batch(texts, T, 'answer except')


# function to get embeddings from answers in the answer bank in list format. Largely the same as above.
def extract_pooled_answerbank_embeddings(data, start, stop, dictval='answer', T = 10):
    """Embed every string in ``data`` (the answer bank as a flat list).

    ``start``, ``stop`` and ``dictval`` are accepted for signature
    compatibility but are not used.

    Args:
        data: Iterable of answer strings.
        start: Unused.
        stop: Unused.
        dictval: Unused.
        T: Fixed token length; longer sequences are cut to ``T`` and
            ``'answer except'`` is printed for each of them.

    Returns:
        NumPy array with one embedding row per answer string.
    """
    return _embed_padded_batch(list(data), T, 'answer except')

In [8]:
# Function to extract all features and concat into single vector for each sample
def get_feature_vectors(data, start = 0, stop = 50):
    """Build multimodal features and labels for the samples ``data[start:stop]``.

    For every sample the image is loaded via ``extract_image`` and the whole
    batch is passed through ``extract_image_features``; the question text is
    embedded with ``extract_pooled_text_embeddings``; both feature sets are
    concatenated along axis 1 (question features first). The label of each
    sample is the mode of its answers as returned by ``compute_answers``.

    Args:
        data: Sequence of sample dicts with ``'image'``, ``'question'`` and
            ``'answers'`` entries.
        start: First sample index (inclusive).
        stop: Last sample index (exclusive).

    Returns:
        ``(multimodal_features, y)``: the concatenated feature matrix and a
        2-D array with one label row per sample (also for a single-sample
        batch, so callers can always read ``row[0]``).

    Raises:
        IndexError: If ``data[start:stop]`` contains no samples.
    """
    batch = data[start:stop]
    if len(batch) == 0:
        raise IndexError('no samples in data[%s:%s]' % (start, stop))

    # Extract features describing the image. Collect the images first and
    # stack once, instead of re-copying the growing buffer for every image.
    images = [np.asarray(extract_image(img_dir + vq['image'])) for vq in batch]
    img_train = np.stack(images)
    image_feature = extract_image_features(img_train)

    # Extract features describing the question
    question_feature = extract_pooled_text_embeddings(data, start, stop)

    # Create a multimodal feature to represent both the image and question
    multimodal_features = np.concatenate([question_feature, image_feature], axis=1)

    # Each sample has 10 manually entered answers. Get the mode of each sample's answers and use that. For now.
    y = np.vstack([compute_answers(vq['answers']) for vq in batch])

    return multimodal_features, y

In [9]:
# Download training data and create answer bank 
img_dir = "https://vizwiz.cs.colorado.edu//VizWiz_visualization_img/"
split = 'train' 
annotation_file = "https://ivc.ischool.utexas.edu/VizWiz_final/vqa_data/Annotations/%s.json" %split

# Use a timeout so a stalled connection cannot hang the notebook, and fail
# with the HTTP error itself instead of an opaque JSON decoding error when
# the server answers with a non-success status.
split_data = requests.get(annotation_file, allow_redirects=True, timeout=60)
split_data.raise_for_status()
data = split_data.json()

# top_train_answers is the answer bank / list of most common 3000 answers
num_top_answers, top_train_answers = init_answer_info(data)

In [None]:
# extract features from the training samples
# Both accumulators start with one all-zero placeholder row so np.vstack has
# something to append to; the placeholder is removed after the loop.
max_ans_length = 10
X_train = np.zeros([1, 2093])  # 2093 is the length of the final multimodal feature vector
y_train = np.zeros(max_ans_length)

# batch size of 50
# Only samples 0..14999 of `data` are processed.
for start in range(0, 15000, 50):
    print(start)
    stop = start+50
    X_train_iter, y_train_iter = get_feature_vectors(data, start=start, stop=stop)

    # get answer label embeddings
    # extract_pooled_answer_embeddings embeds every row of y_train_iter; the
    # start/stop arguments passed here are not used by that function.
    y_embed = extract_pooled_answer_embeddings(y_train_iter, start, stop, dictval='answer', T = max_ans_length)
    
    # append and save each iteration
    # NOTE: these intermediate checkpoints still contain the placeholder row.
    X_train = np.vstack([X_train, X_train_iter])
    y_train = np.vstack([y_train, y_embed])
    np.save('trainX_data', X_train)
    np.save('trainy_data', y_train)

# delete the top row of np.zeros and save again
X_train = np.delete(X_train, 0, 0)
y_train = np.delete(y_train, 0, 0)

# Final save overwrites the last checkpoint with the placeholder-free arrays.
np.save('trainX_data', X_train)
np.save('trainy_data', y_train)

In [None]:
# repeat above cells for validation and test data

In [10]:
# Clean data

# Remove training samples whose label embedding equals one of the selected answer-bank entries ('unanswerable', 'unsuitable', 'unsuitable image')
def remove_unanswerable_samples(X_train, y_train, embeddings_to_remove = (0, 1, 12), answer_embeddings = None):
    """Drop samples whose label embedding equals a selected answer-bank entry.

    Args:
        X_train: Iterable of feature rows.
        y_train: Iterable of label embeddings, aligned with ``X_train``.
        embeddings_to_remove: Indices into ``answer_embeddings`` naming the
            answers whose samples must be discarded.
        answer_embeddings: Indexable collection of answer-bank embeddings.
            When omitted, the notebook-level ``question_feature`` variable is
            used, which must already be defined at call time.

    Returns:
        ``(new_X, new_y)``: NumPy arrays holding the retained rows, in their
        original order.

    Side effects:
        Prints the number of removed samples.
    """
    if answer_embeddings is None:
        # Backward-compatible default: the answer-bank embeddings kept in the
        # notebook namespace.
        answer_embeddings = question_feature

    blocked = [answer_embeddings[index] for index in embeddings_to_remove]

    count = 0
    new_X = []
    new_y = []
    for X, y in zip(X_train, y_train):
        # any() stops at the first match, so each removed sample is counted
        # exactly once even if it equals several blocked embeddings.
        if any(np.all(y == vec) for vec in blocked):
            count += 1
            continue
        new_X.append(X)
        new_y.append(y)

    print(count)
    return np.array(new_X), np.array(new_y)

# Get embeddings for answer bank
# NOTE: despite its name, `question_feature` holds the answer-bank embeddings;
# remove_unanswerable_samples reads it as a global, so it must be assigned
# before that function is called.
max_answer_length = 10
d = top_train_answers
question_feature = extract_pooled_answerbank_embeddings(d, 0, len(d), dictval='answer', T = max_answer_length)

# these are the embeddings corresponding to 'unanswerable', 'unsuitable', 'unsuitable image'
# NOTE(review): the indices refer to positions in top_train_answers, so they
# presumably change if the answer bank is rebuilt from different data — confirm.
# The call below repeats the literal list instead of passing this variable.
embeddings_to_remove = [0, 1, 12]
# X_train / y_train are overwritten with their filtered versions.
X_train, y_train = remove_unanswerable_samples(X_train, y_train, embeddings_to_remove = [0, 1, 12])

np.save('trainX_data_latest', X_train)
np.save('trainy_data_latest', y_train)

In [139]:
# Reload features, label embeddings and labels from previously saved .npy files;
# this replaces whatever X_train / y_train the kernel currently holds.
X_train = np.load('../input/bert-answer-embeddings/trainX_data_latest.npy')
y_train = np.load('../input/bert-answer-embeddings/trainy_data_latest.npy')
# NOTE(review): no visible cell saves a 'trainlabels_latest' file — confirm
# where it is produced and that its rows line up with X_train / y_train.
y_labels = np.load('../input/bert-answer-embeddings/trainlabels_latest.npy')

In [140]:
# Now create the multimodal model
# The network regresses a max_answer_length-wide answer embedding from one
# multimodal feature vector per sample.
import keras
from keras import layers, Input, Model, optimizers

max_answer_length = 10  # 10 output layer nodes corresponding to 1x10 BERT text embedding

# Each sample is fed as a sequence of length 1, so the LSTM sees a single timestep.
inputs = Input(shape=(1, X_train.shape[1]))
x = layers.Bidirectional(layers.LSTM(100))(inputs)
x = layers.Dropout(0.1)(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.1)(x)
# NOTE(review): the tensor is presumably already 2-D here, which would make
# Flatten a no-op — confirm.
x = layers.Flatten()(x)
# Linear output layer: one unit per dimension of the target embedding.
out = layers.Dense(max_answer_length)(x) 

model = Model(inputs=inputs, outputs=out)
# NOTE(review): 'accuracy' is tracked alongside a mean-squared-error regression
# loss; it is presumably not a meaningful metric for continuous targets — confirm.
model.compile(
  optimizer = 'adam',
  loss=tf.keras.losses.MeanSquaredError(),
  metrics=['accuracy'],
)


model.summary()

In [141]:
# Reshape inputs
# Insert a length-1 sequence axis so X_train matches Input(shape=(1, n_features)).
# NOTE: the two-value unpacking requires X_train to still be 2-D, so this cell
# cannot be re-run after it has reshaped X_train to 3-D.
n1, n2 = X_train.shape
X_train =  X_train.reshape(n1, 1, n2)
y_train =  y_train.reshape(n1, max_answer_length)

# n1, n2 = X_val.shape
# X_val = X_val.reshape(n1, 1, n2)
# y_val_index = y_val_index.reshape(n1,  max_answer_length)

In [None]:
%%timeit
# Train the model
info = model.fit(X_train, y_train, batch_size=1000, epochs=100, )

In [146]:
# preprocess
# Encode every answer-bank string as an integer class and pair it with its
# embedding (question_feature holds the answer-bank embeddings).
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
enc = LabelEncoder()
y = enc.fit_transform(top_train_answers)
X = question_feature

# setup kNN
# With n_neighbors=1 each predicted vector is mapped to the single closest
# answer-bank embedding.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)

# make predictions and reverse map with kNN
# Predictions are made on X_train, i.e. on the data the model was fitted on.
predictions = model.predict(X_train)
knn_preds = knn.predict(predictions)
# Turn the integer classes back into answer strings.
final_preds = enc.inverse_transform(knn_preds)
results = final_preds

In [187]:
# Count how often each decoded answer was predicted.
import pandas as pd
a = pd.DataFrame(results)
a.value_counts()

In [None]:
from sklearn.metrics import accuracy_score
# NOTE(review): y_train and predictions are both real-valued embedding arrays
# here, which accuracy_score presumably rejects as continuous targets; the
# intended comparison may be between the true answer strings and `results` —
# confirm before relying on this cell.
accuracy_score(y_train, predictions)

In [113]:
# Display the labels loaded from trainlabels_latest.npy.
y_labels

In [112]:
# Display the answer strings decoded by the kNN step.
results

In [189]:
# All answers
# One list of human answer dicts per sample, in the order of `data`.
gtlist = [x['answers'] for x in data]

# Save the accuracies
acc_list = []
i = 0


# Compute accuracy for each image
# NOTE(review): `results` is derived from X_train, which was filtered by
# remove_unanswerable_samples, while gtlist follows the unfiltered `data`
# order — position i presumably no longer refers to the same sample after the
# first removed one. Confirm the alignment before trusting this number.
for pred in results:

    # Get the GT answer list and preprocess
    gt_ans = gtlist[i] 
    gt_ans = [x['answer'] for x in gt_ans]
    gt_ans = [x.lower() for x in gt_ans]

    # Compute accuracy (compare with at least 3 human answers)
    # Score is min(1, matches / 3): full credit once three annotators agree
    # with the prediction. Ground truth is lower-cased; `pred` is used as is.
    cur_acc = np.minimum(1.0, gt_ans.count(pred)/3.0)

    acc_list.append(cur_acc)
    i +=1

print ('Accuracy: {}'.format(round(np.mean(acc_list), 2)))

## save results to results.csv
# Written without header and without index: one predicted answer per line.
import pandas as pd
df = pd.DataFrame(results)
df.to_csv("results.csv", header = None, index = None)