In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample,shuffle
from sklearn.model_selection import train_test_split
import requests

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input
from transformers import GPT2Tokenizer, TFGPT2Model
from transformers import BertTokenizer, TFBertForQuestionAnswering
from transformers import pipeline, AutoTokenizer
from tensorflow.keras.layers import Multiply
from tensorflow.keras.utils import to_categorical

In [3]:
# Ask TensorFlow for the name of a visible GPU device. The recorded output of
# the next cell is '' (presumably meaning no GPU was detected in that session).
device_name = tf.test.gpu_device_name()

2022-03-30 21:32:00.083582: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Display the device name obtained in the previous cell.
device_name

''

In [5]:
# Image branch: ImageNet-pretrained InceptionV3 without its classification head.
# The spatial dimensions are left open so any image size can be fed in, and
# global max pooling reduces each image to a single feature vector.
base_model = tf.keras.applications.InceptionV3(
    weights="imagenet",
    include_top=False,
    input_shape=(None, None, 3),
    pooling="max",
)
# The network is only used as a fixed feature extractor, so freeze its weights.
base_model.trainable = False

# Text branch: cased BERT tokenizer and question-answering model from Hugging Face.
# NOTE(review): the load log below reports `qa_outputs` as newly initialised,
# i.e. the question-answering head itself has not been trained.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
bertmodel = TFBertForQuestionAnswering.from_pretrained("bert-base-cased")
feature_extraction = pipeline(
    task='feature-extraction',
    model=bertmodel,
    tokenizer=tokenizer,
)

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

Some layers of TFBertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [196]:
import tensorflow.keras.preprocessing.image as image
from skimage import io

## PART 2:
# - Extract features from image
# - Extract features from question
# - Combine image + question features
# Size every image is resized to in extract_image, and the per-image array
# dimensions assumed by get_feature_vectors when batching.
imgsize = (600, 600)

def padding(array, target_len=30):
    """Pad or truncate ``array`` along its first axis to exactly ``target_len``.

    Args:
        array: Array-like with at least one dimension.
        target_len: Required length of the first axis.

    Returns:
        numpy.ndarray whose first axis has length ``target_len``. Shorter
        inputs are zero-padded at the end, longer inputs are cut off. All
        other axes keep their original size.
    """
    array = np.array(array)
    missing = target_len - array.shape[0]
    if missing <= 0:
        return array[0:target_len]

    # Pad axis 0 only. A bare (0, missing) is broadcast to every axis and would
    # also widen the trailing dimensions of multi-dimensional input.
    pad_width = [(0, missing)] + [(0, 0)] * (array.ndim - 1)
    return np.pad(array, pad_width=pad_width, mode='constant')

def extract_image(image_url):
    """Read an image and return it preprocessed and resized to ``imgsize``.

    Args:
        image_url: Location handed to ``skimage.io.imread``.

    Returns:
        The image after ``img_to_array``, InceptionV3 ``preprocess_input`` and
        ``smart_resize`` to ``imgsize``.
    """
    pixels = image.img_to_array(io.imread(image_url))
    scaled = preprocess_input(pixels)

    # Resize last so that every photo ends up with the same dimensions.
    return tf.keras.preprocessing.image.smart_resize(scaled, imgsize)

def extract_image_features(n):
    """Run a batch of preprocessed images through ``base_model``.

    Args:
        n: Image batch accepted by ``base_model.predict``.

    Returns:
        The output of ``base_model.predict`` for the batch.
    """
    return base_model.predict(n)

# Gets the most common answer for a given sample
from scipy import stats as s
def compute_answers(ans):
    """Return the most frequent ``'answer'`` value among ``ans``.

    The mode is computed with a plain counter rather than ``scipy.stats.mode``,
    which is deprecated for (and, in recent SciPy releases, rejects)
    non-numeric data such as answer strings.

    Args:
        ans: Iterable of dicts, each with an ``'answer'`` key.

    Returns:
        A one-element numpy array holding the most common answer, so callers
        can keep indexing the result with ``[0]`` or stacking it row-wise.
        Ties are resolved in favour of the smallest value, matching what
        ``scipy.stats.mode`` reports. An empty array is returned for empty
        input.
    """
    from collections import Counter

    counts = Counter(entry['answer'] for entry in ans)
    if not counts:
        return np.array([], dtype=str)

    top_count = max(counts.values())
    answer = min(label for label, count in counts.items() if count == top_count)
    return np.array([answer])
  

# Gets the most common answers (3000 by default) out of given dataset
from collections import Counter
def init_answer_info(data, top_k=3000):
    """Collect the ``top_k`` most frequent answers across a dataset.

    Args:
        data: Iterable of samples; each sample has an ``'answers'`` list whose
            items are dicts with an ``'answer'`` key.
        top_k: Maximum number of distinct answers to keep (3000 by default).

    Returns:
        Tuple ``(num_answers, most_common_words)`` where ``most_common_words``
        lists the kept answers from most to least frequent and ``num_answers``
        is its length (which is below ``top_k`` when the dataset has fewer
        distinct answers).
    """
    occurence_count = Counter(
        entry['answer'] for sample in data for entry in sample['answers']
    )
    most_common_words = [answer for answer, _ in occurence_count.most_common(top_k)]
    return len(most_common_words), most_common_words

def _bert_pooled_embeddings(texts, T):
    """Encode ``texts`` as fixed-width BERT inputs and run them through ``bertmodel``.

    Each text is tokenised, wrapped in ``[CLS]``/``[SEP]`` and then cut or
    ``[PAD]``-filled to exactly ``T`` tokens. Over-long inputs are simply
    sliced to ``T`` tokens, which can drop the closing ``[SEP]``.

    Args:
        texts: Iterable of strings to encode.
        T: Number of tokens per encoded row.

    Returns:
        numpy array built from the second element of the tuple that
        ``bertmodel`` returns when called with ``return_dict=False``.

    NOTE(review): ``bertmodel`` is created as a ``TFBertForQuestionAnswering``;
    its tuple output is presumably ``(start_logits, end_logits)`` with one
    value per token rather than hidden/pooled states, which would match the
    2078-wide (presumably 2048 image + 30 text) feature rows used elsewhere in
    this notebook. Confirm before treating the result as sentence embeddings.
    """
    token_rows = []
    mask_rows = []
    for text in texts:
        tokens = ['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]']
        # Enforce the fixed width explicitly, using T rather than a literal 30,
        # instead of relying on a failed vstack to detect over-long inputs.
        tokens = tokens[:T]
        padded_tokens = tokens + ['[PAD]'] * (T - len(tokens))

        token_rows.append(tokenizer.convert_tokens_to_ids(padded_tokens))
        mask_rows.append([1 if token != '[PAD]' else 0 for token in padded_tokens])

    # Build each array once; reshape keeps a (0, T) shape for an empty batch.
    token_ids = np.array(token_rows, dtype=np.int64).reshape(-1, T)
    # The mask is cast to int64 like the other inputs.
    attn_mask = np.array(mask_rows, dtype=np.int64).reshape(-1, T)
    # Single-segment input: every token belongs to segment 0.
    seg_ids = np.zeros_like(token_ids)

    _, pooled = bertmodel(token_ids, attention_mask=attn_mask, token_type_ids=seg_ids, return_dict=False)
    return np.array(pooled)


def extract_pooled_text_embeddings(data, start, stop, dictval='question', T = 30):
    """Encode the ``dictval`` text of ``data[start:stop]`` with ``bertmodel``.

    Args:
        data: Sequence of dict samples.
        start: Index of the first sample to encode.
        stop: Index one past the last sample to encode.
        dictval: Key of the text field to read from each sample.
        T: Number of tokens each text is padded or truncated to.

    Returns:
        numpy array with one row per encoded sample.
    """
    texts = [vq[dictval] for vq in data[start:stop]]
    return _bert_pooled_embeddings(texts, T)


def extract_pooled_answer_embeddings(data, start, stop, dictval='answer', T = 6):
    """Encode the first element of every row in ``data`` with ``bertmodel``.

    Args:
        data: Iterable of rows whose element ``[0]`` is the answer text.
        start: Unused; kept so existing call sites keep working.
        stop: Unused; kept so existing call sites keep working.
        dictval: Unused; kept so existing call sites keep working.
        T: Number of tokens each answer is padded or truncated to.

    Returns:
        numpy array with one row per entry of ``data``.
    """
    texts = [row[0] for row in data]
    return _bert_pooled_embeddings(texts, T)

In [197]:
# You can build and train any model using the input images, input questions, and labels
max_length = 30
def get_feature_vectors(data, start = 0, stop = 50):
    """Build multimodal features and labels for ``data[start:stop]``.

    Args:
        data: Sequence of samples with ``'image'``, ``'question'`` and
            ``'answers'`` entries.
        start: Index of the first sample in the batch.
        stop: Index one past the last sample in the batch.

    Returns:
        Tuple ``(multimodal_features, y)``. ``multimodal_features`` has one row
        per sample, with the question features followed by the image features.
        ``y`` stacks the result of ``compute_answers`` for each sample, one row
        per sample.

    Raises:
        ValueError: If ``data[start:stop]`` contains no samples.
    """
    batch = data[start:stop]
    if len(batch) == 0:
        raise ValueError(f"no samples in data[{start}:{stop}]")

    # Collect the images first and stack once. Growing the batch array with
    # vstack re-copies every previously loaded image on each iteration.
    images = [np.asarray(extract_image(img_dir + vq['image'])) for vq in batch]
    image_feature = extract_image_features(np.stack(images))

    # Extract features describing the question
    question_feature = extract_pooled_text_embeddings(data, start, stop)

    # Create a multimodal feature to represent both the image and question
    multimodal_features = np.concatenate([question_feature, image_feature], axis=1)

    # One label row per sample, including when the batch holds a single sample.
    y = np.vstack([compute_answers(vq['answers']) for vq in batch])

    return multimodal_features, y

In [188]:
img_dir = "https://vizwiz.cs.colorado.edu//VizWiz_visualization_img/"
split = 'train' 
annotation_file = "https://ivc.ischool.utexas.edu/VizWiz_final/vqa_data/Annotations/%s.json" %split

# A timeout keeps a stalled connection from hanging the kernel indefinitely,
# and raise_for_status reports HTTP errors here rather than as a JSON decode
# failure on an error page.
split_data = requests.get(annotation_file, allow_redirects=True, timeout=60)
split_data.raise_for_status()
data = split_data.json()
num_top_answers, top_train_answers = init_answer_info(data)

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [None]:
# Extract features for the first 2000 training samples, one batch at a time.
train_batch_size = 50

# Row 0 of each array is an all-zero placeholder that gives vstack something to
# append to; a later cell removes it.
X_train = np.zeros([1, 2078])
y_train = np.zeros(8)
# y_train = np.load('trainy_data.npy')
# X_train = np.load('trainX_data (1).npy')

for start in range(0, 2000, train_batch_size):
    print(start)
    # The window must equal the stride. A wider window re-processes the samples
    # already covered by the previous batch and stores them twice.
    stop = start + train_batch_size
    X_train_iter, y_train_iter = get_feature_vectors(data, start=start, stop=stop)

    # get answer label embeddings
    y_embed = extract_pooled_answer_embeddings(y_train_iter, start, stop, dictval='answer', T = 8)
    
    X_train = np.vstack([X_train, X_train_iter])
    y_train = np.vstack([y_train, y_embed])
    
    # Checkpoint after every batch so an interrupted run keeps its progress.
    np.save('trainX_data_2000_v2', X_train)
    np.save('trainy_data_2000_v2', y_train)

0


In [None]:
# Drop the all-zero placeholder row that seeded the accumulation loop.
# The check makes this cell safe to re-run: once the placeholder is gone, row 0
# is a real sample and must not be deleted.
if len(X_train) and not np.any(X_train[0]):
    X_train = np.delete(X_train, 0, 0)
    y_train = np.delete(y_train, 0, 0)

In [None]:
# Write the current training arrays to disk.
np.save(file='trainX_data_2000_v2', arr=X_train)
np.save(file='trainy_data_2000_v2', arr=y_train)

In [99]:
# NOTE(review): X_train_temp is not assigned in any visible cell (the assignment
# survives only as the trailing comment below), so this presumably raises
# NameError on a fresh kernel.
X_train_temp.shape# = X_train.copy()
#y_train_temp = y_train.copy()

In [61]:
# Drop the all-zero placeholder row that seeded the accumulation loop.
# An earlier cell performs the same deletion, so only remove row 0 while it is
# still the placeholder; otherwise a real sample would be lost.
if len(X_train) and not np.any(X_train[0]):
    X_train = np.delete(X_train, 0, 0)
    y_train = np.delete(y_train, 0, 0)

# np.save('trainX_data', X_train)
# np.save('trainy_data', y_train)

In [13]:
# Load training arrays from disk.
# NOTE(review): these file names differ from the '..._2000_v2' files written by
# the save cells above.
X_train = np.load(file='trainX_data_5000.npy')
y_train = np.load(file='trainy_data_5000.npy')

In [63]:
# d = top_train_answers
# question_feature = feature_extraction(d)
# question_feature = np.array(question_feature)

# embedding_matrix = {}

# max_answer_length = 0
# for q in question_feature:
#   if len(q) > max_answer_length:
#     max_answer_length = len(q[0])


# print(f'Max answer length: {max_answer_length}')
# try:
#     for i, (key, qfeats)  in enumerate(zip(d, question_feature)):
#         word_embedding = padding(question_feature[i][0], target_len = max_answer_length)
#         embedding_matrix[key] = word_embedding
 
# except:
#     print(f'Except: {i}')

In [16]:
# Load embeddings
import pandas as pd
# pd.DataFrame(embedding_matrix).to_csv('answer_embedding_matrix2')

# Read the stored answer embeddings; the 'Unnamed: 0' column (presumably the
# index written by to_csv) is not an answer and is dropped.
d = pd.read_csv('../input/bert-answer-embeddings/answer_embedding_matrix').drop(columns='Unnamed: 0')

# Map each column name to that column's values as a numpy array.
embedding_matrix = {name: np.array(d[name]) for name in d.columns}

max_answer_length = 6

In [32]:
# Show the first validation record. val_data is assigned in the
# "Load validation set" cell further down, which has to run first.
val_data[0]

{'image': 'VizWiz_val_00000000.jpg',
 'question': 'Ok. There is another picture I hope it is a better one.',
 'answers': [{'answer': 'unanswerable', 'answer_confidence': 'yes'},
  {'answer': 'unanswerable', 'answer_confidence': 'yes'},
  {'answer': 'unanswerable', 'answer_confidence': 'yes'},
  {'answer': 'unanswerable', 'answer_confidence': 'yes'},
  {'answer': 'unanswerable', 'answer_confidence': 'maybe'},
  {'answer': 'unanswerable', 'answer_confidence': 'yes'},
  {'answer': 'unanswerable', 'answer_confidence': 'yes'},
  {'answer': 'unanswerable', 'answer_confidence': 'no'},
  {'answer': 'cannot repair this computer automatically',
   'answer_confidence': 'maybe'},
  {'answer': 'blank screen', 'answer_confidence': 'yes'}],
 'answer_type': 'unanswerable',
 'answerable': 0}

In [22]:
#Load validation set
split = 'val'
# split = 'test'
annotation_file = "https://ivc.ischool.utexas.edu/VizWiz_final/vqa_data/Annotations/%s.json" %split
# A timeout keeps a stalled connection from hanging the kernel indefinitely,
# and raise_for_status reports HTTP errors here rather than as a JSON decode
# failure on an error page.
split_data = requests.get(annotation_file, allow_redirects=True, timeout=60)
split_data.raise_for_status()
val_data = split_data.json()


# Row 0 of each array is a placeholder that gives vstack something to append
# to; a later cell removes it.
X_val = np.zeros([1, 2078])
y_val = np.zeros(1)
for start in range(0, 1000, 50):
    print(start)
    stop = start+50
    X_val_iter, y_val_iter = get_feature_vectors(val_data, start=start, stop=stop)

    X_val = np.vstack([X_val, X_val_iter])
    y_val = np.vstack([y_val, y_val_iter])
    
    # Checkpoint after every batch so an interrupted run keeps its progress.
    np.save('valX_data', X_val)
    np.save('valy_data', y_val)

0


  question_feature = np.array(question_feature)


(50, 2078)
50
(50, 2078)
100
(50, 2078)
150
(50, 2078)
200
(50, 2078)
250
(50, 2078)
300
(50, 2078)
350
(50, 2078)
400
(50, 2078)
450
(50, 2078)
500
(50, 2078)
550
(50, 2078)
600
(50, 2078)
650
(50, 2078)
700
(50, 2078)
750
(50, 2078)
800
(50, 2078)
850
(50, 2078)
900
(50, 2078)
950
(50, 2078)


In [35]:
# Drop the all-zero placeholder row that seeded the accumulation loop, then
# save. The check makes this cell safe to re-run: once the placeholder is gone,
# row 0 is a real sample and must not be deleted.
if len(X_val) and not np.any(X_val[0]):
    X_val = np.delete(X_val, 0, 0)
    y_val = np.delete(y_val, 0, 0)
np.save('valX_data', X_val)
np.save('valy_data', y_val)

In [78]:
# Inspect the first element of the first row of y_train.
y_train[0][0]

In [None]:
# Debug check: print a marker for every row of y_train whose first element is
# one of the "no usable answer" labels.
for row in y_train:
    if row[0] in ('unanswerable', 'unsuitable', 'unsuitable image'):
        print('es')

In [80]:
# Remove training samples where label is not in the answer bank
# remove training samples that are 'unanswerable', 'unsuitable', 'unsuitable image'
num_not_found = 0      # labels that are missing from embedding_matrix
index_not_found = []   # row indices to drop (excluded or missing labels)
y_train_index  = []    # embedding of every kept sample, in order
for i, y in enumerate(y_train):
    answer = y[0]
    if answer in ('unanswerable', 'unsuitable', 'unsuitable image'):
        index_not_found.append(i)
    elif answer in embedding_matrix:
        # Explicit membership test instead of a bare except, so unrelated
        # errors are not silently counted as "label not found".
        y_train_index.append(embedding_matrix[answer])
    else:
        num_not_found += 1
        index_not_found.append(i)

print(num_not_found)

# remove the data if label not in dict
# axis=0 removes whole rows; without an axis np.delete flattens the array first.
y_train = np.delete(y_train, index_not_found, axis=0)
X_train = np.delete(X_train, index_not_found, axis=0)

y_train_index = np.array(y_train_index)

In [81]:
# Repeat for validation set: drop samples whose label is excluded or is
# missing from the answer bank.
num_not_found = 0      # labels that are missing from embedding_matrix
index_not_found = []   # row indices to drop (excluded or missing labels)
y_val_index = []       # embedding of every kept sample, in order
for i, y in enumerate(y_val):
    answer = y[0]
    if answer in ('unanswerable', 'unsuitable', 'unsuitable image'):
        index_not_found.append(i)
    elif answer in embedding_matrix:
        # Explicit membership test instead of a bare except, so unrelated
        # errors are not silently counted as "label not found".
        y_val_index.append(embedding_matrix[answer])
    else:
        num_not_found += 1
        index_not_found.append(i)

print(num_not_found)

# remove the data if label not in dict
# axis=0 removes whole rows; without an axis np.delete flattens the array first.
y_val = np.delete(y_val, index_not_found, axis=0)
X_val = np.delete(X_val, index_not_found, axis=0)

y_val_index = np.array(y_val_index)

In [92]:
# Now create the multimodal model
# Import from tensorflow.keras so the layers, model and the tf.keras loss used
# below all come from the same Keras implementation.
from tensorflow import keras
from tensorflow.keras import layers, Input, Model, optimizers

# shape[-1] is the feature width whether or not X_train has already been
# reshaped to (samples, 1, features) by the reshape cell.
inputs = Input(shape=(1, X_train.shape[-1]))
x = layers.Bidirectional(layers.LSTM(100))(inputs)
x = layers.Dropout(0.1)(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.1)(x)
# Linear output layer: the targets are answer embedding vectors fitted with
# MSE, and a softmax would confine predictions to values in [0, 1] summing to 1.
out = layers.Dense(max_answer_length)(x)

model = Model(inputs=inputs, outputs=out)
model.compile(
  optimizer = 'adam',
  loss=tf.keras.losses.MeanSquaredError(),
  # Mean absolute error is reported because classification accuracy is not
  # meaningful for a regression target.
  metrics=['mae'],
)


model.summary()

In [85]:
# Check the current shape of the training features.
X_train.shape

In [86]:
# Reshape train and test inputs to (samples, 1, features), matching the
# (1, features) input shape declared for the model.
# len() and shape[-1] are used instead of unpacking .shape so the cell can be
# re-run after the arrays are already three-dimensional.
X_train = X_train.reshape(len(X_train), 1, X_train.shape[-1])
y_train_index = y_train_index.reshape(len(X_train), max_answer_length)

X_val = X_val.reshape(len(X_val), 1, X_val.shape[-1])
y_val_index = y_val_index.reshape(len(X_val), max_answer_length)

In [None]:
# run answers thru bert to get bert embedding
# get embedding matrix
# loss function = mse -> minimize euclidean distance
# last layer outputs vector that is same size as bert embeddings



In [90]:
%%timeit
# Train the model
info = model.fit(X_train, y_train_index, batch_size=1000, epochs=100, validation_data=(X_val, y_val_index))

In [91]:
# Display the history recorded on the object returned by model.fit.
info.history

In [None]:
## create fake results
import numpy as np

# NOTE(review): `results` is not assigned in any visible cell; it has to hold
# one predicted answer per sample, in the same order as `data`.

# Ground-truth answer entries for every sample
gtlist = [sample['answers'] for sample in data]

# Per-sample accuracy: a prediction earns full credit once it matches at least
# 3 of the (lower-cased) human answers, and count/3 otherwise.
acc_list = []
for idx, pred in enumerate(results):
    gt_ans = [entry['answer'].lower() for entry in gtlist[idx]]
    acc_list.append(np.minimum(1.0, gt_ans.count(pred) / 3.0))

print('Accuracy: {}'.format(round(np.mean(acc_list), 2)))

## save results to results.csv
import pandas as pd
df = pd.DataFrame(results)
df.to_csv("results.csv", header=None, index=None)