<a href="https://colab.research.google.com/github/julialromero/VQA---Visual-Question-Answering/blob/main/PA4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 35.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 40.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [9]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample,shuffle
from sklearn.model_selection import train_test_split
import requests

In [10]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import preprocess_input
from transformers import GPT2Tokenizer, TFGPT2Model
from transformers import BertTokenizer, TFBertForQuestionAnswering, FeatureExtractionMixin
from transformers import pipeline, AutoTokenizer
from tensorflow.keras.layers import Multiply
from tensorflow.keras.utils import to_categorical

In [11]:
device_name = tf.test.gpu_device_name()

In [12]:
device_name

'/device:GPU:0'

In [42]:
# Use InceptionV3 model for image feature extraction
# Instantiate CV model feature extractor and freeze layers
base_model = tf.keras.applications.InceptionV3(
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    input_shape=(None, None, 3),
    pooling='average',
    classifier_activation="softmax",
)
base_model.trainable = False

# Use BERT question answering model from Hugging Face
# Download text feature extractor
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", )
bertmodel = TFBertForQuestionAnswering.from_pretrained("bert-base-cased")
feature_extraction = pipeline('feature-extraction', model=bertmodel, tokenizer=tokenizer)

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

Some layers of TFBertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
import tensorflow.keras.preprocessing.image as image
from skimage import io

## PART 2: 
# - Extract features from image
# - Extract features from question
# - Combine image + question features
imgsize = (600, 600)

def padding(array, xx, target_len=30):
    array = np.array(array)
    h = array.shape[0]
    if h < target_len:
        a = target_len - h
        new = np.pad(array, pad_width=(0, a), mode='constant')
    else:
        new = array[0:target_len]

    return new

def extract_image(image_url):
    imag = io.imread(image_url)
    x = image.img_to_array(imag)
    n = preprocess_input(x)
    
    # resize so all photos have same dim
    size = imgsize
    n = tf.keras.preprocessing.image.smart_resize(n, size)
    return n

def extract_image_features(n):
    feature_vector = base_model.predict(n) 
    return feature_vector

## Template code for extracting features from a question
def extract_question_features(question):
    features = feature_extraction(question)
    feature_vector = np.array(features)
    
    # reshape
    #feature_vector = np.reshape(feature_vector, (-1))
    return feature_vector


# Gets the most common answer for a given sample
from scipy import stats as s
def compute_answers(ans):
    y = []
    for i in ans:
        y.append(i['answer'])
        
    answer = s.mode(y)[0]
    return answer


# Gets most common 3000 answers out of given dataset
from collections import Counter
def init_answer_info(data):
    answers = []
    for i in data:
        for j in i['answers']:
            answers.append(j['answer'])
        
    occurence_count = Counter(answers)
    most_common = occurence_count.most_common(3000)
    most_common_words = []
    for i in most_common:
        most_common_words.append(i[0])

    num_answers = len(most_common_words)
    return num_answers, most_common_words

In [44]:
# You can build and train any model using the input images, input questions, and labels
max_length = 30
def get_feature_vectors(data, num_VQs = 50):
    img_train = np.zeros((1, imgsize[0], imgsize[1], 3))
    for i, vq in enumerate(data[0:num_VQs]):
          # Extract features describing the image
          image_name = vq['image']
          image_url = img_dir + image_name
          image_vec = extract_image(image_url)
          n1, n2, n3 = image_vec.shape
          image_vec = np.reshape(image_vec, (1, n1, n2, n3))
          img_train = np.vstack([img_train, image_vec])

    img_train = np.delete(img_train, 0, 0)    
    image_feature = extract_image_features(img_train)
    n1, n2, n3, n4 = image_feature.shape
    image_feature = np.reshape(image_feature, (n1, n2*n3*n4)) # flatten feature vectors. Now there is a row for each sample


    # Extract features describing the question
    question_train = []
    for vq in data[0:num_VQs]:
        question = vq['question']
        question_train.append(question)

    question_feature = feature_extraction(question_train, max_length=max_length, padding='max_length', truncation=True)
    question_feature = np.array(question_feature)
    feature_vectors = np.array([np.zeros(max_length )])
    try:
        for i, q in enumerate(question_feature):
            feature_vectors = np.vstack([feature_vectors, padding(question_feature[i][0], xx = max_length )])
        question_feature = np.delete(feature_vectors, 0, 0)
    except:
        print(f'Except: {i}')
        
    # # Create a multimodal feature to represent both the image and question (e.g. concatenate, multiply, etc.)
    multimodal_features = np.concatenate([question_feature, image_feature], axis=1)
    print(multimodal_features.shape)

    # get answers
    vq = data[0]
    answers = vq['answers']
    label = compute_answers(answers)
    y=label
    for vq in data[1:num_VQs]:
        answers = vq['answers']
        label = compute_answers(answers)
        y = np.vstack([y, label])
        
    return multimodal_features, y

In [45]:
import requests

In [46]:
img_dir = "https://vizwiz.cs.colorado.edu//VizWiz_visualization_img/"
split = 'train' 
annotation_file = "https://ivc.ischool.utexas.edu/VizWiz_final/vqa_data/Annotations/%s.json" %split

split_data = requests.get(annotation_file, allow_redirects=True)
data = split_data.json()
num_top_answers, top_train_answers = init_answer_info(data)

X_train, y_train = get_feature_vectors(data, num_VQs = 20)

(20, 591902)




In [47]:
# create our answer bank for the 3000 most common labels
index = [i for i in range(len(top_train_answers))]

# one hot encode output values
answer_bank = to_categorical(index)
train_answer_dict = dict(zip(top_train_answers, answer_bank))

In [48]:
#Load validation set
split = 'val'
# split = 'test'
annotation_file = "https://ivc.ischool.utexas.edu/VizWiz_final/vqa_data/Annotations/%s.json" %split
split_data = requests.get(annotation_file, allow_redirects=True)
val_data = split_data.json()
X_val, y_val = get_feature_vectors(val_data, num_VQs = 20)

(20, 591902)




In [49]:
load_X_val = X_val.copy()
load_y_val = y_val.copy()
load_X_train = X_train.copy()
load_y_train = y_train.copy()

In [50]:
num_not_found = 0
index_not_found = []
y_train_index  = []
for i, y in enumerate(y_train):
    try:
        y_train_index.append(train_answer_dict[y[0]].tolist())
    except:
        num_not_found+=1
        index_not_found.append(i)
        
print(num_not_found)

# remove the data if label not in dict
y_train = np.delete(y_train, [index_not_found])
X_train = np.delete(X_train, [index_not_found], axis=0)

y_train_index = np.array(y_train_index)

1


In [51]:
num_not_found = 0
index_not_found = []
y_val_index = []
for i, y in enumerate(y_val):
    try:
        y_val_index.append(train_answer_dict[y[0]].tolist())
    except:
        num_not_found+=1
        index_not_found.append(i)
        
print(num_not_found)

# remove the data if label not in dict
y_val = np.delete(y_val, [index_not_found])
X_val = np.delete(X_val, [index_not_found], axis=0)

y_val_index = np.array(y_val_index)

3


In [52]:
X_val.shape

(17, 591902)

In [68]:
data

[{'answer_type': 'other',
  'answerable': 1,
  'answers': [{'answer': 'basil leaves', 'answer_confidence': 'yes'},
   {'answer': 'basil leaves', 'answer_confidence': 'yes'},
   {'answer': 'basil', 'answer_confidence': 'yes'},
   {'answer': 'basil', 'answer_confidence': 'yes'},
   {'answer': 'basil leaves', 'answer_confidence': 'yes'},
   {'answer': 'basil leaves', 'answer_confidence': 'yes'},
   {'answer': 'basil leaves', 'answer_confidence': 'yes'},
   {'answer': 'basil leaves', 'answer_confidence': 'yes'},
   {'answer': 'basil leaves', 'answer_confidence': 'yes'},
   {'answer': 'basil', 'answer_confidence': 'yes'}],
  'image': 'VizWiz_train_00000000.jpg',
  'question': "What's the name of this product?"},
 {'answer_type': 'other',
  'answerable': 1,
  'answers': [{'answer': 'soda', 'answer_confidence': 'yes'},
   {'answer': 'coca cola', 'answer_confidence': 'yes'},
   {'answer': 'coca cola', 'answer_confidence': 'maybe'},
   {'answer': 'unsuitable', 'answer_confidence': 'yes'},
   {'

In [64]:
X_train[0]

array([[-0.04600029,  0.02132051,  0.13935724, ...,  1.23623717,
         1.05633521,  0.        ]])

In [54]:
# Now create the multimodal model
import keras
from keras import layers, Input, Model, optimizers
# gpu_options = tf.GPUOptions(allow_growth=True)
# session = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))

inputs = Input(shape=(1, X_train.shape[1]))
x = layers.Bidirectional(layers.LSTM(50))(inputs)
#x = layers.Dense(32, activation='tanh')(x)
x = layers.Flatten()(x)
out = layers.Dense(num_top_answers, activation="softmax")(x) 

model = Model(inputs=inputs, outputs=out)
model.compile(
  optimizer = 'adam',
  loss="categorical_crossentropy",
  metrics=['accuracy'],
)


model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1, 591902)]       0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 100)              236781200 
 nal)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 3000)              303000    
                                                                 
Total params: 237,084,200
Trainable params: 237,084,200
Non-trainable params: 0
_________________________________________________________________


In [55]:
n1, n2 = X_train.shape
X_train =  X_train.reshape(n1, 1, n2)
y_train_index =  y_train_index.reshape(n1, 3000)

n1, n2 = X_val.shape
X_val = X_val.reshape(n1, 1, n2)
y_val_index = y_val_index.reshape(n1,  3000)

In [None]:
# run answers thru bert to get bert embedding
# get embedding matrix
# loss function = mse -> minimize euclidean distance
# last layer outputs vector that is same size as bert embeddings



# run model 3000 times, 1 per answer
# 

In [37]:
# Train the model
info = model.fit(X_train, y_train_index, batch_size=15, epochs=5, validation_data=(X_val, y_val_index))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [62]:
info.history

{'accuracy': [0.0,
  0.8421052694320679,
  0.8947368264198303,
  0.8947368264198303,
  0.8947368264198303],
 'loss': [7.977046966552734,
  7.421048641204834,
  7.306158065795898,
  7.209148406982422,
  7.1127848625183105],
 'val_accuracy': [0.1764705926179886,
  0.23529411852359772,
  0.23529411852359772,
  0.23529411852359772,
  0.23529411852359772],
 'val_loss': [7.860090255737305,
  7.838127613067627,
  7.81396484375,
  7.793353080749512,
  7.764293670654297]}

In [4]:
## create fake results
import numpy as np

# All answers
gtlist = [x['answers'] for x in data]

# Save the accuracies
acc_list = []
i = 0

# Compute accuracy for each image
for pred in results:

    # Get the GT answer list and preprocess
    gt_ans = gtlist[i] 
    gt_ans = [x['answer'] for x in gt_ans]
    gt_ans = [x.lower() for x in gt_ans]

    # Compute accuracy (compare with at least 3 human answers)
    cur_acc = np.minimum(1.0, gt_ans.count(pred)/3.0)

    acc_list.append(cur_acc)
    i +=1

print ('Accuracy: {}'.format(round(np.mean(acc_list), 2)))

## save results to results.csv
import pandas as pd
df = pd.DataFrame(results)
df.to_csv("results.csv", header = None, index = None)

NameError: ignored