In [7]:
import csv
import os
import json

# Convert the crowd-results CSV into one JSON file with every record plus
# train / val / trainval / test JSON files cut from it.
data_dir = '../data'
dataset_name = 'vizwiz'
column_names = ('qid', 'ans_dis_labels', 'src_dataset', 'ans_type', 'image', 'question', 'answers')


def _write_json(records, suffix):
    """Dump `records` to <data_dir>/<dataset_name><suffix>.json and close the file."""
    out_path = os.path.join(data_dir, dataset_name + suffix + '.json')
    # `with` guarantees the handle is flushed and closed once the dump is done.
    with open(out_path, 'w') as out_file:
        json.dump(records, out_file, ensure_ascii=False)


dataset = []
with open(os.path.join(data_dir, 'crowd_results_%s.csv' % dataset_name), 'r') as csvfile:
    fid = csv.reader(csvfile, delimiter=',')
    next(fid, None)  # skip the header row (no-op on an empty file)
    for row in fid:
        # Column 0 is the id, columns 1-10 are integer labels, 11-14 are
        # single-valued fields and columns 15-24 hold the answers.
        # A list (not `map`) keeps the labels JSON-serializable on Python 3,
        # where `map` returns a lazy iterator.
        data = (row[0], [int(v) for v in row[1:11]], row[11], row[12], row[13],
                row[14], row[15:25])
        dataset.append(dict(zip(column_names, data)))
_write_json(dataset, '')

# Positional split (file order is used as-is, nothing is shuffled):
# 65% train, 25% test, and whatever remains is validation.
n_train = int(len(dataset) * 0.65)
n_test = int(len(dataset) * 0.25)
n_val = len(dataset) - n_train - n_test

# Layout inside `dataset`: [ train | val | test ].
_write_json(dataset[:n_train], '_train')
_write_json(dataset[n_train:n_train + n_val], '_val')
_write_json(dataset[:n_train + n_val], '_trainval')
_write_json(dataset[n_train + n_val:], '_test')

In [8]:
# Sanity check of the split sizes, in the order: total, train, test, val.
print(len(dataset), n_train, n_test, n_val)

(30004, 19502, 7501, 3001)


In [9]:
import numpy as np
# Two-decimal, non-scientific float display for the ratio row printed last.
np.set_printoptions(suppress=True, precision=2)

# Test split: boolean matrix that is True wherever a label entry is >= 2,
# then summed per label column.
test_set = dataset[n_train + n_val:]
tmp1 = np.array([sample['ans_dis_labels'] for sample in test_set]) >= 2
test_counts = tmp1.sum(axis=0)
print('test:', list(test_counts))

# Same computation over every sample (note: `test_set` now names the full data).
test_set = dataset
tmp2 = np.array([sample['ans_dis_labels'] for sample in test_set]) >= 2
data_counts = tmp2.sum(axis=0)
print('data:', list(data_counts))

# Per-label fraction of the ">= 2" samples that fall into the test split.
print(test_counts * 1.0 / data_counts)

('test:', [1804, 2425, 1240, 359, 5643, 392, 4986, 5521, 113, 32])
('data:', [7160, 9729, 4852, 1486, 22773, 1575, 20132, 22284, 418, 143])
[0.25 0.25 0.26 0.24 0.25 0.25 0.25 0.25 0.27 0.22]


# Build vocabulary for questions

In [11]:
# create json file for vocabulary
import json
import os
import nltk
from nltk.stem.snowball import *
from tqdm import *
from collections import Counter, OrderedDict
import string

import io  # io.open accepts `encoding` on both Python 2 and Python 3

data_dir = '../data'
dataset_name = 'vizwiz'
# Decode the file as cp1252 while reading it. json.load's own `encoding`
# keyword is ignored on Python 3 and was removed in 3.9 (TypeError), and the
# `with` block closes the handle once loading is done.
with io.open(os.path.join(data_dir, dataset_name + '.json'), encoding='cp1252') as dataset_file:
    dataset = json.load(dataset_file)

## question
# Count every lower-cased NLTK token across all questions and track the
# longest question (in tokens).
q_counter = Counter()
maxlen = 0
for one_data in tqdm(dataset):
    tokens = nltk.word_tokenize(one_data['question'].lower())
    maxlen = max(maxlen, len(tokens))
    q_counter.update(tokens)
n_sample = len(dataset)
print('number of sample = ' + str(n_sample))
print('max len = ' + str(maxlen))

# (token, count) pairs, most frequent first.
q_word_counts = sorted(q_counter.items(), key=lambda x: x[1], reverse=True)
with open('q_word_counts.json', "w") as counts_file:
    json.dump(q_word_counts, counts_file, indent=2)

### build vocabulary based on question
# A threshold of 0 keeps every token that occurs; raise it to prune rare words.
min_question_word_count = 0
unk_word = '<UNK>'
# '<UNK>' takes id 0; the remaining ids follow descending frequency.
vocab = [unk_word] + [x[0] for x in q_word_counts if x[1] >= min_question_word_count]
vocab = OrderedDict(zip(vocab, range(len(vocab))))
with open('word2vocab_id.json', 'w') as vocab_file:
    json.dump(vocab, vocab_file, indent=2)
print('vocab size: {}'.format(len(vocab)))

100%|██████████| 30004/30004 [00:03<00:00, 8293.50it/s]


number of sample = 30004
max len = 74
vocab size: 4535


In [12]:
import numpy as np

def create_glove_embedding_init(idx2word, glove_file):
    """Build an embedding matrix for a vocabulary from a GloVe-style text file.

    Every non-blank line of `glove_file` is read as a word followed by its
    space-separated vector components. Blank lines are skipped.

    Args:
        idx2word: dict mapping an integer row index to a word. The indices
            are used directly as row numbers of the returned matrix.
        glove_file: path of the embedding text file.

    Returns:
        A `(weights, word2emb)` tuple. `weights` is a float32 array of shape
        `(len(idx2word), emb_dim)`; row `idx` holds the vector of
        `idx2word[idx]`, or stays all-zero when that word is not in the file
        (each such word is printed). `word2emb` maps every word found in the
        file to its vector.

    Raises:
        ValueError: if the file contains no embedding lines at all.
    """
    word2emb = {}
    emb_dim = None
    with open(glove_file, 'r') as f:
        # Iterate the handle directly so the raw lines are never all held in
        # memory next to the parsed vectors.
        for entry in f:
            if not entry.strip():
                # e.g. a trailing empty line; it would otherwise be stored
                # as a bogus word with an empty vector.
                continue
            vals = entry.split(' ')
            if emb_dim is None:
                # The first entry fixes the dimensionality (fields minus the word).
                emb_dim = len(vals) - 1
            word2emb[vals[0]] = np.array([float(v) for v in vals[1:]])
    if emb_dim is None:
        raise ValueError('no embeddings found in %s' % glove_file)
    print('embedding dim is %d' % emb_dim)

    weights = np.zeros((len(idx2word), emb_dim), dtype=np.float32)
    for idx, word in idx2word.items():
        if word not in word2emb:
            # Out-of-vocabulary word: report it and leave its row at zero.
            print(word)
            continue
        weights[idx] = word2emb[word]
    return weights, word2emb

# Invert `vocab` (word -> id) so that each row index maps back to its word.
idx2word = dict((word_id, word) for word, word_id in vocab.items())

# Look the words up in the 300-d GloVe file and save the resulting matrix.
emb_dim = 300
glove_file = 'glove.6B.%dd.txt' % emb_dim
weights, word2emb = create_glove_embedding_init(idx2word, glove_file)
np.save('glove6b_init_%dd.npy' % emb_dim, weights)

embedding dim is 300
<UNK>
k-cup
tshirt
vizwiz
dayquil
ok.
chobani
gevalia
..
c.d
coffeemate
bizwiz
scallywag
gatorades
rice-a-roni
plextalk
self-rising
rightside
ganics
ios5
eye-drop
scentsy
hormell
koolaid
pop-tarts
vocab
eyedrop
nightquil
k-cups
htis
stouffers
schwans
temputure
benedrill
tossimo
probook
this/
4.
miles..it
multi-tool
still-is
pasta-roni
earpods
36x32s
screen.okay
slowcooker
68.
diffrent
lemon-aid
brela
gaviscon
cragganmore
crock-pot
turbo-tax
nyqyl
image.i
jailbreakable
biz-wiz
6.1.3
tilex
79.54
b-a-l-m
vitamic
ammex
veniburg
pop-tart
wheee
i-
afraid.i
an______
baby-ganic
1845.
frischenmeier
lawrys
sodapop
giftcard
aspal
papadoms
twelfths
lawrey
ibprofen
telephone-style
ptr-1
15.
mini-usb
counter-top
lipozene
anti-itch
6790
teatree
drywin
frecuency
talking..
streetview
web-workers
discribe
anythings
purple-ish
ibuprofin
on..
mello.com
this..
espressp
future.i
seeing-eye
uncrinkle
tossamo
pacco
..color
convectional
brenburgee
..is
cumulo-nimbus
you.you
coca-colas
glen

# Build vocabulary for answers

In [13]:
## answer
# Each whole answer string (lower-cased) is counted as one vocabulary entry;
# answers are not split into words. `maxlen` therefore ends up as the largest
# number of answers attached to a single sample.
q_counter = Counter()
maxlen = 0
for one_data in tqdm(dataset):
    tokens = [x.lower() for x in one_data['answers']]
    maxlen = max(maxlen, len(tokens))
    q_counter.update(tokens)
n_sample = len(dataset)
print('number of sample = ' + str(n_sample))
print('max len = ' + str(maxlen))

# (answer, count) pairs, most frequent first.
q_word_counts = sorted(q_counter.items(), key=lambda x: x[1], reverse=True)
with open('ans_counts.json', "w") as counts_file:
    json.dump(q_word_counts, counts_file, indent=2)

### build vocabulary
# Answers seen fewer than this many times are left out of the vocabulary.
min_answer_count = 5
# NOTE: this rebinds `vocab`, replacing whatever vocabulary an earlier cell stored there.
vocab = [x[0] for x in q_word_counts if x[1] >= min_answer_count]
vocab = OrderedDict(zip(vocab, range(len(vocab))))
with open('ans2id.json', 'w') as vocab_file:
    json.dump(vocab, vocab_file, indent=2)
print('vocab size: {}'.format(len(vocab)))

100%|██████████| 30004/30004 [00:00<00:00, 98097.15it/s] 


number of sample = 30004
max len = 10
vocab size: 6250
