# make_dictionary.py
# Forked from asampat3090/arctic-captions (executable script).
import sys
codegit_root = '/home/intuinno/codegit'
sys.path.insert(0, codegit_root)
from anandlib.dl.caffe_cnn import *
import pandas as pd
import numpy as np
import os
import scipy
import json
import cPickle
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
import pdb
# --- Dataset split sizes ---------------------------------------------------
# NOTE(review): neither constant is referenced in the visible part of this
# script; presumably they are consumed by related preprocessing code -- confirm.
TRAIN_SIZE, TEST_SIZE = 6000, 1000

# --- Flickr8k inputs (absolute paths on the original author's machine) -----
# Tab-separated caption file and the directory holding the images.
flickr_image_path = '/home/intuinno/project/pointTeach/data/Flicker8k/preprocessedImages'
annotation_path = '/home/intuinno/project/pointTeach/data/Flicker8k/Flickr8k.token.txt'

# --- VGG-19 Caffe network definition and weights ----------------------------
vgg_model_path = '/home/intuinno/codegit/caffe/models/vgg_ilsvrc_19/VGG_ILSVRC_19_layers.caffemodel'
vgg_deploy_path = '/home/intuinno/codegit/caffe/models/vgg_ilsvrc_19/VGG_ILSVRC_19_layers_deploy.prototxt'

# --- Feature location (relative to the working directory) -------------------
# NOTE(review): not used in the visible part of this script -- confirm usage.
feat_path = 'feat/flickr8k'
# Construct the CNN wrapper from the VGG-19 deploy/model files with a batch
# size of 20 and 224x224 inputs.
# NOTE(review): `CNN` is not imported by name; presumably it comes from the
# star import of anandlib.dl.caffe_cnn -- confirm. The resulting `cnn` object
# is not used in the visible remainder of this script.
_cnn_kwargs = {
    'deploy': vgg_deploy_path,
    'model': vgg_model_path,
    'batch_size': 20,
    'width': 224,
    'height': 224,
}
cnn = CNN(**_cnn_kwargs)
def my_tokenizer(s):
    """Split a caption into tokens on runs of whitespace.

    Args:
        s: Caption text to tokenize.

    Returns:
        A list of the whitespace-separated tokens in ``s``; an empty list
        when ``s`` is empty or contains only whitespace.
    """
    return s.split()
# Build the word -> index dictionary from the caption file and pickle it.
# The interactive pdb breakpoints that used to sit in this section were
# removed so the script can run unattended from start to finish.
from collections import OrderedDict

# The annotation file is read as two tab-separated columns with no header row.
# The first column is expected to look like "<image file>#<caption number>".
annotations = pd.read_table(annotation_path, sep='\t', header=None,
                            names=['image', 'caption'])

# Split the first column into the caption number (text after '#') and the
# full path of the image (text before '#', joined onto flickr_image_path).
# 'image_num' must be derived before 'image' is overwritten.
annotations['image_num'] = annotations['image'].map(lambda x: x.split('#')[1])
annotations['image'] = annotations['image'].map(
    lambda x: os.path.join(flickr_image_path, x.split('#')[0]))

captions = annotations['caption'].values

# A callable analyzer replaces the vectorizer's own preprocessing and
# tokenization, so captions are split on whitespace only and keep their case.
vectorizer = CountVectorizer(lowercase=False, analyzer=str.split).fit(captions)

# Shift every vocabulary index by 2 so that indices 0 and 1 stay unassigned
# (presumably reserved for special tokens by downstream code -- confirm).
dictionary = dict(
    (word, index + 2) for word, index in vectorizer.vocabulary_.items())

# Sort dictionary in descending order of index.
dictionary = OrderedDict(
    sorted(dictionary.items(), key=lambda x: x[1], reverse=True))

# NOTE: the output path is relative to the current working directory and the
# 'data/flickr8k' directory must already exist.
with open('data/flickr8k/dictionary.pkl', 'wb') as f:
    cPickle.dump(dictionary, f)