In [5]:
from pycocotools.coco import COCO
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import json
import spacy

In [6]:
nlp = spacy.load("en_core_web_sm")

## Convert captions to nouns only

In [215]:
annotations_file = '../datasets/coco/annotations/captions_train2017.json'
noun_only_annotations_file = '../datasets/coco/annotations/captions_noun_only_train2017.json'

In [216]:
dataset = json.load(open(annotations_file, 'r'))

for ann in dataset['annotations']:
    caption = ann['caption']
    noun_set = set()
    for tok in nlp(caption):
        if 'NN' in tok.tag_:
            noun_set.add(str(tok))
                
    noun_str = ' '.join(list(noun_set))
    ann['caption'] = noun_str
    
with open(noun_only_annotations_file, 'w') as f:
    json.dump(dataset, f)

## Investigate Bert tokenizer tokenization scheme

In [None]:
from transformers import BertTokenizer
from maskrcnn_benchmark.modeling.language_backbone.transformers import BERT
from maskrcnn_benchmark.config import cfg

In [42]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BERT(cfg)
_ = bert.to('cuda')

In [44]:
text = "Humility is"

In [45]:
tokens = tokenizer.tokenize(text)
tokens

['hum', '##ility', 'is']

In [46]:
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[14910, 15148, 2003]

In [50]:
enc = bert([text])

In [53]:
enc['input_ids']

tensor([[  101, 14910, 15148,  2003,   102]], device='cuda:0')

In [64]:
tokenizer.convert_ids_to_tokens(enc['input_ids'][0].tolist())

['[CLS]', 'hum', '##ility', 'is', '[SEP]']

In [65]:
enc['special_tokens_mask']

tensor([[1, 0, 0, 0, 1]], device='cuda:0')

### How to club words in a phrase

In [83]:
tokenizer.tokenize('honda#motorcycle')

['honda', '#', 'motorcycle']

## Club Noun phrase testing

In [67]:
text = "a brown stuffed teddy bear wearing a red knitted hat"
tokens = nlp(text)

In [71]:
for np in tokens.noun_chunks:
    print(np.text)

a brown stuffed teddy bear
a red knitted hat


In [77]:
for np in tokens.noun_chunks:
    for t in nlp(np.text):
        print(t, ':', t.tag_)
    print('--------------')

a : DT
brown : JJ
stuffed : VBN
teddy : NN
bear : NN
--------------
a : DT
red : JJ
knitted : VBN
hat : NN
--------------


## Club Noun phrase in data

In [203]:
annotations_file = '../datasets/coco/annotations/captions_val2017.json'
noun_phrase_annotations_file = '../datasets/coco/annotations/captions_noun_phrase_val2017.json'

In [204]:
dataset = json.load(open(annotations_file, 'r'))

for ann in dataset['annotations']:
    caption = ann['caption']
    noun_set = set()
    for np in nlp(caption).noun_chunks:
        noun_words = list()
        for tok in nlp(np.text):
            if 'NN' in tok.tag_:
                noun_words.append(str(tok))
                
        noun_words_str = '#'.join(noun_words)
        noun_set.add(str(noun_words_str))
                
    noun_str = ' '.join(list(noun_set))
    ann['caption'] = noun_str

In [205]:
with open(noun_phrase_annotations_file, 'w') as f:
    json.dump(dataset, f)

## Club Noun phrases while training

In [109]:
import numpy as np

In [94]:
annotations_file = '../datasets/coco/annotations/captions_val2017.json'
noun_only_annotations_file = '../datasets/coco/annotations/captions_noun_phrase_val2017.json'

In [95]:
dataset = json.load(open(annotations_file, 'r'))

for ann in dataset['annotations'][:10]:
    caption = ann['caption']
    noun_set = set()
    for np in nlp(caption).noun_chunks:
        noun_words = list()
        for tok in nlp(np.text):
            if 'NN' in tok.tag_:
                noun_words.append(str(tok))
                
        noun_words_str = '#'.join(noun_words)
        noun_set.add(str(noun_words_str))
                
    noun_str = ' '.join(list(noun_set))
    ann['caption'] = noun_str
    
    print(caption)
    print(noun_str)

A black Honda motorcycle parked in front of a garage.
Honda#motorcycle front garage
A Honda motorcycle parked in a grass driveway
Honda#motorcycle grass#driveway
An office cubicle with four different types of computers.
types computers office#cubicle
A small closed toilet in a cramped space.
toilet space
Two women waiting at a bench next to a street.
street women bench
A black Honda motorcycle with a dark burgundy seat.
Honda#motorcycle burgundy#seat
A tan toilet and sink combination in a small room.
sink#combination room toilet
The home office space seems to be very cluttered.
home#office#space
A beautiful dessert waiting to be shared by two people
people dessert
A woman sitting on a bench and a woman standing waiting for the bus.
bus woman bench


In [143]:
text = 'Honda#motorcycle humility wedding#cake burgundy#seat#humility#tok'

In [144]:
tokens = tokenizer.tokenize(text)
tokens

['honda',
 '#',
 'motorcycle',
 'hum',
 '##ility',
 'wedding',
 '#',
 'cake',
 'burgundy',
 '#',
 'seat',
 '#',
 'hum',
 '##ility',
 '#',
 'to',
 '##k']

In [145]:
mask = np.array([1 for _ in range(len(tokens))])
phrase_stack = []
word_stack = []
word_start_idx = 0
phrase_start_idx = 0

for idx, t in enumerate(tokens):
    # Processing words
    ## If intermediate sub-word
    if t[:2] == '##':
        word_stack.append(t)
        continue
    else:
        ## stack is empty; add the current token to stack
        if len(word_stack) == 0:
            word_stack.append(t)
            start_word_idx = idx
            if len(tokens) > 1 and tokens[idx+1][:2] == '##':
                continue
        ## stack has a full word previously; remove it and add the current word in expectation of it being
        ## the first part of sub-word
        elif len(word_stack) == 1:
            word_stack.pop()
            word_stack.append(t)
            start_word_idx = idx
        ## Else we need to club the subwords
        else:
            sub_words = word_stack
            tokens[start_word_idx] = ''.join(sub_words)
            mask[start_word_idx:idx] = 0
            word_stack = [t]
            
    current_word_idx = start_word_idx
    start_word_idx = idx
            
    # Processing phrase
    if current_word_idx == 0:
        phrase_stack.append(current_word_idx)
        continue
        
    if t == '#':
        continue
    
    if tokens[current_word_idx-1] == '#':
        phrase_stack.append(current_word_idx)
    else:
        if len(phrase_stack) == 1:
            phrase_stack.pop()
            phrase_stack.append(current_word_idx)
        else:
            words_to_combine = phrase_stack
            w = []
            for i in words_to_combine:
                mask[i] = 0
                w.append(tokens[i])
            tokens[words_to_combine[0]] = '-'.join(w)
            phrase_stack = [current_word_idx]

if len(word_stack) > 1:
    sub_words = word_stack
    tokens[start_word_idx] = ''.join(sub_words)
    mask[start_word_idx:idx] = 0
    word_stack = [t]
        
current_word_idx = start_word_idx
start_word_idx = idx
        
tokens

['honda-motorcycle',
 '#',
 'motorcycle',
 'hum##ility-cake',
 '##ility',
 'wedding',
 '#',
 'cake',
 'burgundy',
 '#',
 'seat',
 '#',
 'hum##ility',
 '##ility',
 '#',
 'to##k',
 '##k']

In [104]:
a = [] 
len(a) == 0

True

In [107]:
a = [0 for _ in range(5)]
a[:2] = 1

TypeError: can only assign an iterable

In [209]:
text_list = ['giraffes', 'person']

In [210]:
tokenized_batch = tokenizer.batch_encode_plus(text_list, 
            add_special_tokens=True, 
            pad_to_max_length=True,
            return_special_tokens_mask=True,
        )

In [211]:
tokenized_batch['input_ids']

[[101, 21025, 27528, 7959, 2015, 102], [101, 2711, 102, 0, 0, 0]]

In [194]:
tokenized_batch['special_tokens_mask']

[[1, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 1, 1, 1, 1], [1, 0, 0, 0, 1, 1, 1, 1]]

In [195]:
tokenized_batch['attention_mask']

[[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0, 0]]

In [212]:
tokenizer.convert_ids_to_tokens([101, 21025, 27528, 7959, 2015, 102])

['[CLS]', 'gi', '##raf', '##fe', '##s', '[SEP]']

In [214]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize('giraffes'))

[21025, 27528, 7959, 2015]