In [None]:
import cv2
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
import glob
import numpy as np
import math
import os

In [None]:
import nltk
# Fetch the 'punkt' resource through NLTK's downloader.
# Presumably needed by the sent_tokenize / word_tokenize helpers imported above —
# confirm the resource name against the installed NLTK version.
nltk.download('punkt')

In [None]:
# Root that every dataset/resource path in this notebook is built from.
# Paths are formed by plain string concatenation, so keep the trailing slash.
# '/coco2017/' for use on Nautilus
# './' for local use
path_prefix = './'

In [None]:
# Collect every .jpg directly under train2017/.
# To work on a subset, narrow the glob pattern (it is a shell-style glob, not a regex).
img_paths = glob.glob(f'{path_prefix}train2017/*.jpg')
training_imgs_num = len(img_paths)
print(f'loaded a total of {training_imgs_num} imgs')

In [None]:
# Centre-crop every training image to a square, resize it to 128x128 and
# write it under <path_prefix>cleaned-data/ using the original file name.
cleaned_dir = path_prefix + 'cleaned-data'
print(f'Cleaning and saving {training_imgs_num} imgs to {cleaned_dir}')
os.makedirs(cleaned_dir, exist_ok=True)

for img_path in img_paths:
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports an unreadable/corrupt file by returning None
        # rather than raising, so guard before touching img.shape.
        print(f'Skipping unreadable image: {img_path}')
        continue

    # OpenCV arrays are indexed (rows, cols[, channels]) i.e. (height, width).
    height, width = img.shape[:2]

    # Crop the largest centred square. When the difference between the two
    # sides is odd, the extra pixel is trimmed from the bottom/right edge.
    side = min(height, width)
    top = (height - side) // 2
    left = (width - side) // 2
    img = img[top:top + side, left:left + side]

    img = cv2.resize(img, (128, 128))
    # basename() instead of split('/') so the file name is extracted
    # correctly regardless of the platform's path separator.
    cv2.imwrite(os.path.join(cleaned_dir, os.path.basename(img_path)), img)

In [None]:
# category_mapping = {}

# numeric_mapping = { 'person' : 1, 'vehicle' : 2, 'outdoor' : 3, 'animal' : 4, 'accessory' : 5, 'sports' : 6, \
# 'kitchen' : 7, 'food' : 8, 'furniture' : 9, 'electronic' : 10, 'appliance' : 11, 'indoor' : 12 }

# for dic in instances_df.categories:
#     category_mapping[dic['id']] = numeric_mapping[dic['supercategory']]

# def getSuperCat(cat):
#     return category_mapping[cat]
    

In [None]:
def multihot_encode(data, num_classes=12, clip=False):
    """Encode a collection of class ids as a fixed-length class histogram.

    The ids are counted with ``np.histogram`` using ``num_classes`` bins over
    the closed range ``[0, num_classes - 1]``. Ids are therefore expected to
    be zero-based integers; any value outside that range is silently ignored
    by the histogram (e.g. id ``num_classes`` itself is dropped).

    Parameters
    ----------
    data : sequence of numbers
        Class ids to encode. May be empty.
    num_classes : int, default 12
        Length of the returned vector.
    clip : bool, default False
        If True, return a 0/1 presence vector instead of normalised counts.

    Returns
    -------
    numpy.ndarray or list
        With ``clip=True``: an integer ndarray of 0/1 flags.
        With ``clip=False``: a list of floats that sums to 1, or all zeros
        when no id fell inside the range.
    """
    mhe, _ = np.histogram(data, bins=num_classes, range=(0, num_classes - 1))
    if clip:
        return np.clip(mhe, 0, 1)

    # Compute the total once instead of re-summing for every element.
    total = mhe.sum()
    divisor = total if total > 0 else 1.0
    return [count / divisor for count in mhe]

def embed_caption(caption):
    """Turn a caption into a list of word vectors, one per token.

    The caption is lower-cased and split with ``word_tokenize``; each token is
    looked up in the module-level ``w2v_model``.

    Parameters
    ----------
    caption : str
        Caption text to embed.

    Returns
    -------
    list
        One vector per token, in token order.

    Raises
    ------
    KeyError
        If a token is not present in the model's vocabulary.
    """
    # Word vectors live on the model's `.wv` attribute; indexing the Word2Vec
    # object directly was deprecated and later removed from gensim. Fall back
    # to the object itself in case it already is a keyed-vectors instance.
    vectors = getattr(w2v_model, 'wv', w2v_model)
    return [vectors[word] for word in word_tokenize(caption.lower())]

In [None]:
import pandas as pd

def clean_caption(cap):
    """Replace every '.' in ``cap`` with a space and append a single final '.'."""
    without_periods = " ".join(cap.split('.'))
    return without_periods + "."
    

# Build `annot_df`: one row per image (indexed by image_id) holding the
# '|'-joined cleaned captions, the category ids of all annotated instances,
# a normalised super-category histogram and the image file name.
captions_path = path_prefix + 'annots/annotations/captions_train2017.json'
instances_path = path_prefix + 'annots/annotations/instances_train2017.json'

# Only the JSON parsing needs the file handle, so release it right away.
with open(captions_path) as annot_file:
    captions_df = pd.read_json(annot_file, typ='series')

annot_df = pd.DataFrame(data=captions_df['annotations'])
annot_df = annot_df.astype({'image_id': 'int32'})
image_df = pd.DataFrame(data=captions_df['images'])
image_df = image_df.astype({'id': 'int32'})  # annot_df image_id matches image_df id
annot_df = annot_df.sort_values(by=['image_id'], axis=0)
image_df = image_df.sort_values(by=['id'], axis=0)
annot_df['caption'] = annot_df['caption'].apply(clean_caption)

# Load the word2vec model; embed_caption() reads it as a module-level name.
# (The previous `insert(..., 'embedded_caption', )` call had no value argument
# and raised TypeError; the aggregation below kept only the caption column
# anyway, so the embeddings are computed separately via embed_caption.)
w2v_model = Word2Vec.load(path_prefix + 'resources/text_encoding_full.bin')

# Aggregate all captions per image into one row. Select the string column
# explicitly: '|'.join cannot be applied to the integer image_id column.
annot_df = annot_df.groupby('image_id')['caption'].agg('|'.join).to_frame()

# Insert file names, matched on image id rather than on row position.
file_names = image_df.set_index('id')['file_name']
annot_df.insert(len(annot_df.columns), 'file_name',
                file_names.reindex(annot_df.index).values)

# Insert class and superclass data
with open(instances_path) as instance_file:
    instances_df = pd.read_json(instance_file, typ='series')

numeric_mapping = { 'person' : 1, 'vehicle' : 2, 'outdoor' : 3, 'animal' : 4, 'accessory' : 5, 'sports' : 6, \
'kitchen' : 7, 'food' : 8, 'furniture' : 9, 'electronic' : 10, 'appliance' : 11, 'indoor' : 12 }

# category id -> numeric super-category id (1..12)
category_mapping = {
    dic['id']: numeric_mapping[dic['supercategory']]
    for dic in instances_df['categories']
}

image_cats = {i : [] for i in annot_df.index}
image_supercats = {i : [] for i in annot_df.index}

for instance in instances_df['annotations']:
    image_cats[instance['image_id']].append(instance['category_id'])
    image_supercats[instance['image_id']].append(category_mapping[instance['category_id']])

# Both dicts were created in annot_df.index order, so their values line up
# with the rows positionally.
annot_df.insert(1, 'categories', list(image_cats.values()))

# multihot_encode counts ids over [0, num_classes - 1]; shift the 1-based
# super-category ids to 0-based so that id 12 ('indoor') is not dropped and
# slot 0 is not left permanently empty.
annot_df.insert(
    2,
    'super_categories',
    [multihot_encode([supercat - 1 for supercat in supercats])
     for supercats in image_supercats.values()],
)

In [None]:
# Persist the per-image caption/category table as CSV.
captions_csv_path = path_prefix + 'resources/coco-captions-with-categories.csv'
annot_df.to_csv(captions_csv_path)

In [None]:
# Preview the embedding on the first few captions only.
# map() treats every extra positional argument as another iterable, so passing
# w2v_model here made it call embed_caption with two arguments; embed_caption
# takes the caption alone and reads the model from the module-level name.
list(map(embed_caption, annot_df['caption'].head().tolist()))

In [None]:
# Peek at the first caption string in annot_df.
annot_df['caption'].values[0]

In [None]:
# Embed every caption: one list of word vectors per entry of annot_df['caption'].
mapped = [embed_caption(caption_text) for caption_text in annot_df['caption'].values]

In [None]:
# Number of embedded captions; `mapped` holds one entry per value of annot_df['caption'].
len(mapped)

In [7]:
import pandas as pd

# Look up the raw caption rows that mention 'lift-kit'.
# NOTE: this rebinds captions_df and annot_df to the raw, unprocessed
# annotations, replacing any processed annot_df built by an earlier cell.
row = None
raw_captions_path = path_prefix + 'annots/annotations/captions_train2017.json'
with open(raw_captions_path) as annot_file:
    captions_df = pd.read_json(annot_file, typ='series')

annot_df = pd.DataFrame(data=captions_df['annotations'])
mentions_lift_kit = annot_df['caption'].str.contains('lift-kit')
row = annot_df[mentions_lift_kit]

In [11]:
# Show the caption text of the rows matched by the 'lift-kit' filter.
row['caption'].values

array(["A black and white image of a pick-up truck on a lift-kit with a large tire in it's bed. "],
      dtype=object)