### Small interactive script to preprocess the inputs:
1) Load the image information at the output of a Faster R-CNN
2) Retrieve the image features
3) Encode the texts using a pretrained Bert Tokenizer
4) Save everything in pickle files

The image information is obtained from the output of the fc6 layer of a pretrained Faster R-CNN with a ResNeXt-152 backbone. The features can be downloaded from https://dl.fbaipublicfiles.com/mmf/data/datasets/hateful_memes/defaults/features/features.tar.gz and are provided in the LMDB format. In the corresponding directory, the image information is stored as key-value pairs, where the keys correspond to the id numbers of the images and the values are dictionaries containing the following information: "feature_path", "features" (N=100, 2048), "image_height", "image_width", "num_boxes" (N,), "objects" (N,), "cls_prob" (N, 1601), "bbox" (100, 4).

We only keep the image features. Then, we append the features to the Pandas DataFrames obtained from the .json data files (train, dev and test data). 

Then, we use a pretrained BERT tokenizer to encode the texts. The text encodings are also appended to the Pandas DataFrames. They correspond to dictionaries with the following information: "input_tokens" (max_seq_length=128,), "input_ids" (max_seq_length=128,), "segment_ids" (max_seq_length=128,), "input_mask" (max_seq_length=128,)

Finally, we save the resulting DataFrames in pickle format.

In [1]:
import os
import lmdb
import pickle
import pandas as pd 
import numpy as np  

In [2]:
# Directory of the LMDB store holding the Faster R-CNN image records.
# NOTE(review): unlike every other path below, this one is not located under
# HM_challenge/ — confirm that this is intended.
lmdb_dir = '/Users/guillaumevalette/Desktop/HMDataset/detectron.lmdb/'

# Input annotation files (.jsonl), one per split.
train_path, dev_path, test_path = (
    '/Users/guillaumevalette/Desktop/HM_challenge/HMDataset/' + split + '.jsonl'
    for split in ('train', 'dev', 'test')
)

# Output pickle files, one per split.
train_data, dev_data, test_data = (
    '/Users/guillaumevalette/Desktop/HM_challenge/data/VisualBert/' + split + '_data'
    for split in ('train', 'dev', 'test')
)

In [3]:
def img_id_to_img_features(db_dir, img_id):
    """Retrieve the features of one image from the LMDB feature store.

    Args:
        db_dir: Path of the LMDB directory holding the pickled image records.
        img_id: Image id (int or str). It is left-padded with zeros to the
            5-character key format used by the database.

    Returns:
        The value stored under the "features" key of the image record.

    Raises:
        KeyError: If the database holds no entry for the image id.
    """
    # Keys are 5-character, zero-padded ids (e.g. 123 -> b"00123"). zfill pads
    # ids of any length, whereas prepending a single "0" only covered 4-digit ids.
    id_str = str(img_id).zfill(5)

    # open lmdb environment (read-only)
    env_db = lmdb.open(path=db_dir, subdir=True, readonly=True, readahead=False)
    try:
        # the read transaction is released when the with-block exits
        with env_db.begin() as txn:
            # retrieve value in database (in pickle format) associated to id_str
            value = txn.get(id_str.encode())
    finally:
        # close lmdb environment even when the lookup fails
        env_db.close()

    if value is None:
        raise KeyError(
            "no entry for image id {!r} in LMDB database {!r}".format(id_str, db_dir)
        )

    # NOTE: pickle.loads can execute arbitrary code; only read databases that
    # come from a trusted source.
    img_info = pickle.loads(value)

    # retrieve image features only
    return img_info["features"]

In [5]:
file_paths = [train_path, dev_path, test_path]
file_datas = [train_data, dev_data, test_data]

# For every split: read the .jsonl annotations, rename the "img" column to
# "img_name", attach the image features looked up by id, and pickle the result.
for file_path, file_data in zip(file_paths, file_datas):
    df = (
        pd.read_json(file_path, lines=True)
        .rename(columns={'img': 'img_name'})
        .assign(
            # one LMDB lookup per row, keyed by the image id
            features=lambda frame: frame['id'].map(
                lambda img_id: img_id_to_img_features(lmdb_dir, img_id)
            )
        )
    )

    # save to pickle format
    df.to_pickle(file_data)

### Check if the above works

In [6]:
# Reload the training DataFrame from the pickle file at `train_data`.
train_df = pd.read_pickle(train_data)

In [79]:
# Preview the first rows of the reloaded DataFrame.
train_df.head()

Unnamed: 0,id,img_name,label,text,features
0,42953,img/42953.png,0,its their character not their color that matters,"[[0.0, 0.0, 0.0, 0.0, 9.599549, 2.1708376, 13...."
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...,"[[0.0, 0.0, 0.0, 0.0, 12.636864, 0.0, 7.682766..."
2,13894,img/13894.png,0,putting bows on your pet,"[[0.0, 0.0, 0.0, 0.83242446, 5.372245, 2.75794..."
3,37408,img/37408.png,0,i love everything and everybody! except for sq...,"[[0.0, 0.0, 0.0, 0.0, 11.302303, 0.0, 0.0, 0.0..."
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h...","[[1.1141618, 0.0, 0.7985689, 0.0, 17.840979, 0..."


In [82]:
# Label of the row with index 3, fetched with a single .loc lookup.
train_df.loc[3, "label"]

0

In [13]:
# Type of the object stored in the "features" cell of the row with index 0.
type(train_df.loc[0, "features"])

numpy.ndarray

In [83]:
# Display the features stored for the row with index 0.
train_df.loc[0, "features"]

array([[ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [ 0.       ,  0.       ,  0.       , ...,  0.       ,  1.0427783,
         0.       ],
       [ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       ...,
       [ 7.8925724,  0.       ,  1.5643792, ...,  0.       ,  6.162105 ,
         0.       ],
       [ 0.       ,  0.       ,  4.50093  , ...,  0.       ,  0.       ,
         0.       ],
       [14.124166 ,  0.       ,  0.       , ...,  4.5912385,  0.       ,
         0.       ]], dtype=float32)

In [85]:
import torch

In [86]:
# Check that the stored features of the row with index 0 convert to a torch tensor.
torch.tensor(train_df.loc[0, "features"])

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  1.0428,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 7.8926,  0.0000,  1.5644,  ...,  0.0000,  6.1621,  0.0000],
        [ 0.0000,  0.0000,  4.5009,  ...,  0.0000,  0.0000,  0.0000],
        [14.1242,  0.0000,  0.0000,  ...,  4.5912,  0.0000,  0.0000]])

In [16]:
# Sanity check: report (once) if any stored feature array is not (100, 2048).
expected_shape = (100, 2048)
if any(feat.shape != expected_shape for feat in train_df["features"]):
    print('problem')

In [77]:
# Draw 3000 random integers in [0, 8500) from a seeded generator so that the
# spot check is reproducible after a kernel restart.
# NOTE(review): 8500 is presumably the number of rows of the training set,
# i.e. these are row positions rather than image ids — confirm.
ids = np.random.default_rng(seed=0).integers(low=0, high=8500, size=3000)

In [78]:
# Spot check: re-read the features of a random sample of training images
# directly from the LMDB store and compare them with the pickled DataFrame.
#
# The original loop iterated over an undefined name (`img_id`) and left `ids`
# unused. NOTE(review): `ids` is assumed to hold row positions of `train_df`
# (values in [0, 8500)), which are mapped to image ids here — confirm.
sampled_img_ids = train_df["id"].iloc[ids]

# Open the environment once for the whole check instead of once per image.
env_db = lmdb.open(path=lmdb_dir, subdir=True, readonly=True, readahead=False)
try:
    with env_db.begin() as txn:
        for img_id in sampled_img_ids:
            # database keys are 5-character, zero-padded image ids
            id_str = str(img_id).zfill(5)
            value = txn.get(id_str.encode())

            # an id that is missing from the database is reported as a problem
            if value is None:
                print('problem')
                break

            # NOTE: pickle.loads can execute arbitrary code; trusted data only.
            img_feat = pickle.loads(value)["features"]
            stored_feat = train_df.loc[train_df["id"] == img_id].iloc[0]["features"]

            if not np.array_equal(img_feat, stored_feat):
                print('problem')
                break
finally:
    # close the environment even if a lookup or comparison raises
    env_db.close()
