In [1]:
import torch, torchvision
import matplotlib.pyplot as plt
import json
import cv2
import numpy as np
from copy import deepcopy
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.structures.image_list import ImageList
from detectron2.data import transforms as T
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputs
from detectron2.structures.boxes import Boxes
from detectron2.layers import nms
from detectron2 import model_zoo
from detectron2.config import get_cfg

### detectron2

In [2]:
cfg_path = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"

def load_config_and_model_weights(cfg_path):
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(cfg_path))

    # ROI HEADS SCORE THRESHOLD
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5

    # Comment the next line if you're using 'cuda'
    cfg['MODEL']['DEVICE']='cuda'

    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(cfg_path)

    return cfg

cfg = load_config_and_model_weights(cfg_path)

In [3]:
def get_model(cfg):
    # build model
    det_model = build_model(cfg)

    # load weights
    checkpointer = DetectionCheckpointer(det_model)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    # eval mode
    det_model.eval()
    return det_model

det_model = get_model(cfg)

The checkpoint state_dict contains keys that are not used by the model:
  [35mproposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}[0m


In [4]:
def prepare_image_inputs(cfg, img_list):
    # Resizing the image according to the configuration
    transform_gen = T.ResizeShortestEdge(
                [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
            )
    img_list = [transform_gen.get_transform(img).apply_image(img) for img in img_list]

    # Convert to C,H,W format
    convert_to_tensor = lambda x: torch.Tensor(x.astype("float32").transpose(2, 0, 1))

    batched_inputs = [{"image":convert_to_tensor(img), "height": img.shape[0], "width": img.shape[1]} for img in img_list]

    # Normalizing the image
    num_channels = len(cfg.MODEL.PIXEL_MEAN)
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1)
    normalizer = lambda x: (x - pixel_mean) / pixel_std
    images = [normalizer(x["image"]) for x in batched_inputs]

    # Convert to ImageList
    images =  ImageList.from_tensors(images,det_model.backbone.size_divisibility)
    
    return images, batched_inputs

#images, batched_inputs = prepare_image_inputs(cfg, [img_bgr1, img_bgr2])

In [5]:
def get_features(model, images):
    features = model.backbone(images.tensor.cuda())
    return features

#features = get_features(model, images)

In [6]:
def get_proposals(model, images, features):
    proposals, _ = model.proposal_generator(images, features)
    return proposals

#proposals = get_proposals(model, images, features)

In [7]:
def get_box_features(model, features, proposals):
    features_list = [features[f] for f in ['p2', 'p3', 'p4', 'p5']]
    box_features = model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
    box_features = model.roi_heads.box_head.flatten(box_features)
    box_features = model.roi_heads.box_head.fc1(box_features)
    box_features = model.roi_heads.box_head.fc_relu1(box_features)
    box_features = model.roi_heads.box_head.fc2(box_features)

    box_features = box_features.reshape(1, 1000, 1024) # depends on your config and batch size
    return box_features, features_list

#box_features, features_list = get_box_features(model, features, proposals)

In [8]:
def get_prediction_logits(model, features_list, proposals):
    cls_features = model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
    cls_features = model.roi_heads.box_head(cls_features)
    pred_class_logits, pred_proposal_deltas = model.roi_heads.box_predictor(cls_features)
    return pred_class_logits, pred_proposal_deltas

#pred_class_logits, pred_proposal_deltas = get_prediction_logits(model, features_list, proposals)

def get_box_scores(cfg, pred_class_logits, pred_proposal_deltas, proposals):
    box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
    smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA

    outputs = FastRCNNOutputs(
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta,
    )

    boxes = outputs.predict_boxes()
    scores = outputs.predict_probs()
    image_shapes = outputs.image_shapes

    return boxes, scores, image_shapes

#boxes, scores, image_shapes = get_box_scores(cfg, pred_class_logits, pred_proposal_deltas)

def get_output_boxes(boxes, batched_inputs, image_size):
    proposal_boxes = boxes.reshape(-1, 4).clone()
    scale_x, scale_y = (batched_inputs["width"] / image_size[1], batched_inputs["height"] / image_size[0])
    output_boxes = Boxes(proposal_boxes)

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(image_size)

    return output_boxes

#output_boxes = [get_output_boxes(boxes[i], batched_inputs[i], proposals[i].image_size) for i in range(len(proposals))]

In [9]:
def select_boxes(cfg, output_boxes, scores):
    test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
    test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
    cls_prob = scores.detach()
    cls_boxes = output_boxes.tensor.detach().reshape(1000,80,4)
    max_conf = torch.zeros((cls_boxes.shape[0])).cpu()
    for cls_ind in range(0, cls_prob.shape[1]-1):
        cls_scores = cls_prob[:, cls_ind+1]
        det_boxes = cls_boxes[:,cls_ind,:]
        keep = np.array(nms(det_boxes, cls_scores, test_nms_thresh).cpu())
        max_conf[keep] = torch.where(cls_scores.cpu()[keep] > max_conf[keep], cls_scores.cpu()[keep], max_conf[keep])
    keep_boxes = torch.where(max_conf >= test_score_thresh)[0]
    return keep_boxes, max_conf

In [10]:
MIN_BOXES=10
MAX_BOXES=100
def filter_boxes(keep_boxes, max_conf, min_boxes, max_boxes):
    if len(keep_boxes) < min_boxes:
        keep_boxes = np.argsort(max_conf).numpy()[::-1][:min_boxes]
    elif len(keep_boxes) > max_boxes:
        keep_boxes = np.argsort(max_conf).numpy()[::-1][:max_boxes]
    return keep_boxes

#keep_boxes = [filter_boxes(keep_box, mx_conf, MIN_BOXES, MAX_BOXES) for keep_box, mx_conf in zip(keep_boxes, max_conf)]

In [11]:
def get_visual_embeds(box_features, keep_boxes):
    return box_features[keep_boxes.copy()]

#visual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]

In [12]:
def return_visual_embeds(img_path):
  img = plt.imread(img_path, 0)
  img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
  images, batched_inputs = prepare_image_inputs(cfg, [img_bgr])
  features = get_features(det_model, images)
  proposals = get_proposals(det_model, images, features)
  box_features, features_list = get_box_features(det_model, features, proposals)
  pred_class_logits, pred_proposal_deltas = get_prediction_logits(det_model, features_list, proposals)
  boxes, scores, image_shapes = get_box_scores(cfg, pred_class_logits, pred_proposal_deltas, proposals)
  output_boxes = [get_output_boxes(boxes[i], batched_inputs[i], proposals[i].image_size) for i in range(len(proposals))]
  temp = [select_boxes(cfg, output_boxes[i], scores[i]) for i in range(len(scores))]
  keep_boxes, max_conf = [],[]
  for keep_box, mx_conf in temp:
      keep_boxes.append(keep_box)
      max_conf.append(mx_conf)

  MIN_BOXES=10
  MAX_BOXES=100
  def filter_boxes(keep_boxes, max_conf, min_boxes, max_boxes):
      if len(keep_boxes) < min_boxes:
          keep_boxes = np.argsort(max_conf).numpy()[::-1][:min_boxes]
      elif len(keep_boxes) > max_boxes:
          keep_boxes = np.argsort(max_conf).numpy()[::-1][:max_boxes]
      return keep_boxes

  keep_boxes = [filter_boxes(keep_box, mx_conf, MIN_BOXES, MAX_BOXES) for keep_box, mx_conf in zip(keep_boxes, max_conf)]

  visual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]

  return visual_embeds


### prepare meme dataset

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import tqdm
from tqdm import trange
import torch
from torch import nn, optim
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [14]:
# Read the data
train = pd.read_csv('memes_train.csv')
test = pd.read_csv('memes_test.csv')
print('training dataset: ', train.shape)
print('testing dataset: ', test.shape)

training dataset:  (6625, 10)
testing dataset:  (1657, 10)


In [15]:
# use cuda if available
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [16]:
# prepare the dataloader
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import transforms
from torchvision.transforms import ToTensor
from PIL import Image
import requests

class MemeDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # read the data from the dataframe
        row = self.df.iloc[idx]
        image = row['image_path']
        # image = Image.open(image).convert('RGB')
        # if self.transform:
        #     image = Image.open(image).convert('RGB')
        #     image = self.transform(image)        
        text = str(row['image_text'])
        # date = row['created_at']
        label = row['label']

        return image, text, label

# define transforms
transform = {
        'train' : transforms.Compose([transforms.Resize((224, 224)),
                                      transforms.RandomHorizontalFlip(), 
                                      transforms.RandomRotation(50),
                                      transforms.ToTensor()]),

        'valid' : transforms.Compose([transforms.Resize((224, 224)),
                                      transforms.RandomHorizontalFlip(),
                                      transforms.ToTensor()])
        }


In [17]:
# splite the training set into train and valid
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(train, test_size=0.2, random_state=42)
# get 2000 samples for training, 500 for validation
random_state = 21
train_df = train_df.sample(n=3000, random_state=random_state)
valid_df = valid_df.sample(n=800, random_state=random_state)

print('training dataset: ', train_df.shape)
print('validation dataset: ', valid_df.shape)

training dataset:  (3000, 10)
validation dataset:  (800, 10)


In [18]:
# load the dataset
train_dataset = MemeDataset(train_df)
valid_dataset = MemeDataset(valid_df)
test_dataset = MemeDataset(test)

In [19]:
# define the dataloader
batch_size = 32
num_workers = 2
train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=20, num_workers=num_workers, shuffle=False)

### visualbert and training process

In [20]:
# load the model
from transformers import BertTokenizer, VisualBertModel
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

Some weights of the model checkpoint at uclanlp/visualbert-vqa-coco-pre were not used when initializing VisualBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing VisualBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [70]:
image = '/home/keyan/Documents/GitHub/meme_models/images/Cnmi68uWEAAmxD6.jpg'

inputs = tokenizer("The capital of France is Paris.", return_tensors="pt", padding=True, truncation=True)
visual_embeds = return_visual_embeds(image)[0].unsqueeze(0)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

In [71]:
visual_embeds.shape

torch.Size([1, 100, 1024])

In [72]:
inputs.update(
    {
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": visual_token_type_ids,
        "visual_attention_mask": visual_attention_mask,
    }
)
inputs.to(device)
model = model.to(device)
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

RuntimeError: mat1 and mat2 shapes cannot be multiplied (100x1024 and 2048x768)