## Step by Step of Front Part

#### Imports

In [15]:
import cv2
import torch
import numpy as np
import torch.nn as nn

In [2]:
class ImageEmbedding(nn.Module):
    """Image encoder: a frozen pretrained ResNet-50 followed by Linear + Tanh.

    Args:
        image_channel_type: when it equals ``'normi'`` (case-insensitive) a
            ``Normalize(p=2)`` layer is appended to the rebuilt layer stack.
        output_size: width of the embedding produced by ``fflayer``.
        mode: stored on the instance; not read anywhere in this class.
        extract_features: when true, ``forward`` skips the ResNet and feeds
            its input straight into ``fflayer``.
        features_dir: stored on the instance; not read anywhere in this class.
    """

    def __init__(self, image_channel_type='I', output_size=1024, mode='train',
                 extract_features=False, features_dir=None):
        super().__init__()

        # Pretrained backbone whose weights are never updated during training.
        self.extractor = models.resnet50(pretrained=True)
        for weight in self.extractor.parameters():
            weight.requires_grad_(False)

        # Every child of the backbone except the last one, optionally followed
        # by an L2 normalisation layer, is stored as `extractor.classifier`.
        # NOTE(review): it is not visible here whether the backbone's own
        # forward pass ever uses a `classifier` attribute - confirm.
        backbone_layers = list(self.extractor.children())[:-1]
        wants_normalisation = image_channel_type.lower() == 'normi'
        if wants_normalisation:
            backbone_layers.append(Normalize(p=2))
        self.extractor.classifier = nn.Sequential(*backbone_layers)

        # Projection head: expects 1000 input features.
        projection = nn.Linear(1000, output_size)
        self.fflayer = nn.Sequential(projection, nn.Tanh())

        # TODO: Get rid of this hack
        self.mode = mode
        self.extract_features = extract_features
        self.features_dir = features_dir

    def forward(self, image):
        """Return the ``fflayer`` embedding of ``image``.

        ``image`` is first passed through the backbone unless
        ``extract_features`` is set, in which case it is used as-is.
        """
        features = image if self.extract_features else self.extractor(image)
        return self.fflayer(features)

### Image Embedding Class

In [35]:
# Adapted from a senior's code (detect.py).
# Load one image from disk and turn it into a (1, C, H, W) float32 tensor
# with values in [0, 1].
image_path = "images/joTest3.png"
image = cv2.imread(image_path)
# cv2.imread does not raise for a missing/unreadable file, it returns None;
# fail here with a clear message instead of an AttributeError further down.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
orig_image = image.copy()
# BGR (OpenCV channel order) to RGB, as float32
image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB).astype(np.float32)
# make the pixel range between 0 and 1
image /= 255.0
# bring color channels to front: (H, W, C) -> (C, H, W)
image = np.transpose(image, (2, 0, 1))
# convert to a float32 tensor (no intermediate float64 copy is needed because
# the tensor is created as torch.float anyway)
image = torch.tensor(image, dtype=torch.float)
# add batch dimension
image = image.unsqueeze(0)

In [4]:
import torchvision.models as models

In [36]:
embed = ImageEmbedding()

In [37]:
output = embed(image)

## Descriptor Embeddings
Use sBERT for the embeddings; the descriptors themselves have to be obtained from the preprocessors.

In [38]:
from sentence_transformers import SentenceTransformer

In [39]:
model = SentenceTransformer('sentence-transformers/stsb-roberta-base')

#### 10/170489.png File

In [40]:
qa = ["I should buy a Butterfinger because it will make my life more fun. ", "I should eat butterfingers because the simpsoms say", "I should buy this product because it will make me as funny as Bart Simpson."]
sentiment = [["6", "14"], ["12"], ["11"]]
strat = [["5"], ["6"], ["2", "5"], ["2", "6", "8"], ["5", "6", "9"]]
topic = ["2", "2", "2"]

In [None]:
def transform(target_lst):
    """
    Return the most frequent numeric id among the annotations in target_lst.

    target_lst: either a list of lists of ids (e.g. [["6", "14"], ["12"]])
        or a flat list of ids (e.g. ["2", "2", "2"]). Each id is a number or
        a numeric string; a flat string entry is treated as ONE id, so "12"
        is the id 12 and not the two ids 1 and 2.

    Returns the id (as an int) that occurs most often. On a tie, the id that
    appears first in target_lst wins.

    Raises ValueError if target_lst contains no ids, or if an entry cannot be
    converted with int().
    """
    # flatten list and convert to int
    ids = []
    for entry in target_lst:
        if isinstance(entry, (str, int)):
            # A bare id: do not iterate over a string character by character.
            ids.append(int(entry))
        else:
            ids.extend(int(item) for item in entry)

    if not ids:
        raise ValueError("target_lst does not contain any ids")

    # Count occurrences in one pass; dicts keep insertion order, so max()
    # returns the first-seen id among those with the highest count.
    counts = {}
    for value in ids:
        counts[value] = counts.get(value, 0) + 1

    return max(counts, key=counts.get)

### Get the descriptors from the preprocessors (I think all of this needs to happen inside the Ads Dataset)

In [56]:
transform(sentiment) #amazed

6

In [57]:
transform(topic) #chocolate

2

In [58]:
transform(strat) #culture

5

### Embeddings for each  of the descriptors

In [10]:
sent = torch.tensor(model.encode("amazed"))
stra = torch.tensor(model.encode("culture"))
top = torch.tensor(model.encode("chocolate"))

In [11]:
first = torch.tensor(model.encode(qa[0]))

In [68]:
1024/4 #because I want to concatenate 4 qa combinations to have the size of 1024

256.0

In [12]:
qa_proc = nn.Sequential(nn.Linear(768,256),nn.Tanh()) # project the 768-d sentence embeddings down to 256-d, then apply tanh

In [13]:
qa_small = qa_proc(first) #get the 256-d word embedding for the question

In [14]:
qa_stra = torch.mul(first, stra) #multiply qa and strategy
qa_stra = qa_proc(qa_stra)

In [15]:
qa_top = torch.mul(first, top) #multiply qa and topic
qa_top = qa_proc(qa_top)

In [16]:
qa_sent = torch.mul(first, sent) #multiply qa and sentiment
qa_sent = qa_proc(qa_sent)

In [17]:
qa_res = torch.cat([qa_sent, qa_small, qa_top, qa_stra], dim=0) #concatenate the 4 256-d embeddings to become 1024-d

## Mutan Fusion & MLP from VQA
For Lydia, to show what the MLP output will look like.

In [18]:
class MutanFusion(nn.Module):
    """Fuse a question embedding and an image embedding.

    For each of ``num_layers`` branches, both embeddings are passed through
    their own ``Dropout(0.5) -> Linear(input_dim, out_dim) -> Tanh`` stack and
    multiplied element-wise. The per-branch products are summed and squashed
    with a final tanh.

    Args:
        input_dim: feature size of both the question and the image embedding.
        out_dim: feature size of the fused output.
        num_layers: number of parallel branches to sum over.
    """

    def __init__(self, input_dim, out_dim, num_layers):
        super(MutanFusion, self).__init__()
        self.input_dim = input_dim
        self.out_dim = out_dim
        self.num_layers = num_layers

        # Image branches are built before question branches, so the order in
        # which the Linear layers are initialised is the same as before.
        self.image_transformation_layers = self._make_branches()
        self.ques_transformation_layers = self._make_branches()

    def _make_branches(self):
        """Build ``num_layers`` independent Dropout -> Linear -> Tanh stacks."""
        return nn.ModuleList([
            nn.Sequential(
                nn.Dropout(p=0.5),
                nn.Linear(self.input_dim, self.out_dim),
                nn.Tanh())
            for _ in range(self.num_layers)
        ])

    def forward(self, ques_emb, img_emb):
        """Return the fused embedding, reshaped to ``(batch_size, out_dim)``.

        ``batch_size`` is taken from the first dimension of ``img_emb``.
        """
        batch_size = img_emb.size()[0]
        x_mm = []
        for i in range(self.num_layers):
            # Image branch first, then question branch (keeps the dropout
            # random draws in the same order as the original implementation).
            x_hv = self.image_transformation_layers[i](img_emb)
            x_hq = self.ques_transformation_layers[i](ques_emb)
            x_mm.append(torch.mul(x_hq, x_hv))

        # Sum the branch products over the branch axis.
        x_mm = torch.stack(x_mm, dim=1)
        x_mm = x_mm.sum(1).view(batch_size, self.out_dim)
        # torch.tanh needs only the top-level `torch` import, so the class no
        # longer depends on `torch.nn.functional as F` being imported later.
        return torch.tanh(x_mm)

In [19]:
mutan = MutanFusion(1024, 1024, 5)
mlp = nn.Sequential(nn.Linear(1024, 50))

In [20]:
import torch.nn.functional as F

In [21]:
combined = mutan(qa_res, output)



In [22]:
final = mlp(combined)

## Max understanding

In [26]:
import torch

In [70]:
_, preds = torch.max(final, 1)

In [40]:
check1 = torch.tensor([1,1,1,15])

In [41]:
(preds == check1).data

tensor([False, False, False,  True])

In [35]:
torch.sum((preds == check1).data)

tensor(1)

In [31]:
preds

tensor([15])

In [39]:
check1.size(0)

1

In [38]:
torch.max(final, 1)

torch.return_types.max(
values=tensor([0.2206], grad_fn=<MaxBackward0>),
indices=tensor([15]))

## Training Pipeline

In [1]:
from dataset import VQADataset
import torch
import cv2

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Userr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
ds = VQADataset()

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
def create_train_test_dataset(dataset: VQADataset):
    """Create an 85/15 train/test split over the samples of a VQADataset.

    Args:
        dataset (VQADataset): only ``len(dataset.info_path)`` is read, to
            determine how many sample indices exist. The returned subsets
            wrap two freshly constructed ``VQADataset`` instances (one with
            the training transforms, one with the evaluation transforms),
            not this object.

    Returns:
        (Subset, Subset): ``torch.utils.data.Subset`` views for training and
        testing. The split is shuffled with a fixed ``random_state`` of 24.
    """
    sample_count = len(dataset.info_path)
    all_indices = list(range(sample_count))

    # Shuffled split over the sample indices with a fixed seed.
    train_indices, test_indices = train_test_split(
        all_indices, train_size=0.85, shuffle=True, random_state=24)

    # Each split gets its own dataset instance because the transforms differ.
    train_source = VQADataset(transforms=get_transform(train=True))
    test_source = VQADataset(transforms=get_transform(train=False))

    return (torch.utils.data.Subset(train_source, train_indices),
            torch.utils.data.Subset(test_source, test_indices))

In [5]:
from main import get_transform

In [6]:
train_ds, test_ds = create_train_test_dataset(ds)

## Compile Combos

In [None]:
all_combinations = {}


In [11]:
lst = [6,3,3]
max(lst, key=lst.count)

3

In [3]:
from preprocess.descriptors import SentimentPreProcessor, StrategiesPreProcessor, TopicsPreProcessor

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Userr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
sent_proc = SentimentPreProcessor()
strat_proc = StrategiesPreProcessor()
top_proc = TopicsPreProcessor()

In [15]:
def basic_transform(target_lst):
    """
    Return the most frequent numeric id among the annotations in target_lst.

    target_lst: either a list of lists of ids (e.g. [["6", "14"], ["12"]])
        or a flat list of ids (e.g. ["2", "2", "2"]). Each id is a number or
        a numeric string; a flat string entry is treated as ONE id, so "12"
        is the id 12 and not the two ids 1 and 2.

    Returns the id (as an int) that occurs most often. On a tie, the id that
    appears first in target_lst wins.

    Raises ValueError if target_lst contains no ids, or if an entry cannot be
    converted with int().
    """
    # flatten list and convert to int
    ids = []
    for entry in target_lst:
        if isinstance(entry, (str, int)):
            # A bare id: do not iterate over a string character by character.
            ids.append(int(entry))
        else:
            ids.extend(int(item) for item in entry)

    if not ids:
        raise ValueError("target_lst does not contain any ids")

    # Count occurrences in one pass; dicts keep insertion order, so max()
    # returns the first-seen id among those with the highest count.
    counts = {}
    for value in ids:
        counts[value] = counts.get(value, 0) + 1

    return max(counts, key=counts.get)

In [16]:
from preprocess.descriptors import cosine_sim

In [31]:
def transform(target_lst, descriptor):
    """
    Reduce the annotations of one descriptor to a single representative label.

    target_lst: a list of lists; each element is either a numeric id (number
        or numeric string) or a free-text phrase.
    descriptor: "sentiment" or "topic" select the matching preprocessor; any
        other value selects the strategies preprocessor.

    Returns:
        - if at least one non-zero id is present: the word that the
          preprocessor's ``id_to_word`` maps the most frequent non-zero id to
          (free-text phrases are ignored in that case);
        - otherwise: the free-text phrase whose summed cosine similarity to
          all the other phrases is highest.

    Raises ValueError if target_lst holds neither a non-zero id nor a phrase.
    """
    # flatten list
    flat_lst = [item for sublist in target_lst for item in sublist]

    if descriptor == "sentiment":
        proc = sent_proc
    elif descriptor == "topic":
        proc = top_proc
    else:
        proc = strat_proc

    # Separate numeric ids from free-text phrases.
    num_lst = []
    phrases = []
    for el in flat_lst:
        try:
            num_lst.append(int(el))
        except ValueError:
            phrases.append(el)

    # Drop EVERY 0 before voting (list.remove(0) would only drop the first
    # one, letting repeated zeros win the vote and reach id_to_word[0]).
    # NOTE(review): 0 is assumed not to be a valid id - confirm.
    valid_ids = [num for num in num_lst if num != 0]
    if valid_ids:
        # Most frequent id; on a tie the first one in the list wins.
        return proc.id_to_word[max(valid_ids, key=valid_ids.count)]

    if not phrases:
        raise ValueError(
            "target_lst contains neither a non-zero id nor a text phrase")

    # Only text is usable, so try to find the most represented phrase: embed
    # each phrase (done lazily, only when it is actually needed) and score it
    # by its summed cosine similarity to all the others.
    vec_lst = [proc.text_embed_model.get_vector_rep(phrase)
               for phrase in phrases]
    cosines = [
        sum(cosine_sim(vec, other)
            for j, other in enumerate(vec_lst) if j != i)
        for i, vec in enumerate(vec_lst)
    ]

    return phrases[cosines.index(max(cosines))]

In [43]:
all_data = {}

In [30]:
sent_proc.id_to_word.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30])

In [44]:
# Build one record per (QA sentence, slogan) pair of every dataset item and
# store it in `all_data` under a running integer id that starts at 1.
main_idx = 1
for image_id, qa, sentiments, strategies, topics, slogans in ds:
    # reduce each descriptor's annotations to one representative label
    sentiment = transform(sentiments, "sentiment")
    strategy = transform(strategies, "strategy")
    topic = transform(topics, "topic")
    # combine the chosen sentiment/strategy/topic with each combination of qa and slogan
    for question in qa:
        for slogan in slogans:
            # "Slogan id" duplicates the dictionary key inside the record itself
            all_data[main_idx] = {"Slogan id": main_idx, "Image": image_id, "Sentiment": sentiment, "Strategy": strategy, "Topic":topic, "QA": question, "Slogan": slogan}
            main_idx += 1
    



DOES NOT EXIST hilariouss
DOES NOT EXIST 12oz
DOES NOT EXIST threat/violence
DOES NOT EXIST textee
DOES NOT EXIST 02/03/2015
DOES NOT EXIST die/reincarnate
DOES NOT EXIST iceburg
DOES NOT EXIST chimmney


In [41]:
import json

In [45]:
# Serialise every slogan/descriptor combination to disk.
# JSON object keys are always strings, so the integer ids used as keys in
# `all_data` are written as "1", "2", ... in the file.
json_object = json.dumps(all_data)
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("slogan_descriptor_combos.json", "w") as outfile:
    outfile.write(json_object)

## Continue Training Pipeline

In [7]:
train_ds.indices[1]

1084

In [8]:
train_ds.dataset['1084']

(tensor([[[0.6353, 0.6392, 0.6392,  ..., 0.6275, 0.6314, 0.6314],
          [0.6392, 0.6392, 0.6353,  ..., 0.6275, 0.6275, 0.6314],
          [0.6353, 0.6392, 0.6392,  ..., 0.6275, 0.6275, 0.6275],
          ...,
          [0.6784, 0.6784, 0.6784,  ..., 0.6431, 0.6392, 0.6314],
          [0.6784, 0.6824, 0.6824,  ..., 0.6431, 0.6431, 0.6431],
          [0.9294, 0.9373, 0.9294,  ..., 0.9294, 0.9216, 0.9294]],
 
         [[0.7529, 0.7569, 0.7569,  ..., 0.7451, 0.7490, 0.7490],
          [0.7569, 0.7569, 0.7529,  ..., 0.7451, 0.7451, 0.7490],
          [0.7529, 0.7569, 0.7569,  ..., 0.7451, 0.7451, 0.7451],
          ...,
          [0.8039, 0.8000, 0.8039,  ..., 0.7608, 0.7608, 0.7529],
          [0.7882, 0.7882, 0.7922,  ..., 0.7490, 0.7569, 0.7490],
          [0.9647, 0.9647, 0.9647,  ..., 0.9569, 0.9569, 0.9569]],
 
         [[0.7922, 0.7961, 0.7961,  ..., 0.7843, 0.7882, 0.7882],
          [0.7961, 0.7961, 0.7922,  ..., 0.7843, 0.7843, 0.7882],
          [0.7922, 0.7961, 0.7961,  ...,

In [10]:
data = test_ds.dataset.combos['10']
image_id = data["Image"]

In [21]:
train_ds.dataset.combos['16949']

{'Slogan id': 16949,
 'Image': '10/172579.png',
 'Sentiment': 'creative',
 'Strategy': 'Literal',
 'Topic': 'restaurant',
 'QA': 'I should use Citibank for my financial needs because they are on the cutting edge of online banking and convenient service that works for me.',
 'Slogan': 'vI like being myself. Maybe just slimmer, with a few less wrinkles.'}

In [8]:
image, target, answer = train_ds.dataset['1084']

In [9]:
qa = target["qa"]
sentiment = target["sentiment"]
strategy = target["strategy"]
topic = target["topic"]
slogan = target["slogan"]

In [10]:
from vqa import VQAModel

In [11]:
model = VQAModel(output_size=len(ds.info_path)).double()

In [12]:
image = image.unsqueeze_(0)

In [13]:
ans_scores = model(image, torch.from_numpy(qa), torch.from_numpy(sentiment), torch.from_numpy(strategy), torch.from_numpy(topic))



In [14]:
_, preds = torch.max(ans_scores, 1)
criterion = nn.CrossEntropyLoss()
loss = criterion(ans_scores, slogan_id)

NameError: name 'nn' is not defined

In [14]:
import numpy as np

In [15]:
# Adapted from a senior's code (detect.py).
# Load the image referenced by `image_id` and turn it into a (1, C, H, W)
# float32 tensor with values in [0, 1].
image_path = "data/" + image_id
image = cv2.imread(image_path)
# cv2.imread does not raise for a missing/unreadable file, it returns None;
# fail here with a clear message instead of an AttributeError further down.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
orig_image = image.copy()
# BGR (OpenCV channel order) to RGB, as float32
image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB).astype(np.float32)
# make the pixel range between 0 and 1
image /= 255.0
# bring color channels to front: (H, W, C) -> (C, H, W)
image = np.transpose(image, (2, 0, 1))
# convert to a float32 tensor (no intermediate float64 copy is needed because
# the tensor is created as torch.float anyway)
image = torch.tensor(image, dtype=torch.float)
# add batch dimension
image = image.unsqueeze(0)

In [16]:
# ans_scores = model(image, qa, sentiment, strategy, topic)
ans_scores = model(image, torch.from_numpy(qa), torch.from_numpy(sentiment), torch.from_numpy(strategy), torch.from_numpy(topic))
_, preds = torch.max(ans_scores, 1)
criterion = nn.CrossEntropyLoss()
#loss = criterion(ans_scores, slogan_id)

In [19]:
slogan_id = torch.tensor([1048])
loss = criterion(ans_scores, slogan_id)

In [20]:
check = torch.Tensor([1,2,3,4,5,5])

In [24]:
loss.backward()

In [27]:
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
output = criterion(input, target)
output.backward()

In [33]:
loss.item()

10.432587396908128

In [17]:
from vqa import RANQ, ImageEmbedding, QaEmbedding

In [18]:
checq = RANQ()

In [19]:
img_emv, desc_emb = checq.forward(image, torch.from_numpy(qa), torch.from_numpy(sentiment), torch.from_numpy(strategy), torch.from_numpy(topic))

In [1]:
from vqa import MutanFusion

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Userr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
muton = MutanFusion(1024, 1024, 5).double()

In [31]:
ok_imgh = img_emv

In [21]:
muton(img_emv, desc_emb)

RuntimeError: expected scalar type Double but found Float