# Visual Reasoning Baseline Model

1. Data preprocessing을 어떻게 하면 될까?
 - Image feature를 ResNet으로 미리 추출해 둘까? --> baseline 돌리기 까다로움
 - DataLoader 구성을 어떻게 하면 좋을까? --> Dataset class 안에서 feature를 return 할까? 아니면 ResNet을 밖에 둘까?


2. Baseline Model 설계를 어떻게?
 - Encoder: image encoding을 어떻게 하면 될까? Answer Image 3개를 한꺼번에 encoding? 혹은 각각 encoding 한 후 Weighted Sum?
 - Decoder: Answer 후보 중 정답 후보를 어떻게 고를까? Similarity 기준? KL 같은 분포 기준? Attention 기반 Scoring?


3. SOTA Model 설계를 어떻게? 

In [2]:
import os
# Select the CUDA device ordering and the visible device list through the
# environment variables CUDA reads.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # see issue #152
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

## 1. Import all packages

In [3]:
import pandas as pd
import numpy as np
import json
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision
import torch.optim as optim
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from PIL import Image
import cv2

import matplotlib
import matplotlib.pyplot as plt
#import matplotlib.font_manager as fm
#fm.get_fontconfig_fonts()
#font_location = './NanumGothic.ttf'
#font_location = 'C:/Windows/Fonts/NanumGothic.ttf' # For Windows
#font_name = fm.FontProperties(fname=font_location).get_name()
#matplotlib.rc('font', family=font_name)

%matplotlib inline
# Render matplotlib figures inline, directly in the notebook (browser).


## 2. Preparation of datasets

In [4]:
class Config:
    """Hard-coded settings for the similarity1 baseline experiment.

    The ``args`` parameter is accepted but not read; every value below is a
    fixed default set in the constructor.
    """

    def __init__(self, args=None):
        # Runtime placement: use the GPU only when torch reports one.
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.distributed = False
        self.gpu_id = "0,1"

        # Dataset layout: HOME_DIR + TASK_NAME + FOLDER_NAME is one sample folder.
        self.HOME_DIR = "./datasets/"
        self.TASK_NAME = "similarity1/"
        self.FOLDER_NAME = "000003/"

        # Image files of the example sample folder named above.
        example_images = [
            "0d73dee440ef4291ae926fb5cb4ec55e.jpg",
            "18f827e0d01742d495d3ecbaffb6255a.jpg",
            "1b243a095866423da8f4a8f19d78ecf4.jpg",
            "4db9fa5bd947497d91bffd8de3b07e6e.jpg",
            "95ac10e3608e4abba9407a2a1cae2883.jpg",
            "a05e9630fd754a249b1fba0be5f386ed.jpg",
            "ea1f8a29ba464af4b2b393d4bb50d7c0.jpg",
        ]
        self.IMAGE_LIST = example_images
        self.JSON_NAME = "000003.json"

        # Model sizes: per-image feature width and the MLP hidden width.
        self.input_dim = 512
        self.mlp_hidden = 1024


#config = Config()
#a_image_file = config.HOME_DIR+config.TASK_NAME+config.FOLDER_NAME+config.IMAGE_LIST[0]
#a_image = plt.imread(a_image_file)
#plt.imshow(a_image)

In [5]:
def get_data(config):
    """Load the JSON annotation of every sample folder of the task.

    The task directory is ``config.HOME_DIR + config.TASK_NAME``. Each
    sub-directory ``<name>/`` must contain a file ``<name>.json`` whose
    top-level object has an ``"Answers"`` list with at least two entries.

    Args:
        config: object exposing ``HOME_DIR`` and ``TASK_NAME`` strings, each
            ending with a path separator (they are joined by concatenation).

    Returns:
        A list with one dict per sample folder. Each dict is the parsed JSON
        with ``"Answers"`` removed and three keys added: ``"file_path"`` (the
        sample folder, with trailing ``/``), ``"answer1"`` and ``"answer2"``
        (one-element lists wrapping the first and second answer).

    Raises:
        FileNotFoundError: a sample folder has no matching JSON file.
        KeyError / IndexError: the JSON lacks two ``"Answers"`` entries.
    """
    task_dir = config.HOME_DIR + config.TASK_NAME
    sample_list = []

    for directory in os.listdir(task_dir):
        file_path = task_dir + directory + "/"
        # Stray files next to the sample folders (e.g. OS metadata) are not
        # samples; skip them instead of failing on a bogus path.
        if not os.path.isdir(file_path):
            continue

        # Close the handle deterministically; JSON is UTF-8 by specification,
        # so do not depend on the platform's default encoding.
        with open(file_path + directory + ".json", encoding="utf-8") as fp:
            a_data = json.load(fp)

        a_data["file_path"] = file_path
        answers = a_data.pop("Answers")
        a_data["answer1"] = [answers[0]]
        a_data["answer2"] = [answers[1]]
        sample_list.append(a_data)

    return sample_list

def get_img_argumentation():
    """Build the image preprocessing pipeline applied to every loaded image.

    The steps are: resize to 356x356, take a random 224x224 crop, convert to
    a tensor, then normalize each channel with a fixed mean and std.

    Returns:
        A ``transforms.Compose`` chaining the four steps above.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        transforms.Resize((356, 356)),
        transforms.RandomCrop((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    return transforms.Compose(steps)


In [6]:
class Similarity1_Dataset(torch.utils.data.Dataset):
    """Dataset yielding one question image plus two groups of answer images.

    Each row of ``df`` must provide ``"file_path"`` (folder prefix),
    ``"correct_answer_group_ID"``, ``"Questions"``, ``"answer1"`` and
    ``"answer2"``, in the layout read by ``__getitem__`` below.

    Args:
        df: pandas DataFrame with one sample per row (accessed via ``.iloc``).
        config: optional configuration object; stored but not read here.
        transform: optional callable applied to every loaded RGB PIL image.
            When ``None``, images are returned untransformed.
    """

    def __init__(self, df, config=None, transform=None):
        self.df = df
        self.config = config
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def _load_image(self, path):
        """Read ``path`` as an RGB image and apply ``self.transform`` if set."""
        # ``convert`` returns a fully loaded copy, so the file handle can be
        # released right away instead of leaking one handle per image.
        with Image.open(path) as raw:
            image = raw.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image

    def __getitem__(self, idx):
        """Return the target and the loaded images of sample ``idx``.

        Returns:
            dict with ``"target"`` (first entry of ``correct_answer_group_ID``),
            ``"q_img"`` (the question image), and ``"a1_imgs"`` / ``"a2_imgs"``
            (lists with one loaded image per answer image of each group).
        """
        sample = self.df.iloc[idx]
        base = sample["file_path"]

        target = sample["correct_answer_group_ID"][0]
        q_img = base + sample["Questions"][0]["images"][0]["image_url"]
        a1_img = [base + ans_img["image_url"] for ans_img in sample["answer1"][0]["images"]]
        a2_img = [base + ans_img["image_url"] for ans_img in sample["answer2"][0]["images"]]

        q_img_feature = self._load_image(q_img)

        # Load each answer image from its OWN path. Previously the question
        # image was re-opened for every answer slot, and the second group
        # iterated over the first group's paths.
        a1_img_feature = [self._load_image(img) for img in a1_img]
        a2_img_feature = [self._load_image(img) for img in a2_img]

        return {
            "target": target,
            "q_img": q_img_feature,
            "a1_imgs": a1_img_feature,
            "a2_imgs": a2_img_feature
        }
        
    

In [7]:
def make_sequential(in_channels, out_channels, *args, **kwargs):
    """Build a Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d block.

    Args:
        in_channels: number of input channels of the convolution.
        out_channels: number of output channels (also the BatchNorm width).
        *args, **kwargs: forwarded unchanged to BOTH ``nn.Conv2d`` (after the
            channel arguments) and ``nn.MaxPool2d``, so they must be valid
            for both layers (e.g. ``kernel_size``, ``stride``, ``padding``).

    Returns:
        An ``nn.Sequential`` holding the four layers in that order.
    """
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, *args, **kwargs),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(),  # the class is ``ReLU``; ``nn.ReLu`` raises AttributeError
        nn.MaxPool2d(*args, **kwargs),
    )

class VRSimilarity(nn.Module):
    """Scores two answer groups (three images each) against a question image.

    Every image goes through one shared ResNet-50 whose final layer is
    replaced by a linear projection to ``config.input_dim`` features. For each
    answer group, the question features and the three answer features are
    concatenated and mapped by a small MLP to a single logit.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config

        # Shared image encoder: ResNet-50 with its classifier head swapped for
        # a projection to ``input_dim``.
        resnet = torchvision.models.resnet50(pretrained=True)
        resnet.fc = nn.Linear(resnet.fc.in_features, config.input_dim)
        self.backborne = resnet

        # Scoring head over [question, answer1, answer2, answer3] features.
        self.fc = nn.Sequential(
            nn.Linear(config.input_dim * 4, config.mlp_hidden),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(config.mlp_hidden, 1),
        )

    def forward(self, q_img, a1_img1, a1_img2, a1_img3, a2_img1, a2_img2, a2_img3):
        """Return one logit per answer group.

        Returns:
            dict with ``"q_a1_logit"`` and ``"q_a2_logit"``, each the output of
            the scoring head for the corresponding answer group.
        """
        encode = self.backborne

        # Encode in the fixed order: question, answer group 1, answer group 2.
        q_feat = encode(q_img)
        a1_feats = [encode(img) for img in (a1_img1, a1_img2, a1_img3)]
        a2_feats = [encode(img) for img in (a2_img1, a2_img2, a2_img3)]

        logits = {}
        for key, feats in (("q_a1_logit", a1_feats), ("q_a2_logit", a2_feats)):
            # Concatenate along the feature dimension, then score.
            logits[key] = self.fc(torch.cat([q_feat, *feats], dim=1))
        return logits

    '''
    def forward(self, samples):
        
        #Question Image Feature
        q = self.backborne(samples["q_img"])
        
        #Answer1 Image Feature
        a1_img1 = self.backborne(samples["a1_imgs"][0])
        a1_img2 = self.backborne(samples["a1_imgs"][1])
        a1_img3 = self.backborne(samples["a1_imgs"][2])
        
        #Answer2 Image Feature
        a2_img1 = self.backborne(samples["a2_imgs"][0])
        a2_img2 = self.backborne(samples["a2_imgs"][1])
        a2_img3 = self.backborne(samples["a2_imgs"][2])
        
        q_a1 = torch.cat([q, a1_img1, a1_img2, a1_img3], axis=1)
        q_a2 = torch.cat([q, a2_img1, a2_img2, a2_img3], axis=1)
        
        q_a1_logit = self.fc(q_a1)
        q_a2_logit = self.fc(q_a2)
        
        return {
            "q_a1_logit": q_a1_logit,
            "q_a2_logit": q_a2_logit
        }
    '''

In [8]:
def train_fn(model, train_loader, optimizer, loss_fn, config):
    """Run one pass over ``train_loader``, updating ``model`` after each batch.

    For every batch the model produces one logit per answer group. Each logit
    is passed through a sigmoid and scored with ``loss_fn`` against a binary
    target: group 1 uses ``target`` itself, group 2 uses ``target == 0``. The
    two losses are summed and back-propagated. After each batch the running
    loss (summed batch losses / predictions seen) and running accuracy are
    printed.

    Args:
        model: callable taking the question image and six answer images and
            returning a dict with ``"q_a1_logit"`` and ``"q_a2_logit"``.
        train_loader: iterable of batches with keys ``"q_img"``, ``"a1_imgs"``,
            ``"a2_imgs"`` (three tensors each) and ``"target"``.
        optimizer: optimizer over the model's parameters.
        loss_fn: loss taking probabilities and float targets of equal shape
            (the caller in this file passes ``nn.BCELoss``).
        config: object exposing ``device``.

    NOTE(review): assumes ``batch["target"]`` is a ``(batch,)`` tensor holding
    0/1 values and each logit is ``(batch, 1)`` -- confirm against the data.
    """
    total_count_correct = 0
    total_num_example = 0
    # Accumulate plain floats: keeping the loss tensors themselves would retain
    # every batch's autograd graph for the whole epoch.
    total_loss = 0.0

    model.train()
    device = config.device

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # Move the question image and both answer groups to the target device.
        q_img = batch["q_img"].to(device)
        a1_imgs = [batch["a1_imgs"][i].to(device) for i in range(3)]
        a2_imgs = [batch["a2_imgs"][i].to(device) for i in range(3)]
        logits = model(q_img, *a1_imgs, *a2_imgs)

        # Targets must live on the same device as the predictions.
        target = batch["target"].to(device)
        target_a1 = target.float()
        target_a2 = (target == 0).float()

        # Drop only the trailing singleton dimension so predictions match the
        # target shape; a bare ``squeeze()`` would also collapse a batch of 1.
        prob_a1 = torch.sigmoid(logits["q_a1_logit"]).squeeze(-1)
        prob_a2 = torch.sigmoid(logits["q_a2_logit"]).squeeze(-1)

        loss = loss_fn(prob_a1, target_a1) + loss_fn(prob_a2, target_a2)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Each sample contributes two binary predictions (one per group).
        predicted_a1 = (prob_a1 > 0.5).float()
        predicted_a2 = (prob_a2 > 0.5).float()
        total_count_correct += torch.sum(predicted_a1 == target_a1).item()
        total_count_correct += torch.sum(predicted_a2 == target_a2).item()
        total_num_example += target_a1.size(0) + target_a2.size(0)

        print("LOSS:", str(total_loss/total_num_example) + " Accuracy: " + str(total_count_correct/total_num_example) )


In [None]:
def main():
    """Build the data pipeline and model, then run one training pass.

    Steps: read all samples into a DataFrame, split off the training part,
    wrap it in a DataLoader (batch size 4), create the model on the configured
    device, and call ``train_fn`` with Adam (lr 1e-4) and BCE loss.
    """
    config = Config()

    # Data: only the training part of the split is used here.
    frame = pd.DataFrame(get_data(config))
    train_df, _ = train_test_split(frame)
    train_datasets = Similarity1_Dataset(
        train_df,
        transform=get_img_argumentation(),
    )
    train_loader = torch.utils.data.DataLoader(train_datasets, batch_size=4)

    # Model: single device; DataParallel / DistributedDataParallel wrapping
    # is not enabled.
    vrs1_model = VRSimilarity(config).to(config.device)

    train_fn(
        vrs1_model,
        train_loader,
        torch.optim.Adam(vrs1_model.parameters(), lr=1e-4),
        nn.BCELoss(),
        config,
    )

In [None]:
# Run the training entry point defined above.
main()

In [13]:
# Debug cell: rebuild the training DataLoader at notebook scope (the same
# steps main() performs) so the cells below can pull batches from it.
config = Config()

sample_list = get_data(config)
df = pd.DataFrame(sample_list)
# test_df is produced by the split but not used in the cells shown here.
train_df, test_df = train_test_split(df)
transform = get_img_argumentation()
train_datasets = Similarity1_Dataset(train_df, transform=transform)
train_loader = torch.utils.data.DataLoader(train_datasets, batch_size=4)

In [14]:
# Debug cell: a stock pretrained ResNet-50 (original classifier head kept),
# used below to test the forward pass in isolation from VRSimilarity.
a_model = torchvision.models.resnet50(pretrained=True)

In [15]:
# Move the stand-alone ResNet to the configured device; as the last expression
# of the cell, the returned module is what the notebook prints below.
device = config.device
a_model.to(config.device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [17]:
# Smoke test: forward only the first batch's question images through the
# stand-alone ResNet, then stop. The output recorded below shows this call
# failing with a cuDNN execution error.
for batch in tqdm(train_loader):   
    q_img = batch["q_img"].to(device)
    a_model(q_img)
    break

  0%|                                                                                                   | 0/593 [00:00<?, ?it/s]


RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED