# Get VOC 2007 train data

In [1]:
import os
import torchvision.transforms as transforms
import torchvision

In [2]:
def read_voc_dataset(path, year):
    T = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor(),])
    voc_data =  torchvision.datasets.VOCDetection(path, year=year, image_set='train', transform=T, download=True)
    voc_val =  torchvision.datasets.VOCDetection(path, year=year, image_set='val', transform=T, download=True)
    return voc_data, voc_val

In [3]:
train_loader2007, val_loader2007 = read_voc_dataset(path="." ,year='2007')

Using downloaded and verified file: .\VOCtrainval_06-Nov-2007.tar
Extracting .\VOCtrainval_06-Nov-2007.tar to .
Using downloaded and verified file: .\VOCtrainval_06-Nov-2007.tar
Extracting .\VOCtrainval_06-Nov-2007.tar to .


In [4]:
train_loader2007

Dataset VOCDetection
    Number of datapoints: 2501
    Root location: .
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
               ToTensor()
           )

In [5]:
train_loader2007[0]

(tensor([[[0.2667, 0.2588, 0.2588,  ..., 0.2314, 0.2196, 0.2078],
          [0.2667, 0.2588, 0.2588,  ..., 0.2431, 0.2314, 0.2235],
          [0.2667, 0.2667, 0.2667,  ..., 0.2471, 0.2471, 0.2471],
          ...,
          [0.2980, 0.3020, 0.3098,  ..., 0.3333, 0.3176, 0.3137],
          [0.3098, 0.3098, 0.3176,  ..., 0.3216, 0.3216, 0.3216],
          [0.3216, 0.3216, 0.3216,  ..., 0.3137, 0.3137, 0.3098]],
 
         [[0.2667, 0.2588, 0.2588,  ..., 0.2235, 0.2235, 0.2157],
          [0.2667, 0.2588, 0.2588,  ..., 0.2353, 0.2353, 0.2314],
          [0.2667, 0.2667, 0.2667,  ..., 0.2431, 0.2549, 0.2510],
          ...,
          [0.2941, 0.2941, 0.3020,  ..., 0.3176, 0.3176, 0.3137],
          [0.3059, 0.3059, 0.3098,  ..., 0.3176, 0.3216, 0.3216],
          [0.3176, 0.3176, 0.3137,  ..., 0.3098, 0.3137, 0.3098]],
 
         [[0.2588, 0.2549, 0.2588,  ..., 0.2275, 0.2235, 0.2118],
          [0.2588, 0.2549, 0.2588,  ..., 0.2392, 0.2353, 0.2275],
          [0.2588, 0.2627, 0.2667,  ...,

In [6]:
val_loader2007

Dataset VOCDetection
    Number of datapoints: 2510
    Root location: .
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=None)
               ToTensor()
           )

# Sort Class

In [7]:
import tqdm.notebook as tq

In [8]:
classes = ['cat', 'cow', 'dog', 'bird', 'car']

In [9]:
def sort_class_extract(datasets):
    datasets_per_class = {}
    
    
    for j in classes:
        datasets_per_class[j] = {}

    for dataset in datasets:
        for i in tq.tqdm(dataset):
            
            img, target = i
            obj = target['annotation']['object']
            
            if isinstance(obj, list):
                classe = target['annotation']['object'][0]["name"]
            else:
                classe = target['annotation']['object']["name"]
                
            filename = target['annotation']['filename']

            org = {}
            
            for j in classes:
                org[j] = []
                org[j].append(img)
            
            if isinstance(obj, list):
                for j in range(len(obj)):
                    classe = obj[j]["name"]
                    if classe in classes:
                        org[classe].append([obj[j]["bndbox"], target['annotation']['size']])
            else:
                if classe in classes:
                    org[classe].append([obj["bndbox"], target['annotation']['size']])
                    
            for j in classes:
                if len(org[j]) > 1:
                    try:
                        datasets_per_class[j][filename].append(org[j])
                    except KeyError:
                        datasets_per_class[j][filename] = []
                        datasets_per_class[j][filename].append(org[j])
                        
    return datasets_per_class

In [10]:
datasets_per_class = sort_class_extract([train_loader2007])

  0%|          | 0/2501 [00:00<?, ?it/s]

In [11]:
len(datasets_per_class)

5

In [12]:
classes = ['cat', 'cow', 'dog', 'bird', 'car']

img_num = 0

for i in tq.tqdm(range(len(classes))):
    classe = classes[i]
    print("Classe " + str(classe) + "...")
    print(len(datasets_per_class[classe]))
    # print(datasets_per_class[classe])
    
    img_num += len(datasets_per_class[classe])

  0%|          | 0/5 [00:00<?, ?it/s]

Classe cat...
166
Classe cow...
71
Classe dog...
210
Classe bird...
182
Classe car...
402


In [13]:
img_num

1031

# Train

In [74]:
from config import *
import sys
from torch.autograd import Variable
# import torchvision
import torch

In [75]:
class FeatureExtractor(nn.Module):
    def __init__(self, network='vgg16'):
        print(network)
        super(FeatureExtractor, self).__init__()
        if network == 'vgg16':
            model = torchvision.models.vgg16(pretrained=True)
        elif network == 'resnet50':
            model = torchvision.models.resnet50(pretrained=True)
        else:
            model = torchvision.models.alexnet(pretrained=True)
        model.eval() # to not do dropout
        self.features = list(model.children())[0]
        if network == 'vgg16':
            self.classifier = nn.Sequential(*list(model.classifier.children())[:-2])
        else:
            self.classifier = nn.Sequential(*list(model.children())[:-2])
    def forward(self, x):
        x = self.features(x)
        return x

In [76]:
class Policy_net(nn.Module):
    def __init__(self):
        super(Policy_net, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=81 + 25088, out_features=1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(in_features=1024, out_features=512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(in_features=512, out_features=9)
        )
    def forward(self, x):
        return torch.nn.functional.softmax(self.classifier(x),dim=1)

In [77]:
def extract(index, loader):
    extracted = loader[index]
    ground_truth_boxes =[]
    
    for ex in extracted:
        img = ex[0]
        bndbox = ex[1][0]
        size = ex[1][1]
        
        xmin = (float(bndbox['xmin']) /  float(size['width'])) * 224
        xmax = (float(bndbox['xmax']) /  float(size['width'])) * 224

        ymin = (float(bndbox['ymin']) /  float(size['height'])) * 224
        ymax = (float(bndbox['ymax']) /  float(size['height'])) * 224

        ground_truth_boxes.append([xmin, xmax, ymin, ymax])
        
    return img, ground_truth_boxes

In [78]:
def get_features(image, feature_extractor, dtype=FloatTensor):
    global transform
    image = image.view(1, *image.shape)
    image = Variable(image).type(dtype)
    # if use_cuda:
        # image = image.cuda()

    feature = feature_extractor(image)
    # print("Feature shape: " + str(feature.shape))

    return feature.data

In [79]:
def compose_state(image, actions_history, feature_extractor, dtype=FloatTensor):
    image_feature = get_features(image, feature_extractor, dtype)
    image_feature = image_feature.view(1,-1)

    # print("image feature: " + str(image_feature.shape))

    history_flatten = actions_history.view(1,-1).type(dtype)

    state = torch.cat((image_feature, history_flatten), 1)
    return state

In [80]:
def choose_action(state, policy_net):
    probs = policy_net(state)
    m = torch.distributions.Categorical(probs)
    action = m.sample()
    return action

In [81]:
def rewrap(coord):
    return min(max(coord, 0.0), 224.0)

In [93]:
def calculate_position_box(actions, xmin=0.0, xmax=224.0, ymin=0.0, ymax=224.0):
    alpha = 0.1
    
    alpha_h = alpha * (ymax - ymin)
    alpha_w = alpha * (xmax - xmin)
    
    real_x_min, real_x_max, real_y_min, real_y_max = 0, 224, 0, 224

    for r in actions:
        if r == 1: # Right
            real_x_min += alpha_w
            real_x_max += alpha_w
        if r == 2: # Left
            real_x_min -= alpha_w
            real_x_max -= alpha_w
        if r == 3: # Up 
            real_y_min -= alpha_h
            real_y_max -= alpha_h
        if r == 4: # Down
            real_y_min += alpha_h
            real_y_max += alpha_h
        if r == 5: # Bigger
            real_y_min -= alpha_h
            real_y_max += alpha_h
            real_x_min -= alpha_w
            real_x_max += alpha_w
        if r == 6: # Smaller
            real_y_min += alpha_h
            real_y_max -= alpha_h
            real_x_min += alpha_w
            real_x_max -= alpha_w
            
            if real_y_min >= real_y_max:
                real_y_min -= alpha_h
                real_y_max += alpha_h
                
            if real_x_min >= real_x_max:
                real_x_min -= alpha_w
                real_x_max += alpha_w
            
        if r == 7: # Fatter
            real_y_min += alpha_h
            real_y_max -= alpha_h
            
            if real_y_min >= real_y_max:
                real_y_min -= alpha_h
                real_y_max += alpha_h
                
        if r == 8: # Taller
            real_x_min += alpha_w
            real_x_max -= alpha_w
            
            if real_x_min >= real_x_max:
                real_x_min -= alpha_w
                real_x_max += alpha_w
            
    real_x_min, real_x_max, real_y_min, real_y_max = rewrap(real_x_min), rewrap(real_x_max), rewrap(real_y_min), rewrap(real_y_max)
    
    if real_x_max == 0.0:
        real_x_max += alpha_w
        
    if real_x_min == 224.0:
        real_x_min -= alpha_w
        
    if real_y_max == 0.0:
        real_y_max += alpha_h
        
    if real_y_min == 224.0:
        real_y_min -= alpha_h
    
    return [real_x_min, real_x_max, real_y_min, real_y_max]

In [83]:
def intersection_over_union(box1, box2):
    x11, x21, y11, y21 = box1
    x12, x22, y12, y22 = box2

    yi1 = max(y11, y12)
    xi1 = max(x11, x12)
    yi2 = min(y21, y22)
    xi2 = min(x21, x22)
    inter_area = max(((xi2 - xi1) * (yi2 - yi1)), 0)
    box1_area = (x21 - x11) * (y21 - y11)
    box2_area = (x22 - x12) * (y22 - y12)
    union_area = box1_area + box2_area - inter_area

    iou = inter_area / union_area
    return iou

In [84]:
def get_max_bdbox(ground_truth_boxes, coord):
    max_iou = False
    max_gt = []
    for ground_truth in ground_truth_boxes:
        iou = intersection_over_union(coord, ground_truth)
        if max_iou == False or max_iou < iou:
            max_iou = iou
            max_ground_truth = ground_truth
    return max_ground_truth

In [85]:
def compute_trigger_reward(actual_state, ground_truth):
    res = intersection_over_union(actual_state, ground_truth)
    # if res>=self.threshold:
        # return self.nu
    # return -1*self.nu
    return res

In [86]:
def update_history(action, actions_history):
    action_vector = torch.zeros(9)
    action_vector[action] = 1
    size_history_vector = len(torch.nonzero(actions_history))
    
    # print(torch.nonzero(actions_history))
    # print(size_history_vector)
    
    if size_history_vector < 9:
        actions_history[size_history_vector][action] = 1
    else:
        for i in range(8, 0, -1):
            actions_history[i][:] = actions_history[i-1][:]
        actions_history[0][:] = action_vector[:] 
        
    # print(actions_history)
    
    return actions_history

In [87]:
xmin = 0.0
xmax = 224.0
ymin = 0.0
ymax = 224.0

In [98]:
for i in tq.tqdm(range(len(classes))):
    classe = classes[i]
    print("Classe " + str(classe) + "...")
    
    num_episodes=5
    model_name='vgg16'
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
    feature_extractor = FeatureExtractor(network=model_name)
    feature_extractor.cuda()
    
    policy_net = Policy_net()
    policy_net.cuda()
    
    optimizer = torch.optim.Adam(policy_net.parameters(), lr=1e-6)
    # optimizer.cuda()
    
    for i_episode in range(num_episodes):
        print("Episode" + " " + str(i_episode))
        
        cnt = 0
        
        sum_loss = 0.0
        
        for key, _ in datasets_per_class[classe].items():
            cnt += 1
            # print("item: ", cnt)
            
            image, ground_truth_boxes = extract(key, datasets_per_class[classe])
            # print(image)
            # print(ground_truth_boxes)
            
            original_image = image.clone()
            ground_truth = ground_truth_boxes[0]
            
            all_actions = []
            
            actions_history = torch.ones((9,9))
            state = compose_state(image, actions_history, feature_extractor)
            
            original_coordinates = [xmin, xmax, ymin, ymax]
            new_image = image
            done = False
            
            t = 0 #timer
            
            actual_equivalent_coord = original_coordinates
            
            transition_dict = {
                'states': [],
                'actions': [],
                'next_states': [],
                'rewards': [],
                'dones': []
            }
            
            while not done:
                t += 1
                action = choose_action(state, policy_net)
                # print(action)
                all_actions.append(action)
                
                if action == 0:
                    # print("yes")
                    next_state = None
                    new_equivalent_coord = calculate_position_box(all_actions)
                    # print(new_equivalent_coord)
                    closest_ground_box = get_max_bdbox(ground_truth_boxes, new_equivalent_coord)
                    # print(closest_ground_box)
                    reward = compute_trigger_reward(new_equivalent_coord, closest_ground_box)
                    # print(reward)
                    done = True
                    
                else:
                    # print("no")
                    # next_state = None
                    
                    actions_history = update_history(action, actions_history)
                    
                    new_equivalent_coord = calculate_position_box(all_actions)
                    # print(new_equivalent_coord)
                    
                    new_image = original_image[:, int(new_equivalent_coord[2]):int(new_equivalent_coord[3]), int(new_equivalent_coord[0]):int(new_equivalent_coord[1])]
                    new_image = transform(new_image)
                    next_state = compose_state(new_image, actions_history, feature_extractor)
                    
                    closest_ground_box = get_max_bdbox(ground_truth_boxes, new_equivalent_coord)
                    # print(closest_ground_box)
                    reward = compute_trigger_reward(new_equivalent_coord, closest_ground_box)
                    # print(reward)
                    
                    actual_equivalent_coord = new_equivalent_coord
                    
                if t == 20:
                    done = True
                    
                transition_dict['states'].append(state)
                transition_dict['actions'].append(action)
                transition_dict['next_states'].append(next_state)
                transition_dict['rewards'].append(reward)
                transition_dict['dones'].append(done)
                
                # print(transition_dict)
                
                state = next_state
                image = new_image
                
                # break
                
            reward_list = transition_dict['rewards']
            state_list = transition_dict['states']
            action_list = transition_dict['actions']
            
            G = 0
            gamma = 0.98
            
            optimizer.zero_grad()
            
            for i in reversed(range(len(reward_list))):
                reward = reward_list[i]
                state = torch.tensor(state_list[i]).to(device)
                action = torch.tensor(action_list[i]).to(device)
                
                log_prob = torch.log(policy_net(state))
                
                # print(log_prob)
                
                # probs = policy_network(state)
                
                # m = Categorical(probs)
                # action = m.sample()
                # next_state, reward = env.step(action)
                # loss = -m.log_prob(action) * reward
                
                probs = policy_net(state)
                m = torch.distributions.Categorical(probs)
                # action = m.sample()
                
                # print(m.log_prob(action))

                # break
                                  
                G = gamma * G + reward
                loss = -m.log_prob(action) * G
                
                if i == 0:
                    # print(loss)
                    sum_loss += loss.item()
                
                loss.backward()
                
            optimizer.step()
            
        print(sum_loss)
        
    break

  0%|          | 0/5 [00:00<?, ?it/s]

Classe cat...
vgg16
Episode 0


  state = torch.tensor(state_list[i]).to(device)
  action = torch.tensor(action_list[i]).to(device)


1013.1519567798823
Episode 1
1039.239423070103
Episode 2
1180.7061550319195
Episode 3
1248.5002516228706
Episode 4
1277.7404329078272


In [None]:
class Policy_net(nn.Module):
    def __init__(self):
        super(Policy_net, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=81 + 25088, out_features=1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(in_features=1024, out_features=512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(in_features=512, out_features=9)
        )
    def forward(self, x):
        return F.softmax(self.classifier(x),dim=1)

In [None]:
class Agent():
    def __init__(self, classe, num_episodes=15, model_name='vgg16'):
        self.BATCH_SIZE = 100
        self.discount_factor = 0.95
        self.model_name = model_name
        screen_height = 224
        screen_width = 224
        self.n_actions = 9
        self.classe = classe
        
        self.feature_extractor = FeatureExtractor(network=self.model_name)
        
        self.policy_net = Policy_net()
        
        self.actions_history = []
        self.num_episodes = num_episodes
        
    def train(self, train_loader):
        xmin = 0.0
        xmax = 224.0
        ymin = 0.0
        ymax = 224.0
        
        for i_episode in range(self.num_episodes):
            print("Episode" + " " + str(i_episode))
            
            for key, _ in train_loader.items():
                image, ground_truth_boxes = extract(key, train_loader)
                original_image = image.clone()
                ground_truth = ground_truth_boxes[0]
                all_actions = []
                
            self.actions_history = torch.ones((9,9))
            state = self.compose_state(image)
            
            original_coordinates = [xmin, xmax, ymin, ymax]
            new_image = image
            done = False
            t = 0
            
            actual_equivalent_coord = original_coordinates
            
            while not done:
                t += 1
                
            
    def compose_state(self, image, dtype=FloatTensor):
        image_feature = self.get_features(image, dtype)
        image_feature = image_feature.view(1,-1)
        print("image feature: " + str(image_feature.shape))

        history_flatten = self.actions_history.view(1,-1).type(dtype)

        state = torch.cat((image_feature, history_flatten), 1)
        return state
    
    def get_features(self, image, dtype=FloatTensor):
        global transform
        image = image.view(1, *image.shape)
        image = Variable(image).type(dtype)
        if use_cuda:
            image = image.cuda()
            
        feature = self.feature_extractor(image)
        print("Feature shape: " + str(feature.shape))
        
        return feature.data
    
    def select_action_model(self, state):
        with torch.no_grad():
            if use_cuda:
                inpu = Variable(state).cuda()
            else:
                inpu = Variable(state)
                
            prob = self.policy_net(inpu)
            
            
            
            # _, predicted = torch.max(qval.data, 1)
            # print("Predicted : "+str(qval.data))
            # action = predicted[0] # + 1
            # print(action)
            
            
            return action