In [1]:
!unzip -q PPM-100

In [2]:
import numpy as np
from tqdm import tqdm
import cv2 as cv
import math
from PIL import Image
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Data loading
fg_path = 'PPM-100/image/'

# path to provided alpha mattes
a_path = 'PPM-100/matte/'

# Path to background images (MSCOCO)
bg_path = 'PPM-100/background/'

# Path to folder where you want the composited images to go
out_path = 'merged/'

def composite4(fg, bg, a, w, h):
    fg = np.array(fg, np.float32)
    bg = np.array(bg[0:h, 0:w], np.float32)
    alpha = np.zeros((h, w, 1), np.float32)
    alpha[:, :, 0] = a / 255.
    comp = alpha * fg + (1 - alpha) * bg
    comp = comp.astype(np.uint8)
    print(a.shape)
    return comp


def process(im_name, bg_name, fcount, bcount):
    im = cv.imread(fg_path + im_name)
    a = cv.imread(a_path + im_name, 0)
    h, w = im.shape[:2]
    bg = cv.imread(bg_path + bg_name)
    bh, bw = bg.shape[:2]
    if not (bh < h or bw < w):
        left = int((bw - w) / 2)
        top = int((bh - h) / 2)
        right = int((bw + w) / 2)
        lower = int((bh + h) / 2)
        cropped = bg[top:lower, left:right, :]
        out = composite4(im, cropped, a, w, h)
        filename = out_path + str(fcount) + '_' + str(bcount) + '.png'
        cv.imwrite(filename, out)

In [4]:
with open('PPM-100/background.txt') as f:
    bg_files = f.read().splitlines()
with open('PPM-100/image.txt') as f:
    fg_files = f.read().splitlines()

In [None]:
with open('PPM-100/background.txt') as f:
    bg_files = f.read().splitlines()
with open('PPM-100/image.txt') as f:
    fg_files = f.read().splitlines()

num_bgs = 20

num_samples = len(fg_files) * num_bgs

print(bg_files)
print(fg_files)


start = time.time()
bcount = 0
for fcount in tqdm(range(len(fg_files))):
    im_name = fg_files[fcount]
    bcount = 0
    for i in range(num_bgs):
        bg_name = bg_files[bcount]
        process(im_name, bg_name, fcount, bcount)
        bcount += 1


end = time.time()
elapsed = end - start
print('elapsed: {} seconds'.format(elapsed))

['al-ghazali-3KmWk2WC_Z0-unsplash.jpg', 'alina-grubnyak-8yT8YL-x8CQ-unsplash.jpg', 'annie-spratt-_iH19KS6e2c-unsplash.jpg', 'hieu-vu-minh-He8-FZl-o10-unsplash.jpg', 'house-method-CqVHT8g45R8-unsplash.jpg', 'israa-hilles-xP0gM0Dh-MY-unsplash.jpg', 'jason-goodman-nF0nQuqBsrI-unsplash.jpg', 'jorgen-haland-8UE83jPlNXg-unsplash.jpg', 'jose-losada-Sm8TAus1pGs-unsplash.jpg', 'kevin-wolf-3AbwSH1y9dc-unsplash.jpg', 'lexie-barnhorn-rWjd8kNuT7Q-unsplash.jpg', 'lukasz-szmigiel-jFCViYFYcus-unsplash.jpg', 'luke-stackpoole-x2qSNIEZuEE-unsplash.jpg', 'mathias-adam-JKHUw0Xujf8-unsplash.jpg', 'michal-pechardo-bpt7mjgrBRQ-unsplash.jpg', 'mickey-o-neil-xL66l--msXU-unsplash.jpg', 'mike-benna-SBiVq9eWEtQ-unsplash.jpg', 'nolan-issac-K5sjajgbTFw-unsplash.jpg', 'ricardo-frantz-sC-BXbi9ajw-unsplash.jpg', 'rune-enstad-UXFJ-6Zj27M-unsplash.jpg']
['13179159164_1a4ae8d085_o.jpg', '14299313536_ea3e61076c_o.jpg', '14429083354_23c8fddff5_o.jpg', '14559969490_d33552a324_o.jpg', '14561870264_b21b665f1f_o.jpg', '14996438

  0%|          | 0/100 [00:00<?, ?it/s]

(3229, 4843)
(3229, 4843)


  0%|          | 0/100 [00:07<?, ?it/s]


KeyboardInterrupt: 

In [None]:
from google.colab.patches import cv2_imshow
for fcount in tqdm(range(len(fg_files))):
  im_name = fg_files[fcount]
  a = cv.imread(a_path + im_name, 0)
  blurred_image = cv.GaussianBlur(a, (15, 15), 0)
  # cv2_imshow(blurred_image)

  # cv.waitKey(0)
  cv.imwrite('blurred_matte/'+im_name, blurred_image)


100%|██████████| 100/100 [00:04<00:00, 20.95it/s]


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Unpooling(nn.Module):
    def __init__(self):
        super(Unpooling, self).__init__()

    def forward(self, inputs):
        x = inputs[:, 1]
        bool_mask = inputs[:, 0] >= inputs[:, 1]
        mask = bool_mask.float()
        x = mask * x
        return x

In [11]:
class EncoderDecoder(nn.Module):
    def __init__(self):
        super(EncoderDecoder, self).__init__()

        # Encoder
        self.conv1_1 = nn.Conv2d(4, 64, kernel_size=3, padding=1)
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool5 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Decoder
        self.unpool = Unpooling()

        self.deconv6 = nn.Conv2d(512, 512, kernel_size=1, padding='same')
        self.bn6 = nn.BatchNorm2d(512)
        self.upsample6 = nn.Upsample(scale_factor=2, mode='nearest')

        self.deconv5 = nn.Conv2d(512, 512, kernel_size=1, padding='same')
        self.bn5 = nn.BatchNorm2d(512)
        self.upsample5 = nn.Upsample(scale_factor=2, mode='nearest')

        self.deconv4 = nn.Conv2d(512, 256, kernel_size=5, padding='same')
        self.bn4 = nn.BatchNorm2d(256)
        self.upsample4 = nn.Upsample(scale_factor=2, mode='nearest')

        self.deconv3 = nn.Conv2d(256, 128, kernel_size=5, padding='same')
        self.bn3 = nn.BatchNorm2d(128)
        self.upsample3 = nn.Upsample(scale_factor=2, mode='nearest')

        self.deconv2 = nn.Conv2d(128, 64, kernel_size=5, padding='same')
        self.bn2 = nn.BatchNorm2d(64)
        self.upsample2 = nn.Upsample(scale_factor=2, mode='nearest')

        self.deconv1 = nn.Conv2d(64, 64, kernel_size=5, padding='same')
        self.bn1 = nn.BatchNorm2d(64)
        self.upsample1 = nn.Upsample(scale_factor=2, mode='nearest')

        self.deconv0 = nn.Conv2d(64, 1, kernel_size=5, padding='same')

    def forward(self, x):
        # Encoder
        x = F.relu(self.conv1_1(x))
        x = F.relu(self.conv1_2(x))
        orig_1 = x
        x = self.pool1(x)

        x = F.relu(self.conv2_1(x))
        x = F.relu(self.conv2_2(x))
        orig_2 = x
        x = self.pool2(x)

        x = F.relu(self.conv3_1(x))
        x = F.relu(self.conv3_2(x))
        x = F.relu(self.conv3_3(x))
        orig_3 = x
        x = self.pool3(x)

        x = F.relu(self.conv4_1(x))
        x = F.relu(self.conv4_2(x))
        x = F.relu(self.conv4_3(x))
        orig_4 = x
        x = self.pool4(x)

        x = F.relu(self.conv5_1(x))
        x = F.relu(self.conv5_2(x))
        x = F.relu(self.conv5_3(x))
        orig_5 = x
        x = self.pool5(x)

        Decoder
        x = F.relu(self.deconv6(x))
        x = self.bn6(x)
        x = self.upsample6(x)
        x = torch.cat((orig_5, x), dim=1)
        x = self.unpool(x)

        x = F.relu(self.deconv5(x))
        x = self.bn5(x)
        x = self.upsample5(x)
        x = torch.cat((orig_4, x), dim=1)
        x = self.unpool(x)

        x = F.relu(self.deconv4(x))
        x = self.bn4(x)
        x = self.upsample4(x)
        x = torch.cat((orig_3, x), dim=1)
        x = self.unpool(x)

        x = F.relu(self.deconv3(x))
        x = self.bn3(x)
        x = self.upsample3(x)
        x = torch.cat((orig_2, x), dim=1)
        x = self.unpool(x)

        x = F.relu(self.deconv2(x))
        x = self.bn2(x)
        x = self.upsample2(x)
        x = torch.cat((orig_1, x), dim=1)
        x = self.unpool(x)

        x = F.relu(self.deconv1(x))
        x = self.bn1(x)

        x = F.sigmoid(self.deconv0(x))

        return x

# Example of creating a model instance and printing the architecture
model = EncoderDecoder()
print(model)


EncoderDecoder(
  (conv1_1): Conv2d(4, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv1_2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2_1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2_2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3_1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3_2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3_3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (unpool): Unpooling()
  (deconv3): Conv2d(256, 128, kernel_size=(5, 5), stride=(1, 1), padding=same)
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True

In [7]:
epsilon = 1e-6
epsilon_sqr = epsilon ** 2
def alpha_pred_loss(y_true, y_pred):
  num_pixels = y_true.shape[0] * y_true.shape[1]
  diff = y_true - y_pred
  return torch.sum(torch.sqrt(torch.square(diff) + epsilon_sqr)) / (num_pixels + epsilon)

In [8]:
model = EncoderDecoder().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
  print("Using the GPU!")
else:
  print("WARNING: Could not find GPU! Using CPU only. If you want to enable GPU, please to go Edit > Notebook Settings > Hardware Accelerator and select GPU.")


Using the GPU!


In [9]:
def train(model, optimizer, n_iters=90):


  for i in range(n_iters):
    im_name = fg_files[i]
    bmat = cv.imread("PPM-100/blurred_matte/" + im_name, 0)
    img = cv.imread("PPM-100/image/" + im_name)
    img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    bmat_tensor = torch.from_numpy(bmat).float()/255.0
    bmat_tensor = bmat_tensor.unsqueeze(dim=2).cuda()
    img_tensor = torch.from_numpy(img).float() / 255.0
    img_tensor = img_tensor.cuda()
    x = torch.cat((bmat_tensor, img_tensor), dim=2)
    y_true = cv.imread(a_path + im_name, 0)
    y_true = torch.from_numpy(y_true).float() / 255.0
    y_true = y_true.cuda()

    # print(bmat_tensor.dtype)
    # print(img_tensor.dtype)
    # print(x.shape)


    optimizer.zero_grad()

    y_pred = model(x.permute(2, 0, 1).unsqueeze(0))

    loss = alpha_pred_loss(y_true, y_pred)
    print("Iteration: {}, loss = {}".format(i, loss))

    # Backward pass: compute gradient and do optimizer step
    loss.backward()
    optimizer.step()

In [13]:
train(model.to(device), optimizer)

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.73 GiB. GPU 0 has a total capacity of 14.75 GiB of which 487.06 MiB is free. Process 2690 has 14.27 GiB memory in use. Of the allocated memory 14.01 GiB is allocated by PyTorch, and 141.04 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)